From 3a1df24272b0d42910d854c72867d89a2839c33d Mon Sep 17 00:00:00 2001 From: Kuo Wei Date: Fri, 8 May 2026 21:51:03 +0800 Subject: [PATCH 1/2] Restructure Slurm training recipes to align with standard naming convention This commit restructures the Slurm-based training recipes in the training/ directory. Key changes: - Renamed directories to follow the 7-level convention: training/{accelerator}/{model}/{framework_runtime}/{version}/{scale_params}/recipe - Standardized framework names to *-slurm. - Omitted SEQ length for NeMo recipes and cases where it was not specified in files. - Corrected GBS for Qwen3-30b based on file content (512). - Corrected GPU count for Wan based on file content (32). - Cleaned up empty directories. --- .../32gpus-fp8cs-seq8192-gbs1024/recipe}/recipe/README.md | 0 .../recipe}/recipe/launch_script.sh | 0 .../recipe}/recipe/sbatch_script.sh | 0 .../32gpus-fp8cs-seq8192-gbs128/recipe}/recipe/README.md | 0 .../32gpus-fp8cs-seq8192-gbs128/recipe}/recipe/launch_script.sh | 0 .../32gpus-fp8cs-seq8192-gbs128/recipe}/recipe/sbatch_script.sh | 0 .../32gpus-fp8cs-seq8192-gbs512/recipe}/recipe/README.md | 0 .../32gpus-fp8cs-seq8192-gbs512/recipe}/recipe/launch_script.sh | 0 .../32gpus-fp8cs-seq8192-gbs512/recipe}/recipe/sbatch_script.sh | 0 .../16gpus-fp8cs-seq8192-gbs128/recipe}/recipe/README.md | 0 .../16gpus-fp8cs-seq8192-gbs128/recipe}/recipe/launch_script.sh | 0 .../16gpus-fp8cs-seq8192-gbs128/recipe}/recipe/sbatch_script.sh | 0 .../16gpus-fp8cs-seq8192-gbs256/recipe}/recipe/README.md | 0 .../16gpus-fp8cs-seq8192-gbs256/recipe}/recipe/launch_script.sh | 0 .../16gpus-fp8cs-seq8192-gbs256/recipe}/recipe/sbatch_script.sh | 0 .../nemo2509/128gpus-fp8cs-gbs1024/recipe}/README.md | 0 .../nemo2509/128gpus-fp8cs-gbs1024/recipe}/submit.slurm | 0 .../nemo2509/64gpus-fp8cs-seq8192-gbs2048/recipe}/README.md | 0 .../nemo2509/64gpus-fp8cs-seq8192-gbs2048/recipe}/submit.slurm | 0 .../128gpus-bf16-seq4096-gbs4096/recipe}/recipe/README.md | 2 +- .../recipe}/recipe/launch_script.sh | 0 .../recipe}/recipe/sbatch_script.sh | 0 .../nemo2509/128gpus-fp8ds-gbs128/recipe}/README.md | 0 .../nemo2509/128gpus-fp8ds-gbs128/recipe}/submit.slurm | 0 .../nemo2509/256gpus-fp8cs-gbs1024/recipe}/README.md | 0 .../nemo2509/256gpus-fp8cs-gbs1024/recipe}/submit.slurm | 0 .../8gpus-fp8cs-seq8192-gbs128/recipe}/recipe/README.md | 2 +- .../8gpus-fp8cs-seq8192-gbs128/recipe}/recipe/launch_script.sh | 0 .../8gpus-fp8cs-seq8192-gbs128/recipe}/recipe/sbatch_script.sh | 0 .../128gpus-bf16-seq4096-gbs2048/recipe}/recipe/README.md | 2 +- .../recipe}/recipe/custom_setup_experiment.py | 0 .../recipe}/recipe/launch_script.sh | 0 .../recipe}/recipe/sbatch_script.sh | 0 .../64gpus-bf16-seq4096-gbs1024/recipe}/recipe/README.md | 2 +- .../recipe}/recipe/custom_setup_experiment.py | 0 .../64gpus-bf16-seq4096-gbs1024/recipe}/recipe/launch_script.sh | 0 .../64gpus-bf16-seq4096-gbs1024/recipe}/recipe/sbatch_script.sh | 0 .../8gpus-fp8mx-seq4096-gbs512/recipe}/recipe/README.md | 2 +- .../8gpus-fp8mx-seq4096-gbs512/recipe}/recipe/launch_script.sh | 0 .../8gpus-fp8mx-seq4096-gbs512/recipe}/recipe/sbatch_script.sh | 0 .../nemo2511/32gpus-bf16-gbs64/recipe}/recipe/README.md | 2 +- .../32gpus-bf16-gbs64/recipe}/recipe/wan_14b_benchmark.sh | 0 42 files changed, 6 insertions(+), 6 deletions(-) rename training/a3ultra/{llama3-1-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS1024 => llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs1024/recipe}/recipe/README.md (100%) rename training/a3ultra/{llama3-1-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS1024 => llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs1024/recipe}/recipe/launch_script.sh (100%) rename training/a3ultra/{llama3-1-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS1024 => llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs1024/recipe}/recipe/sbatch_script.sh (100%) rename training/a3ultra/{llama3-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS128 => llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs128/recipe}/recipe/README.md (100%) rename training/a3ultra/{llama3-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS128 => llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs128/recipe}/recipe/launch_script.sh (100%) rename training/a3ultra/{llama3-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS128 => llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs128/recipe}/recipe/sbatch_script.sh (100%) rename training/a3ultra/{llama3-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS512 => llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs512/recipe}/recipe/README.md (100%) rename training/a3ultra/{llama3-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS512 => llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs512/recipe}/recipe/launch_script.sh (100%) rename training/a3ultra/{llama3-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS512 => llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs512/recipe}/recipe/sbatch_script.sh (100%) rename training/a3ultra/{llama3-8b/megatron-bridge-pretraining-slurm/2node-FP8CS-GBS128 => llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs128/recipe}/recipe/README.md (100%) rename training/a3ultra/{llama3-8b/megatron-bridge-pretraining-slurm/2node-FP8CS-GBS128 => llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs128/recipe}/recipe/launch_script.sh (100%) rename training/a3ultra/{llama3-8b/megatron-bridge-pretraining-slurm/2node-FP8CS-GBS128 => llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs128/recipe}/recipe/sbatch_script.sh (100%) rename training/a3ultra/{llama3-8b/megatron-bridge-pretraining-slurm/2node-FP8CS-GBS256 => llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs256/recipe}/recipe/README.md (100%) rename training/a3ultra/{llama3-8b/megatron-bridge-pretraining-slurm/2node-FP8CS-GBS256 => llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs256/recipe}/recipe/launch_script.sh (100%) rename training/a3ultra/{llama3-8b/megatron-bridge-pretraining-slurm/2node-FP8CS-GBS256 => llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs256/recipe}/recipe/sbatch_script.sh (100%) rename training/a4/{llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8CS-GBS1024 => llama31_405b/megatron-bridge-slurm/nemo2509/128gpus-fp8cs-gbs1024/recipe}/README.md (100%) rename training/a4/{llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8CS-GBS1024 => llama31_405b/megatron-bridge-slurm/nemo2509/128gpus-fp8cs-gbs1024/recipe}/submit.slurm (100%) rename training/a4/{llama3-1-70b/megatron-bridge-pretraining-slurm/8node-fp8-seq8192-gbs2048 => llama3_70b/megatron-bridge-slurm/nemo2509/64gpus-fp8cs-seq8192-gbs2048/recipe}/README.md (100%) rename training/a4/{llama3-1-70b/megatron-bridge-pretraining-slurm/8node-fp8-seq8192-gbs2048 => llama3_70b/megatron-bridge-slurm/nemo2509/64gpus-fp8cs-seq8192-gbs2048/recipe}/submit.slurm (100%) rename training/a4/{qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS4096 => qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs4096/recipe}/recipe/README.md (95%) rename training/a4/{qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS4096 => qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs4096/recipe}/recipe/launch_script.sh (100%) rename training/a4/{qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS4096 => qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs4096/recipe}/recipe/sbatch_script.sh (100%) rename training/a4x/{llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8DS-GBS128 => llama31_405b/megatron-bridge-slurm/nemo2509/128gpus-fp8ds-gbs128/recipe}/README.md (100%) rename training/a4x/{llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8DS-GBS128 => llama31_405b/megatron-bridge-slurm/nemo2509/128gpus-fp8ds-gbs128/recipe}/submit.slurm (100%) rename training/a4x/{llama3-1-405b/megatron-bridge-pretraining-slurm/32node-FP8CS-GBS1024 => llama31_405b/megatron-bridge-slurm/nemo2509/256gpus-fp8cs-gbs1024/recipe}/README.md (100%) rename training/a4x/{llama3-1-405b/megatron-bridge-pretraining-slurm/32node-FP8CS-GBS1024 => llama31_405b/megatron-bridge-slurm/nemo2509/256gpus-fp8cs-gbs1024/recipe}/submit.slurm (100%) rename training/a4x/{llama3-8b/megatron-bridge-pretraining-slurm/2node-FP8CS-GBS128 => llama3_8b/megatron-bridge-slurm/nemo2511/8gpus-fp8cs-seq8192-gbs128/recipe}/recipe/README.md (95%) rename training/a4x/{llama3-8b/megatron-bridge-pretraining-slurm/2node-FP8CS-GBS128 => llama3_8b/megatron-bridge-slurm/nemo2511/8gpus-fp8cs-seq8192-gbs128/recipe}/recipe/launch_script.sh (100%) rename training/a4x/{llama3-8b/megatron-bridge-pretraining-slurm/2node-FP8CS-GBS128 => llama3_8b/megatron-bridge-slurm/nemo2511/8gpus-fp8cs-seq8192-gbs128/recipe}/recipe/sbatch_script.sh (100%) rename training/a4x/{qwen3-235b-a22b/megatron-bridge-pretraining-slurm/32node-BF16-GBS2048 => qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe}/recipe/README.md (95%) rename training/a4x/{qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS1024 => qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe}/recipe/custom_setup_experiment.py (100%) rename training/a4x/{qwen3-235b-a22b/megatron-bridge-pretraining-slurm/32node-BF16-GBS2048 => qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe}/recipe/launch_script.sh (100%) rename training/a4x/{qwen3-235b-a22b/megatron-bridge-pretraining-slurm/32node-BF16-GBS2048 => qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe}/recipe/sbatch_script.sh (100%) rename training/a4x/{qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS1024 => qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe}/recipe/README.md (95%) rename training/a4x/{qwen3-235b-a22b/megatron-bridge-pretraining-slurm/32node-BF16-GBS2048 => qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe}/recipe/custom_setup_experiment.py (100%) rename training/a4x/{qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS1024 => qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe}/recipe/launch_script.sh (100%) rename training/a4x/{qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS1024 => qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe}/recipe/sbatch_script.sh (100%) rename training/a4x/{qwen3-30b/megatron-bridge-pretraining-slurm/2node-FP8MX-GBS1024 => qwen3_30b_a3b/megatron-bridge-slurm/nemo2511/8gpus-fp8mx-seq4096-gbs512/recipe}/recipe/README.md (95%) rename training/a4x/{qwen3-30b/megatron-bridge-pretraining-slurm/2node-FP8MX-GBS1024 => qwen3_30b_a3b/megatron-bridge-slurm/nemo2511/8gpus-fp8mx-seq4096-gbs512/recipe}/recipe/launch_script.sh (100%) rename training/a4x/{qwen3-30b/megatron-bridge-pretraining-slurm/2node-FP8MX-GBS1024 => qwen3_30b_a3b/megatron-bridge-slurm/nemo2511/8gpus-fp8mx-seq4096-gbs512/recipe}/recipe/sbatch_script.sh (100%) rename training/a4x/{wan2-1-14b/nemo-pretraining-slurm/8node-BF16-GBS64 => wan_14b/nemo-slurm/nemo2511/32gpus-bf16-gbs64/recipe}/recipe/README.md (96%) rename training/a4x/{wan2-1-14b/nemo-pretraining-slurm/8node-BF16-GBS64 => wan_14b/nemo-slurm/nemo2511/32gpus-bf16-gbs64/recipe}/recipe/wan_14b_benchmark.sh (100%) diff --git a/training/a3ultra/llama3-1-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS1024/recipe/README.md b/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs1024/recipe/recipe/README.md similarity index 100% rename from training/a3ultra/llama3-1-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS1024/recipe/README.md rename to training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs1024/recipe/recipe/README.md diff --git a/training/a3ultra/llama3-1-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS1024/recipe/launch_script.sh b/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs1024/recipe/recipe/launch_script.sh similarity index 100% rename from training/a3ultra/llama3-1-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS1024/recipe/launch_script.sh rename to training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs1024/recipe/recipe/launch_script.sh diff --git a/training/a3ultra/llama3-1-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS1024/recipe/sbatch_script.sh b/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs1024/recipe/recipe/sbatch_script.sh similarity index 100% rename from training/a3ultra/llama3-1-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS1024/recipe/sbatch_script.sh rename to training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs1024/recipe/recipe/sbatch_script.sh diff --git a/training/a3ultra/llama3-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS128/recipe/README.md b/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs128/recipe/recipe/README.md similarity index 100% rename from training/a3ultra/llama3-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS128/recipe/README.md rename to training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs128/recipe/recipe/README.md diff --git a/training/a3ultra/llama3-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS128/recipe/launch_script.sh b/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs128/recipe/recipe/launch_script.sh similarity index 100% rename from training/a3ultra/llama3-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS128/recipe/launch_script.sh rename to training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs128/recipe/recipe/launch_script.sh diff --git a/training/a3ultra/llama3-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS128/recipe/sbatch_script.sh b/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs128/recipe/recipe/sbatch_script.sh similarity index 100% rename from training/a3ultra/llama3-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS128/recipe/sbatch_script.sh rename to training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs128/recipe/recipe/sbatch_script.sh diff --git a/training/a3ultra/llama3-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS512/recipe/README.md b/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs512/recipe/recipe/README.md similarity index 100% rename from training/a3ultra/llama3-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS512/recipe/README.md rename to training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs512/recipe/recipe/README.md diff --git a/training/a3ultra/llama3-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS512/recipe/launch_script.sh b/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs512/recipe/recipe/launch_script.sh similarity index 100% rename from training/a3ultra/llama3-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS512/recipe/launch_script.sh rename to training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs512/recipe/recipe/launch_script.sh diff --git a/training/a3ultra/llama3-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS512/recipe/sbatch_script.sh b/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs512/recipe/recipe/sbatch_script.sh similarity index 100% rename from training/a3ultra/llama3-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS512/recipe/sbatch_script.sh rename to training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs512/recipe/recipe/sbatch_script.sh diff --git a/training/a3ultra/llama3-8b/megatron-bridge-pretraining-slurm/2node-FP8CS-GBS128/recipe/README.md b/training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs128/recipe/recipe/README.md similarity index 100% rename from training/a3ultra/llama3-8b/megatron-bridge-pretraining-slurm/2node-FP8CS-GBS128/recipe/README.md rename to training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs128/recipe/recipe/README.md diff --git a/training/a3ultra/llama3-8b/megatron-bridge-pretraining-slurm/2node-FP8CS-GBS128/recipe/launch_script.sh b/training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs128/recipe/recipe/launch_script.sh similarity index 100% rename from training/a3ultra/llama3-8b/megatron-bridge-pretraining-slurm/2node-FP8CS-GBS128/recipe/launch_script.sh rename to training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs128/recipe/recipe/launch_script.sh diff --git a/training/a3ultra/llama3-8b/megatron-bridge-pretraining-slurm/2node-FP8CS-GBS128/recipe/sbatch_script.sh b/training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs128/recipe/recipe/sbatch_script.sh similarity index 100% rename from training/a3ultra/llama3-8b/megatron-bridge-pretraining-slurm/2node-FP8CS-GBS128/recipe/sbatch_script.sh rename to training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs128/recipe/recipe/sbatch_script.sh diff --git a/training/a3ultra/llama3-8b/megatron-bridge-pretraining-slurm/2node-FP8CS-GBS256/recipe/README.md b/training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs256/recipe/recipe/README.md similarity index 100% rename from training/a3ultra/llama3-8b/megatron-bridge-pretraining-slurm/2node-FP8CS-GBS256/recipe/README.md rename to training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs256/recipe/recipe/README.md diff --git a/training/a3ultra/llama3-8b/megatron-bridge-pretraining-slurm/2node-FP8CS-GBS256/recipe/launch_script.sh b/training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs256/recipe/recipe/launch_script.sh similarity index 100% rename from training/a3ultra/llama3-8b/megatron-bridge-pretraining-slurm/2node-FP8CS-GBS256/recipe/launch_script.sh rename to training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs256/recipe/recipe/launch_script.sh diff --git a/training/a3ultra/llama3-8b/megatron-bridge-pretraining-slurm/2node-FP8CS-GBS256/recipe/sbatch_script.sh b/training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs256/recipe/recipe/sbatch_script.sh similarity index 100% rename from training/a3ultra/llama3-8b/megatron-bridge-pretraining-slurm/2node-FP8CS-GBS256/recipe/sbatch_script.sh rename to training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs256/recipe/recipe/sbatch_script.sh diff --git a/training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8CS-GBS1024/README.md b/training/a4/llama31_405b/megatron-bridge-slurm/nemo2509/128gpus-fp8cs-gbs1024/recipe/README.md similarity index 100% rename from training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8CS-GBS1024/README.md rename to training/a4/llama31_405b/megatron-bridge-slurm/nemo2509/128gpus-fp8cs-gbs1024/recipe/README.md diff --git a/training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8CS-GBS1024/submit.slurm b/training/a4/llama31_405b/megatron-bridge-slurm/nemo2509/128gpus-fp8cs-gbs1024/recipe/submit.slurm similarity index 100% rename from training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8CS-GBS1024/submit.slurm rename to training/a4/llama31_405b/megatron-bridge-slurm/nemo2509/128gpus-fp8cs-gbs1024/recipe/submit.slurm diff --git a/training/a4/llama3-1-70b/megatron-bridge-pretraining-slurm/8node-fp8-seq8192-gbs2048/README.md b/training/a4/llama3_70b/megatron-bridge-slurm/nemo2509/64gpus-fp8cs-seq8192-gbs2048/recipe/README.md similarity index 100% rename from training/a4/llama3-1-70b/megatron-bridge-pretraining-slurm/8node-fp8-seq8192-gbs2048/README.md rename to training/a4/llama3_70b/megatron-bridge-slurm/nemo2509/64gpus-fp8cs-seq8192-gbs2048/recipe/README.md diff --git a/training/a4/llama3-1-70b/megatron-bridge-pretraining-slurm/8node-fp8-seq8192-gbs2048/submit.slurm b/training/a4/llama3_70b/megatron-bridge-slurm/nemo2509/64gpus-fp8cs-seq8192-gbs2048/recipe/submit.slurm similarity index 100% rename from training/a4/llama3-1-70b/megatron-bridge-pretraining-slurm/8node-fp8-seq8192-gbs2048/submit.slurm rename to training/a4/llama3_70b/megatron-bridge-slurm/nemo2509/64gpus-fp8cs-seq8192-gbs2048/recipe/submit.slurm diff --git a/training/a4/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS4096/recipe/README.md b/training/a4/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs4096/recipe/recipe/README.md similarity index 95% rename from training/a4/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS4096/recipe/README.md rename to training/a4/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs4096/recipe/recipe/README.md index b7cab4bc..999d9964 100644 --- a/training/a4/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS4096/recipe/README.md +++ b/training/a4/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs4096/recipe/recipe/README.md @@ -64,7 +64,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a4/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS4096/recipe +export RECIPE_ROOT=$REPO_ROOT/training/a4/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs4096/recipe/recipe cd $RECIPE_ROOT ``` diff --git a/training/a4/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS4096/recipe/launch_script.sh b/training/a4/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs4096/recipe/recipe/launch_script.sh similarity index 100% rename from training/a4/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS4096/recipe/launch_script.sh rename to training/a4/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs4096/recipe/recipe/launch_script.sh diff --git a/training/a4/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS4096/recipe/sbatch_script.sh b/training/a4/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs4096/recipe/recipe/sbatch_script.sh similarity index 100% rename from training/a4/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS4096/recipe/sbatch_script.sh rename to training/a4/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs4096/recipe/recipe/sbatch_script.sh diff --git a/training/a4x/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8DS-GBS128/README.md b/training/a4x/llama31_405b/megatron-bridge-slurm/nemo2509/128gpus-fp8ds-gbs128/recipe/README.md similarity index 100% rename from training/a4x/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8DS-GBS128/README.md rename to training/a4x/llama31_405b/megatron-bridge-slurm/nemo2509/128gpus-fp8ds-gbs128/recipe/README.md diff --git a/training/a4x/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8DS-GBS128/submit.slurm b/training/a4x/llama31_405b/megatron-bridge-slurm/nemo2509/128gpus-fp8ds-gbs128/recipe/submit.slurm similarity index 100% rename from training/a4x/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8DS-GBS128/submit.slurm rename to training/a4x/llama31_405b/megatron-bridge-slurm/nemo2509/128gpus-fp8ds-gbs128/recipe/submit.slurm diff --git a/training/a4x/llama3-1-405b/megatron-bridge-pretraining-slurm/32node-FP8CS-GBS1024/README.md b/training/a4x/llama31_405b/megatron-bridge-slurm/nemo2509/256gpus-fp8cs-gbs1024/recipe/README.md similarity index 100% rename from training/a4x/llama3-1-405b/megatron-bridge-pretraining-slurm/32node-FP8CS-GBS1024/README.md rename to training/a4x/llama31_405b/megatron-bridge-slurm/nemo2509/256gpus-fp8cs-gbs1024/recipe/README.md diff --git a/training/a4x/llama3-1-405b/megatron-bridge-pretraining-slurm/32node-FP8CS-GBS1024/submit.slurm b/training/a4x/llama31_405b/megatron-bridge-slurm/nemo2509/256gpus-fp8cs-gbs1024/recipe/submit.slurm similarity index 100% rename from training/a4x/llama3-1-405b/megatron-bridge-pretraining-slurm/32node-FP8CS-GBS1024/submit.slurm rename to training/a4x/llama31_405b/megatron-bridge-slurm/nemo2509/256gpus-fp8cs-gbs1024/recipe/submit.slurm diff --git a/training/a4x/llama3-8b/megatron-bridge-pretraining-slurm/2node-FP8CS-GBS128/recipe/README.md b/training/a4x/llama3_8b/megatron-bridge-slurm/nemo2511/8gpus-fp8cs-seq8192-gbs128/recipe/recipe/README.md similarity index 95% rename from training/a4x/llama3-8b/megatron-bridge-pretraining-slurm/2node-FP8CS-GBS128/recipe/README.md rename to training/a4x/llama3_8b/megatron-bridge-slurm/nemo2511/8gpus-fp8cs-seq8192-gbs128/recipe/recipe/README.md index 1c1a5a0a..ec848465 100644 --- a/training/a4x/llama3-8b/megatron-bridge-pretraining-slurm/2node-FP8CS-GBS128/recipe/README.md +++ b/training/a4x/llama3_8b/megatron-bridge-slurm/nemo2511/8gpus-fp8cs-seq8192-gbs128/recipe/recipe/README.md @@ -65,7 +65,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a4x/llama3-8b/megatron-bridge-pretraining-slurm/2node-FP8CS-GBS128/recipe +export RECIPE_ROOT=$REPO_ROOT/training/a4x/llama3_8b/megatron-bridge-slurm/nemo2511/8gpus-fp8cs-seq8192-gbs128/recipe/recipe cd $RECIPE_ROOT ``` diff --git a/training/a4x/llama3-8b/megatron-bridge-pretraining-slurm/2node-FP8CS-GBS128/recipe/launch_script.sh b/training/a4x/llama3_8b/megatron-bridge-slurm/nemo2511/8gpus-fp8cs-seq8192-gbs128/recipe/recipe/launch_script.sh similarity index 100% rename from training/a4x/llama3-8b/megatron-bridge-pretraining-slurm/2node-FP8CS-GBS128/recipe/launch_script.sh rename to training/a4x/llama3_8b/megatron-bridge-slurm/nemo2511/8gpus-fp8cs-seq8192-gbs128/recipe/recipe/launch_script.sh diff --git a/training/a4x/llama3-8b/megatron-bridge-pretraining-slurm/2node-FP8CS-GBS128/recipe/sbatch_script.sh b/training/a4x/llama3_8b/megatron-bridge-slurm/nemo2511/8gpus-fp8cs-seq8192-gbs128/recipe/recipe/sbatch_script.sh similarity index 100% rename from training/a4x/llama3-8b/megatron-bridge-pretraining-slurm/2node-FP8CS-GBS128/recipe/sbatch_script.sh rename to training/a4x/llama3_8b/megatron-bridge-slurm/nemo2511/8gpus-fp8cs-seq8192-gbs128/recipe/recipe/sbatch_script.sh diff --git a/training/a4x/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/32node-BF16-GBS2048/recipe/README.md b/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe/recipe/README.md similarity index 95% rename from training/a4x/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/32node-BF16-GBS2048/recipe/README.md rename to training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe/recipe/README.md index e5f17662..7dd7cb99 100644 --- a/training/a4x/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/32node-BF16-GBS2048/recipe/README.md +++ b/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe/recipe/README.md @@ -61,7 +61,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a4x/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/32node-BF16-GBS2048/recipe +export RECIPE_ROOT=$REPO_ROOT/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe/recipe cd $RECIPE_ROOT ``` diff --git a/training/a4x/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS1024/recipe/custom_setup_experiment.py b/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe/recipe/custom_setup_experiment.py similarity index 100% rename from training/a4x/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS1024/recipe/custom_setup_experiment.py rename to training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe/recipe/custom_setup_experiment.py diff --git a/training/a4x/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/32node-BF16-GBS2048/recipe/launch_script.sh b/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe/recipe/launch_script.sh similarity index 100% rename from training/a4x/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/32node-BF16-GBS2048/recipe/launch_script.sh rename to training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe/recipe/launch_script.sh diff --git a/training/a4x/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/32node-BF16-GBS2048/recipe/sbatch_script.sh b/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe/recipe/sbatch_script.sh similarity index 100% rename from training/a4x/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/32node-BF16-GBS2048/recipe/sbatch_script.sh rename to training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe/recipe/sbatch_script.sh diff --git a/training/a4x/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS1024/recipe/README.md b/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe/recipe/README.md similarity index 95% rename from training/a4x/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS1024/recipe/README.md rename to training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe/recipe/README.md index d8c0972e..7db1a90c 100644 --- a/training/a4x/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS1024/recipe/README.md +++ b/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe/recipe/README.md @@ -61,7 +61,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a4x/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS1024/recipe +export RECIPE_ROOT=$REPO_ROOT/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe/recipe cd $RECIPE_ROOT ``` diff --git a/training/a4x/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/32node-BF16-GBS2048/recipe/custom_setup_experiment.py b/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe/recipe/custom_setup_experiment.py similarity index 100% rename from training/a4x/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/32node-BF16-GBS2048/recipe/custom_setup_experiment.py rename to training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe/recipe/custom_setup_experiment.py diff --git a/training/a4x/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS1024/recipe/launch_script.sh b/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe/recipe/launch_script.sh similarity index 100% rename from training/a4x/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS1024/recipe/launch_script.sh rename to training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe/recipe/launch_script.sh diff --git a/training/a4x/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS1024/recipe/sbatch_script.sh b/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe/recipe/sbatch_script.sh similarity index 100% rename from training/a4x/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS1024/recipe/sbatch_script.sh rename to training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe/recipe/sbatch_script.sh diff --git a/training/a4x/qwen3-30b/megatron-bridge-pretraining-slurm/2node-FP8MX-GBS1024/recipe/README.md b/training/a4x/qwen3_30b_a3b/megatron-bridge-slurm/nemo2511/8gpus-fp8mx-seq4096-gbs512/recipe/recipe/README.md similarity index 95% rename from training/a4x/qwen3-30b/megatron-bridge-pretraining-slurm/2node-FP8MX-GBS1024/recipe/README.md rename to training/a4x/qwen3_30b_a3b/megatron-bridge-slurm/nemo2511/8gpus-fp8mx-seq4096-gbs512/recipe/recipe/README.md index bd687d83..2225895f 100644 --- a/training/a4x/qwen3-30b/megatron-bridge-pretraining-slurm/2node-FP8MX-GBS1024/recipe/README.md +++ b/training/a4x/qwen3_30b_a3b/megatron-bridge-slurm/nemo2511/8gpus-fp8mx-seq4096-gbs512/recipe/recipe/README.md @@ -65,7 +65,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a4x/qwen3-30b/megatron-bridge-pretraining-slurm/2node-FP8MX-GBS1024/recipe +export RECIPE_ROOT=$REPO_ROOT/training/a4x/qwen3_30b_a3b/megatron-bridge-slurm/nemo2511/8gpus-fp8mx-seq4096-gbs512/recipe/recipe cd $RECIPE_ROOT ``` diff --git a/training/a4x/qwen3-30b/megatron-bridge-pretraining-slurm/2node-FP8MX-GBS1024/recipe/launch_script.sh b/training/a4x/qwen3_30b_a3b/megatron-bridge-slurm/nemo2511/8gpus-fp8mx-seq4096-gbs512/recipe/recipe/launch_script.sh similarity index 100% rename from training/a4x/qwen3-30b/megatron-bridge-pretraining-slurm/2node-FP8MX-GBS1024/recipe/launch_script.sh rename to training/a4x/qwen3_30b_a3b/megatron-bridge-slurm/nemo2511/8gpus-fp8mx-seq4096-gbs512/recipe/recipe/launch_script.sh diff --git a/training/a4x/qwen3-30b/megatron-bridge-pretraining-slurm/2node-FP8MX-GBS1024/recipe/sbatch_script.sh b/training/a4x/qwen3_30b_a3b/megatron-bridge-slurm/nemo2511/8gpus-fp8mx-seq4096-gbs512/recipe/recipe/sbatch_script.sh similarity index 100% rename from training/a4x/qwen3-30b/megatron-bridge-pretraining-slurm/2node-FP8MX-GBS1024/recipe/sbatch_script.sh rename to training/a4x/qwen3_30b_a3b/megatron-bridge-slurm/nemo2511/8gpus-fp8mx-seq4096-gbs512/recipe/recipe/sbatch_script.sh diff --git a/training/a4x/wan2-1-14b/nemo-pretraining-slurm/8node-BF16-GBS64/recipe/README.md b/training/a4x/wan_14b/nemo-slurm/nemo2511/32gpus-bf16-gbs64/recipe/recipe/README.md similarity index 96% rename from training/a4x/wan2-1-14b/nemo-pretraining-slurm/8node-BF16-GBS64/recipe/README.md rename to training/a4x/wan_14b/nemo-slurm/nemo2511/32gpus-bf16-gbs64/recipe/recipe/README.md index d33f13df..d97a1df8 100644 --- a/training/a4x/wan2-1-14b/nemo-pretraining-slurm/8node-BF16-GBS64/recipe/README.md +++ b/training/a4x/wan_14b/nemo-slurm/nemo2511/32gpus-bf16-gbs64/recipe/recipe/README.md @@ -65,7 +65,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a4x/wan2-1-14b/nemo-pretraining-slurm/8node-BF16-GBS64/recipe +export RECIPE_ROOT=$REPO_ROOT/training/a4x/wan_14b/nemo-slurm/nemo2511/32gpus-bf16-gbs64/recipe/recipe cd $RECIPE_ROOT ``` diff --git a/training/a4x/wan2-1-14b/nemo-pretraining-slurm/8node-BF16-GBS64/recipe/wan_14b_benchmark.sh b/training/a4x/wan_14b/nemo-slurm/nemo2511/32gpus-bf16-gbs64/recipe/recipe/wan_14b_benchmark.sh similarity index 100% rename from training/a4x/wan2-1-14b/nemo-pretraining-slurm/8node-BF16-GBS64/recipe/wan_14b_benchmark.sh rename to training/a4x/wan_14b/nemo-slurm/nemo2511/32gpus-bf16-gbs64/recipe/recipe/wan_14b_benchmark.sh From 5cf4ecbd8f09c7e0a6ec49e3671bb6c1d64d8ca5 Mon Sep 17 00:00:00 2001 From: Kuo Wei Date: Fri, 8 May 2026 23:47:30 +0800 Subject: [PATCH 2/2] Fix nested directory structure and broken references in Slurm recipes This commit removes the accidental nested recipe/recipe folder structure in all Slurm recipes (a3ultra, a4, and a4x) and updates the corresponding README files to use the correct, non-truncated RECIPE_ROOT paths. --- .../recipe => 32gpus-fp8cs-seq8192-gbs1024}/recipe/README.md | 2 +- .../recipe/{recipe => }/launch_script.sh | 0 .../recipe/{recipe => }/sbatch_script.sh | 0 .../recipe => 32gpus-fp8cs-seq8192-gbs128}/recipe/README.md | 2 +- .../recipe/{recipe => }/launch_script.sh | 0 .../recipe/{recipe => }/sbatch_script.sh | 0 .../recipe => 32gpus-fp8cs-seq8192-gbs512}/recipe/README.md | 2 +- .../recipe/{recipe => }/launch_script.sh | 0 .../recipe/{recipe => }/sbatch_script.sh | 0 .../recipe => 16gpus-fp8cs-seq8192-gbs128}/recipe/README.md | 2 +- .../recipe/{recipe => }/launch_script.sh | 0 .../recipe/{recipe => }/sbatch_script.sh | 0 .../recipe => 16gpus-fp8cs-seq8192-gbs256}/recipe/README.md | 2 +- .../recipe/{recipe => }/launch_script.sh | 0 .../recipe/{recipe => }/sbatch_script.sh | 0 .../128gpus-bf16-seq4096-gbs4096/recipe/{recipe => }/README.md | 2 +- .../recipe/{recipe => }/launch_script.sh | 0 .../recipe/{recipe => }/sbatch_script.sh | 0 .../8gpus-fp8cs-seq8192-gbs128/recipe/{recipe => }/README.md | 2 +- .../recipe/{recipe => }/launch_script.sh | 0 .../recipe/{recipe => }/sbatch_script.sh | 0 .../128gpus-bf16-seq4096-gbs2048/recipe/{recipe => }/README.md | 2 +- .../recipe/{recipe => }/custom_setup_experiment.py | 0 .../recipe/{recipe => }/launch_script.sh | 0 .../recipe/{recipe => }/sbatch_script.sh | 0 .../64gpus-bf16-seq4096-gbs1024/recipe/{recipe => }/README.md | 2 +- .../recipe/{recipe => }/custom_setup_experiment.py | 0 .../recipe/{recipe => }/launch_script.sh | 0 .../recipe/{recipe => }/sbatch_script.sh | 0 .../8gpus-fp8mx-seq4096-gbs512/recipe/{recipe => }/README.md | 2 +- .../recipe/{recipe => }/launch_script.sh | 0 .../recipe/{recipe => }/sbatch_script.sh | 0 .../nemo2511/32gpus-bf16-gbs64/recipe/{recipe => }/README.md | 2 +- .../32gpus-bf16-gbs64/recipe/{recipe => }/wan_14b_benchmark.sh | 0 34 files changed, 11 insertions(+), 11 deletions(-) rename training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/{32gpus-fp8cs-seq8192-gbs512/recipe => 32gpus-fp8cs-seq8192-gbs1024}/recipe/README.md (95%) rename training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs1024/recipe/{recipe => }/launch_script.sh (100%) rename training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs1024/recipe/{recipe => }/sbatch_script.sh (100%) rename training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/{32gpus-fp8cs-seq8192-gbs1024/recipe => 32gpus-fp8cs-seq8192-gbs128}/recipe/README.md (95%) rename training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs128/recipe/{recipe => }/launch_script.sh (100%) rename training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs128/recipe/{recipe => }/sbatch_script.sh (100%) rename training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/{32gpus-fp8cs-seq8192-gbs128/recipe => 32gpus-fp8cs-seq8192-gbs512}/recipe/README.md (95%) rename training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs512/recipe/{recipe => }/launch_script.sh (100%) rename training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs512/recipe/{recipe => }/sbatch_script.sh (100%) rename training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/{16gpus-fp8cs-seq8192-gbs256/recipe => 16gpus-fp8cs-seq8192-gbs128}/recipe/README.md (95%) rename training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs128/recipe/{recipe => }/launch_script.sh (100%) rename training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs128/recipe/{recipe => }/sbatch_script.sh (100%) rename training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/{16gpus-fp8cs-seq8192-gbs128/recipe => 16gpus-fp8cs-seq8192-gbs256}/recipe/README.md (95%) rename training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs256/recipe/{recipe => }/launch_script.sh (100%) rename training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs256/recipe/{recipe => }/sbatch_script.sh (100%) rename training/a4/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs4096/recipe/{recipe => }/README.md (97%) rename training/a4/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs4096/recipe/{recipe => }/launch_script.sh (100%) rename training/a4/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs4096/recipe/{recipe => }/sbatch_script.sh (100%) rename training/a4x/llama3_8b/megatron-bridge-slurm/nemo2511/8gpus-fp8cs-seq8192-gbs128/recipe/{recipe => }/README.md (97%) rename training/a4x/llama3_8b/megatron-bridge-slurm/nemo2511/8gpus-fp8cs-seq8192-gbs128/recipe/{recipe => }/launch_script.sh (100%) rename training/a4x/llama3_8b/megatron-bridge-slurm/nemo2511/8gpus-fp8cs-seq8192-gbs128/recipe/{recipe => }/sbatch_script.sh (100%) rename training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe/{recipe => }/README.md (97%) rename training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe/{recipe => }/custom_setup_experiment.py (100%) rename training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe/{recipe => }/launch_script.sh (100%) rename training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe/{recipe => }/sbatch_script.sh (100%) rename training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe/{recipe => }/README.md (97%) rename training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe/{recipe => }/custom_setup_experiment.py (100%) rename training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe/{recipe => }/launch_script.sh (100%) rename training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe/{recipe => }/sbatch_script.sh (100%) rename training/a4x/qwen3_30b_a3b/megatron-bridge-slurm/nemo2511/8gpus-fp8mx-seq4096-gbs512/recipe/{recipe => }/README.md (97%) rename training/a4x/qwen3_30b_a3b/megatron-bridge-slurm/nemo2511/8gpus-fp8mx-seq4096-gbs512/recipe/{recipe => }/launch_script.sh (100%) rename training/a4x/qwen3_30b_a3b/megatron-bridge-slurm/nemo2511/8gpus-fp8mx-seq4096-gbs512/recipe/{recipe => }/sbatch_script.sh (100%) rename training/a4x/wan_14b/nemo-slurm/nemo2511/32gpus-bf16-gbs64/recipe/{recipe => }/README.md (98%) rename training/a4x/wan_14b/nemo-slurm/nemo2511/32gpus-bf16-gbs64/recipe/{recipe => }/wan_14b_benchmark.sh (100%) diff --git a/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs512/recipe/recipe/README.md b/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs1024/recipe/README.md similarity index 95% rename from training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs512/recipe/recipe/README.md rename to training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs1024/recipe/README.md index 271b8552..e98270bf 100644 --- a/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs512/recipe/recipe/README.md +++ b/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs1024/recipe/README.md @@ -65,7 +65,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a3u/llama3-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS512/recipe +export RECIPE_ROOT=$REPO_ROOT/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs1024/recipe cd $RECIPE_ROOT ``` diff --git a/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs1024/recipe/recipe/launch_script.sh b/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs1024/recipe/launch_script.sh similarity index 100% rename from training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs1024/recipe/recipe/launch_script.sh rename to training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs1024/recipe/launch_script.sh diff --git a/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs1024/recipe/recipe/sbatch_script.sh b/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs1024/recipe/sbatch_script.sh similarity index 100% rename from training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs1024/recipe/recipe/sbatch_script.sh rename to training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs1024/recipe/sbatch_script.sh diff --git a/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs1024/recipe/recipe/README.md b/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs128/recipe/README.md similarity index 95% rename from training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs1024/recipe/recipe/README.md rename to training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs128/recipe/README.md index eef7244b..a145f7d2 100644 --- a/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs1024/recipe/recipe/README.md +++ b/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs128/recipe/README.md @@ -65,7 +65,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a3u/llama3-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS1024/recipe +export RECIPE_ROOT=$REPO_ROOT/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs128/recipe cd $RECIPE_ROOT ``` diff --git a/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs128/recipe/recipe/launch_script.sh b/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs128/recipe/launch_script.sh similarity index 100% rename from training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs128/recipe/recipe/launch_script.sh rename to training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs128/recipe/launch_script.sh diff --git a/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs128/recipe/recipe/sbatch_script.sh b/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs128/recipe/sbatch_script.sh similarity index 100% rename from training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs128/recipe/recipe/sbatch_script.sh rename to training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs128/recipe/sbatch_script.sh diff --git a/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs128/recipe/recipe/README.md b/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs512/recipe/README.md similarity index 95% rename from training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs128/recipe/recipe/README.md rename to training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs512/recipe/README.md index 58753ca2..5a0e20e1 100644 --- a/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs128/recipe/recipe/README.md +++ b/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs512/recipe/README.md @@ -65,7 +65,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a3u/llama3-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS128/recipe +export RECIPE_ROOT=$REPO_ROOT/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs512/recipe cd $RECIPE_ROOT ``` diff --git a/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs512/recipe/recipe/launch_script.sh b/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs512/recipe/launch_script.sh similarity index 100% rename from training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs512/recipe/recipe/launch_script.sh rename to training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs512/recipe/launch_script.sh diff --git a/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs512/recipe/recipe/sbatch_script.sh b/training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs512/recipe/sbatch_script.sh similarity index 100% rename from training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs512/recipe/recipe/sbatch_script.sh rename to training/a3ultra/llama3_70b/megatron-bridge-slurm/nemo2511/32gpus-fp8cs-seq8192-gbs512/recipe/sbatch_script.sh diff --git a/training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs256/recipe/recipe/README.md b/training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs128/recipe/README.md similarity index 95% rename from training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs256/recipe/recipe/README.md rename to training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs128/recipe/README.md index ee14d40c..9c72e7bf 100644 --- a/training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs256/recipe/recipe/README.md +++ b/training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs128/recipe/README.md @@ -65,7 +65,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a3u/llama3-8b/megatron-bridge-pretraining-slurm/2node-FP8CS-GBS256/recipe +export RECIPE_ROOT=$REPO_ROOT/training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs128/recipe cd $RECIPE_ROOT ``` diff --git a/training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs128/recipe/recipe/launch_script.sh b/training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs128/recipe/launch_script.sh similarity index 100% rename from training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs128/recipe/recipe/launch_script.sh rename to training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs128/recipe/launch_script.sh diff --git a/training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs128/recipe/recipe/sbatch_script.sh b/training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs128/recipe/sbatch_script.sh similarity index 100% rename from training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs128/recipe/recipe/sbatch_script.sh rename to training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs128/recipe/sbatch_script.sh diff --git a/training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs128/recipe/recipe/README.md b/training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs256/recipe/README.md similarity index 95% rename from training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs128/recipe/recipe/README.md rename to training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs256/recipe/README.md index 0617a4cb..00ac230b 100644 --- a/training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs128/recipe/recipe/README.md +++ b/training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs256/recipe/README.md @@ -65,7 +65,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a3u/llama3-8b/megatron-bridge-pretraining-slurm/2node-FP8CS-GBS128/recipe +export RECIPE_ROOT=$REPO_ROOT/training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs256/recipe cd $RECIPE_ROOT ``` diff --git a/training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs256/recipe/recipe/launch_script.sh b/training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs256/recipe/launch_script.sh similarity index 100% rename from training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs256/recipe/recipe/launch_script.sh rename to training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs256/recipe/launch_script.sh diff --git a/training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs256/recipe/recipe/sbatch_script.sh b/training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs256/recipe/sbatch_script.sh similarity index 100% rename from training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs256/recipe/recipe/sbatch_script.sh rename to training/a3ultra/llama3_8b/megatron-bridge-slurm/nemo2511/16gpus-fp8cs-seq8192-gbs256/recipe/sbatch_script.sh diff --git a/training/a4/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs4096/recipe/recipe/README.md b/training/a4/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs4096/recipe/README.md similarity index 97% rename from training/a4/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs4096/recipe/recipe/README.md rename to training/a4/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs4096/recipe/README.md index 999d9964..0115391f 100644 --- a/training/a4/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs4096/recipe/recipe/README.md +++ b/training/a4/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs4096/recipe/README.md @@ -64,7 +64,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a4/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs4096/recipe/recipe +export RECIPE_ROOT=$REPO_ROOT/training/a4/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs4096/recipe cd $RECIPE_ROOT ``` diff --git a/training/a4/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs4096/recipe/recipe/launch_script.sh b/training/a4/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs4096/recipe/launch_script.sh similarity index 100% rename from training/a4/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs4096/recipe/recipe/launch_script.sh rename to training/a4/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs4096/recipe/launch_script.sh diff --git a/training/a4/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs4096/recipe/recipe/sbatch_script.sh b/training/a4/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs4096/recipe/sbatch_script.sh similarity index 100% rename from training/a4/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs4096/recipe/recipe/sbatch_script.sh rename to training/a4/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs4096/recipe/sbatch_script.sh diff --git a/training/a4x/llama3_8b/megatron-bridge-slurm/nemo2511/8gpus-fp8cs-seq8192-gbs128/recipe/recipe/README.md b/training/a4x/llama3_8b/megatron-bridge-slurm/nemo2511/8gpus-fp8cs-seq8192-gbs128/recipe/README.md similarity index 97% rename from training/a4x/llama3_8b/megatron-bridge-slurm/nemo2511/8gpus-fp8cs-seq8192-gbs128/recipe/recipe/README.md rename to training/a4x/llama3_8b/megatron-bridge-slurm/nemo2511/8gpus-fp8cs-seq8192-gbs128/recipe/README.md index ec848465..ff0be2d4 100644 --- a/training/a4x/llama3_8b/megatron-bridge-slurm/nemo2511/8gpus-fp8cs-seq8192-gbs128/recipe/recipe/README.md +++ b/training/a4x/llama3_8b/megatron-bridge-slurm/nemo2511/8gpus-fp8cs-seq8192-gbs128/recipe/README.md @@ -65,7 +65,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a4x/llama3_8b/megatron-bridge-slurm/nemo2511/8gpus-fp8cs-seq8192-gbs128/recipe/recipe +export RECIPE_ROOT=$REPO_ROOT/training/a4x/llama3_8b/megatron-bridge-slurm/nemo2511/8gpus-fp8cs-seq8192-gbs128/recipe cd $RECIPE_ROOT ``` diff --git a/training/a4x/llama3_8b/megatron-bridge-slurm/nemo2511/8gpus-fp8cs-seq8192-gbs128/recipe/recipe/launch_script.sh b/training/a4x/llama3_8b/megatron-bridge-slurm/nemo2511/8gpus-fp8cs-seq8192-gbs128/recipe/launch_script.sh similarity index 100% rename from training/a4x/llama3_8b/megatron-bridge-slurm/nemo2511/8gpus-fp8cs-seq8192-gbs128/recipe/recipe/launch_script.sh rename to training/a4x/llama3_8b/megatron-bridge-slurm/nemo2511/8gpus-fp8cs-seq8192-gbs128/recipe/launch_script.sh diff --git a/training/a4x/llama3_8b/megatron-bridge-slurm/nemo2511/8gpus-fp8cs-seq8192-gbs128/recipe/recipe/sbatch_script.sh b/training/a4x/llama3_8b/megatron-bridge-slurm/nemo2511/8gpus-fp8cs-seq8192-gbs128/recipe/sbatch_script.sh similarity index 100% rename from training/a4x/llama3_8b/megatron-bridge-slurm/nemo2511/8gpus-fp8cs-seq8192-gbs128/recipe/recipe/sbatch_script.sh rename to training/a4x/llama3_8b/megatron-bridge-slurm/nemo2511/8gpus-fp8cs-seq8192-gbs128/recipe/sbatch_script.sh diff --git a/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe/recipe/README.md b/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe/README.md similarity index 97% rename from training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe/recipe/README.md rename to training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe/README.md index 7dd7cb99..26b68edc 100644 --- a/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe/recipe/README.md +++ b/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe/README.md @@ -61,7 +61,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe/recipe +export RECIPE_ROOT=$REPO_ROOT/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe cd $RECIPE_ROOT ``` diff --git a/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe/recipe/custom_setup_experiment.py b/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe/custom_setup_experiment.py similarity index 100% rename from training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe/recipe/custom_setup_experiment.py rename to training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe/custom_setup_experiment.py diff --git a/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe/recipe/launch_script.sh b/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe/launch_script.sh similarity index 100% rename from training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe/recipe/launch_script.sh rename to training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe/launch_script.sh diff --git a/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe/recipe/sbatch_script.sh b/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe/sbatch_script.sh similarity index 100% rename from training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe/recipe/sbatch_script.sh rename to training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/128gpus-bf16-seq4096-gbs2048/recipe/sbatch_script.sh diff --git a/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe/recipe/README.md b/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe/README.md similarity index 97% rename from training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe/recipe/README.md rename to training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe/README.md index 7db1a90c..f82454d4 100644 --- a/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe/recipe/README.md +++ b/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe/README.md @@ -61,7 +61,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe/recipe +export RECIPE_ROOT=$REPO_ROOT/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe cd $RECIPE_ROOT ``` diff --git a/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe/recipe/custom_setup_experiment.py b/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe/custom_setup_experiment.py similarity index 100% rename from training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe/recipe/custom_setup_experiment.py rename to training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe/custom_setup_experiment.py diff --git a/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe/recipe/launch_script.sh b/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe/launch_script.sh similarity index 100% rename from training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe/recipe/launch_script.sh rename to training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe/launch_script.sh diff --git a/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe/recipe/sbatch_script.sh b/training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe/sbatch_script.sh similarity index 100% rename from training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe/recipe/sbatch_script.sh rename to training/a4x/qwen3_235b_a22b/megatron-bridge-slurm/nemo2511/64gpus-bf16-seq4096-gbs1024/recipe/sbatch_script.sh diff --git a/training/a4x/qwen3_30b_a3b/megatron-bridge-slurm/nemo2511/8gpus-fp8mx-seq4096-gbs512/recipe/recipe/README.md b/training/a4x/qwen3_30b_a3b/megatron-bridge-slurm/nemo2511/8gpus-fp8mx-seq4096-gbs512/recipe/README.md similarity index 97% rename from training/a4x/qwen3_30b_a3b/megatron-bridge-slurm/nemo2511/8gpus-fp8mx-seq4096-gbs512/recipe/recipe/README.md rename to training/a4x/qwen3_30b_a3b/megatron-bridge-slurm/nemo2511/8gpus-fp8mx-seq4096-gbs512/recipe/README.md index 2225895f..39b60355 100644 --- a/training/a4x/qwen3_30b_a3b/megatron-bridge-slurm/nemo2511/8gpus-fp8mx-seq4096-gbs512/recipe/recipe/README.md +++ b/training/a4x/qwen3_30b_a3b/megatron-bridge-slurm/nemo2511/8gpus-fp8mx-seq4096-gbs512/recipe/README.md @@ -65,7 +65,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a4x/qwen3_30b_a3b/megatron-bridge-slurm/nemo2511/8gpus-fp8mx-seq4096-gbs512/recipe/recipe +export RECIPE_ROOT=$REPO_ROOT/training/a4x/qwen3_30b_a3b/megatron-bridge-slurm/nemo2511/8gpus-fp8mx-seq4096-gbs512/recipe cd $RECIPE_ROOT ``` diff --git a/training/a4x/qwen3_30b_a3b/megatron-bridge-slurm/nemo2511/8gpus-fp8mx-seq4096-gbs512/recipe/recipe/launch_script.sh b/training/a4x/qwen3_30b_a3b/megatron-bridge-slurm/nemo2511/8gpus-fp8mx-seq4096-gbs512/recipe/launch_script.sh similarity index 100% rename from training/a4x/qwen3_30b_a3b/megatron-bridge-slurm/nemo2511/8gpus-fp8mx-seq4096-gbs512/recipe/recipe/launch_script.sh rename to training/a4x/qwen3_30b_a3b/megatron-bridge-slurm/nemo2511/8gpus-fp8mx-seq4096-gbs512/recipe/launch_script.sh diff --git a/training/a4x/qwen3_30b_a3b/megatron-bridge-slurm/nemo2511/8gpus-fp8mx-seq4096-gbs512/recipe/recipe/sbatch_script.sh b/training/a4x/qwen3_30b_a3b/megatron-bridge-slurm/nemo2511/8gpus-fp8mx-seq4096-gbs512/recipe/sbatch_script.sh similarity index 100% rename from training/a4x/qwen3_30b_a3b/megatron-bridge-slurm/nemo2511/8gpus-fp8mx-seq4096-gbs512/recipe/recipe/sbatch_script.sh rename to training/a4x/qwen3_30b_a3b/megatron-bridge-slurm/nemo2511/8gpus-fp8mx-seq4096-gbs512/recipe/sbatch_script.sh diff --git a/training/a4x/wan_14b/nemo-slurm/nemo2511/32gpus-bf16-gbs64/recipe/recipe/README.md b/training/a4x/wan_14b/nemo-slurm/nemo2511/32gpus-bf16-gbs64/recipe/README.md similarity index 98% rename from training/a4x/wan_14b/nemo-slurm/nemo2511/32gpus-bf16-gbs64/recipe/recipe/README.md rename to training/a4x/wan_14b/nemo-slurm/nemo2511/32gpus-bf16-gbs64/recipe/README.md index d97a1df8..9ec90027 100644 --- a/training/a4x/wan_14b/nemo-slurm/nemo2511/32gpus-bf16-gbs64/recipe/recipe/README.md +++ b/training/a4x/wan_14b/nemo-slurm/nemo2511/32gpus-bf16-gbs64/recipe/README.md @@ -65,7 +65,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a4x/wan_14b/nemo-slurm/nemo2511/32gpus-bf16-gbs64/recipe/recipe +export RECIPE_ROOT=$REPO_ROOT/training/a4x/wan_14b/nemo-slurm/nemo2511/32gpus-bf16-gbs64/recipe cd $RECIPE_ROOT ``` diff --git a/training/a4x/wan_14b/nemo-slurm/nemo2511/32gpus-bf16-gbs64/recipe/recipe/wan_14b_benchmark.sh b/training/a4x/wan_14b/nemo-slurm/nemo2511/32gpus-bf16-gbs64/recipe/wan_14b_benchmark.sh similarity index 100% rename from training/a4x/wan_14b/nemo-slurm/nemo2511/32gpus-bf16-gbs64/recipe/recipe/wan_14b_benchmark.sh rename to training/a4x/wan_14b/nemo-slurm/nemo2511/32gpus-bf16-gbs64/recipe/wan_14b_benchmark.sh