From 61b3f8a01596adf51ed884f10e263453619d4ad6 Mon Sep 17 00:00:00 2001 From: Kuo Wei Date: Fri, 8 May 2026 23:38:47 +0800 Subject: [PATCH] Restructure remaining GKE training recipes to align with standard naming convention This commit restructures the remaining non-compliant GKE recipes to comply with the new standardized folder and file naming convention. Key changes: - Moved specific GKE recipes in a3mega, a3ultra, and a4 to follow the standard structure. - Standardized framework names from *-pretraining-gke to *-gke. - Unified model names to maintain consistency. - Standardized scale parameter folder names where parameters were explicitly defined. - Structured paths up to the version level where parameters could not be definitively verified to avoid ambiguity. - Updated references in files to point to the new paths. --- .../nemo-gke/nemo2507/recipe}/README.md | 2 +- .../nemo-gke/nemo2507/recipe}/values.yaml | 0 .../nemo-gke-gcs/nemo2507/recipe}/README.md | 2 +- .../nemo-gke-gcs/nemo2507/recipe}/values.yaml | 0 .../nemo-gke-resiliency/nemo2507/recipe}/README.md | 2 +- .../nemo-gke-resiliency/nemo2507/recipe}/goodput-guide.md | 2 +- .../nemo-gke-resiliency/nemo2507/recipe}/ksa-setup.yaml | 0 .../nemo-gke-resiliency/nemo2507/recipe}/kueue-merge-patch.yaml | 0 .../nemo-gke-resiliency/nemo2507/recipe}/train.py | 0 .../nemo-gke-resiliency/nemo2507/recipe}/values-gcs.yaml | 0 .../nemo-gke-resiliency/nemo2507/recipe}/values-supervisor.yaml | 0 .../nemo-gke-resiliency/nemo2507/recipe}/values.yaml | 0 .../nemo-gke/nemo2507/128gpus-bf16/recipe}/README.md | 2 +- .../nemo-gke/nemo2507/128gpus-bf16/recipe}/values.yaml | 0 .../nemo-gke/nemo2507/256gpus-bf16/recipe}/README.md | 2 +- .../nemo-gke/nemo2507/256gpus-bf16/recipe}/values.yaml | 0 .../nemo-gke/nemo2507/recipe}/README.md | 2 +- .../nemo-gke/nemo2507/recipe}/values.yaml | 0 .../nemo2602/64gpus-bf16-gbs1280}/recipe/Chart.yaml | 0 .../nemo2602/64gpus-bf16-gbs1280}/recipe/README.md | 2 +- 
.../64gpus-bf16-gbs1280}/recipe/custom_setup_experiment.py | 0 .../nemo2602/64gpus-bf16-gbs1280}/recipe/launcher.sh | 0 .../recipe/templates/workload-config-configmap.yaml | 0 .../64gpus-bf16-gbs1280}/recipe/templates/workload-job.yaml | 0 .../recipe/templates/workload-launcher-configmap.yaml | 0 .../64gpus-bf16-gbs1280}/recipe/templates/workload-svc.yaml | 0 .../nemo2602/64gpus-bf16-gbs1280}/recipe/values.yaml | 0 .../nemo-gke-resiliency/nemo2412/recipe}/README.md | 2 +- .../nemo-gke-resiliency/nemo2412/recipe}/goodput-guide.md | 2 +- .../nemo-gke-resiliency/nemo2412/recipe}/ksa-setup.yaml | 0 .../nemo-gke-resiliency/nemo2412/recipe}/kueue-merge-patch.yaml | 0 .../nemo-gke-resiliency/nemo2412/recipe}/train.py | 0 .../nemo-gke-resiliency/nemo2412/recipe}/values-gcs.yaml | 0 .../nemo-gke-resiliency/nemo2412/recipe}/values-supervisor.yaml | 0 .../nemo-gke-resiliency/nemo2412/recipe}/values.yaml | 0 .../nemo-gke/nemo2412/recipe}/README.md | 2 +- .../nemo-gke/nemo2412/recipe}/values.yaml | 0 .../nemo-gke/nemo2407/recipe}/README.md | 2 +- .../nemo-gke/nemo2407/recipe}/values.yaml | 0 .../nemo-gke-resiliency/nemo2407/recipe}/README.md | 2 +- .../nemo-gke-resiliency/nemo2407/recipe}/goodput-guide.md | 2 +- .../nemo-gke-resiliency/nemo2407/recipe}/ksa-setup.yaml | 0 .../nemo-gke-resiliency/nemo2407/recipe}/kueue-merge-patch.yaml | 0 .../nemo-gke-resiliency/nemo2407/recipe}/train.py | 0 .../nemo-gke-resiliency/nemo2407/recipe}/values-gcs.yaml | 0 .../nemo-gke-resiliency/nemo2407/recipe}/values-supervisor.yaml | 0 .../nemo-gke-resiliency/nemo2407/recipe}/values.yaml | 0 .../nemo-gke/nemo2407/recipe}/README.md | 2 +- .../nemo-gke/nemo2407/recipe}/values.yaml | 0 .../nemo2602/16gpus-bf16-gbs1024}/recipe/Chart.yaml | 0 .../nemo2602/16gpus-bf16-gbs1024}/recipe/README.md | 2 +- .../16gpus-bf16-gbs1024}/recipe/custom_setup_experiment.py | 0 .../nemo2602/16gpus-bf16-gbs1024}/recipe/launcher.sh | 0 .../recipe/templates/workload-config-configmap.yaml | 0 
.../16gpus-bf16-gbs1024}/recipe/templates/workload-job.yaml | 0 .../recipe/templates/workload-launcher-configmap.yaml | 0 .../16gpus-bf16-gbs1024}/recipe/templates/workload-svc.yaml | 0 .../nemo2602/16gpus-bf16-gbs1024}/recipe/values.yaml | 0 .../nemo2602/16gpus-fp8cs-gbs1024}/recipe/Chart.yaml | 0 .../nemo2602/16gpus-fp8cs-gbs1024}/recipe/README.md | 2 +- .../16gpus-fp8cs-gbs1024}/recipe/custom_setup_experiment.py | 0 .../nemo2602/16gpus-fp8cs-gbs1024}/recipe/launcher.sh | 0 .../recipe/templates/workload-config-configmap.yaml | 0 .../16gpus-fp8cs-gbs1024}/recipe/templates/workload-job.yaml | 0 .../recipe/templates/workload-launcher-configmap.yaml | 0 .../16gpus-fp8cs-gbs1024}/recipe/templates/workload-svc.yaml | 0 .../nemo2602/16gpus-fp8cs-gbs1024}/recipe/values.yaml | 0 .../nemo2602/8gpus-fp8cs-seq8192-gbs128/recipe/README.md | 2 +- .../nemo2507/128gpus-fp8-gbs128/recipe/{ => recipe}/Chart.yaml | 0 .../nemo2507/128gpus-fp8-gbs128/recipe/{ => recipe}/README.md | 0 .../nemo2507/128gpus-fp8-gbs128/recipe/{ => recipe}/launcher.sh | 0 .../{ => recipe}/llama3-1-405b-seq8192-gbs128-mbs1-gpus128.py | 0 .../{ => recipe}/templates/workload-config-configmap.yaml | 0 .../recipe/{ => recipe}/templates/workload-job.yaml | 0 .../{ => recipe}/templates/workload-launcher-configmap.yaml | 0 .../recipe/{ => recipe}/templates/workload-svc.yaml | 0 .../nemo2507/128gpus-fp8-gbs128/recipe/{ => recipe}/values.yaml | 0 .../nemo2507/256gpus-fp8-gbs256/recipe/{ => recipe}/Chart.yaml | 0 .../nemo2507/256gpus-fp8-gbs256/recipe/{ => recipe}/README.md | 0 .../nemo2507/256gpus-fp8-gbs256/recipe/{ => recipe}/launcher.sh | 0 .../{ => recipe}/llama3-1-405b-seq8192-gbs256-mbs1-gpus256.py | 0 .../{ => recipe}/templates/workload-config-configmap.yaml | 0 .../recipe/{ => recipe}/templates/workload-job.yaml | 0 .../{ => recipe}/templates/workload-launcher-configmap.yaml | 0 .../recipe/{ => recipe}/templates/workload-svc.yaml | 0 .../nemo2507/256gpus-fp8-gbs256/recipe/{ => recipe}/values.yaml | 0 
.../nemo2507/64gpus-fp8-gbs256/recipe/{ => recipe}/Chart.yaml | 0 .../nemo2507/64gpus-fp8-gbs256/recipe/{ => recipe}/README.md | 0 .../nemo2507/64gpus-fp8-gbs256/recipe/{ => recipe}/launcher.sh | 0 .../{ => recipe}/llama3-1-405b-seq8192-gbs2048-mbs1-gpus64.py | 0 .../{ => recipe}/templates/workload-config-configmap.yaml | 0 .../recipe/{ => recipe}/templates/workload-job.yaml | 0 .../{ => recipe}/templates/workload-launcher-configmap.yaml | 0 .../recipe/{ => recipe}/templates/workload-svc.yaml | 0 .../nemo2507/64gpus-fp8-gbs256/recipe/{ => recipe}/values.yaml | 0 .../nemo-gke/nemo2602/256gpus-fp8cs-gbs256/recipe/README.md | 2 +- .../nemo-gke/nemo2507/256gpus-bf16-gbs256/recipe/README.md | 2 +- .../nemo-gke/nemo2507/recipe}/README.md | 2 +- .../nemo-gke/nemo2507/recipe}/values.yaml | 0 .../nemo2602/8gpus-fp8mx-seq4096-gbs512/recipe/README.md | 2 +- .../llama3_70b/nemo-gke/nemo2602/64gpus-bf16-gbs256/README.md | 2 +- .../llama3_70b/nemo-gke/nemo2602/64gpus-fp8cs-gbs256/README.md | 2 +- .../llama3_70b/nemo-gke/nemo2602/64gpus-fp8mx-gbs256/README.md | 2 +- .../nemo2507/4gpus-bf16-gbs32/recipe}/Chart.yaml | 0 .../nemo2507/4gpus-bf16-gbs32/recipe}/README.md | 2 +- .../nemo2507/4gpus-bf16-gbs32/recipe}/launcher.sh | 0 .../4gpus-bf16-gbs32/recipe}/llama3-1-70b-fine-tuning.py | 0 .../recipe}/templates/workload-config-configmap.yaml | 0 .../4gpus-bf16-gbs32/recipe}/templates/workload-job.yaml | 0 .../recipe}/templates/workload-launcher-configmap.yaml | 0 .../4gpus-bf16-gbs32/recipe}/templates/workload-svc.yaml | 0 .../nemo2507/4gpus-bf16-gbs32/recipe}/values.yaml | 0 .../nemo2507/8gpus-bf16-gbs32/recipe}/Chart.yaml | 0 .../nemo2507/8gpus-bf16-gbs32/recipe}/README.md | 2 +- .../nemo2507/8gpus-bf16-gbs32/recipe}/launcher.sh | 0 .../8gpus-bf16-gbs32/recipe}/llama3-1-70b-fine-tuning.py | 0 .../recipe}/templates/workload-config-configmap.yaml | 0 .../8gpus-bf16-gbs32/recipe}/templates/workload-job.yaml | 0 .../recipe}/templates/workload-launcher-configmap.yaml | 0 
.../8gpus-bf16-gbs32/recipe}/templates/workload-svc.yaml | 0 .../nemo2507/8gpus-bf16-gbs32/recipe}/values.yaml | 0 121 files changed, 27 insertions(+), 27 deletions(-) rename training/a3mega/{gpt3-175b/nemo-pretraining-gke => gpt3_175b/nemo-gke/nemo2507/recipe}/README.md (99%) rename training/a3mega/{gpt3-175b/nemo-pretraining-gke => gpt3_175b/nemo-gke/nemo2507/recipe}/values.yaml (100%) rename training/a3mega/{llama3-1-70b/nemo-pretraining-gke-gcs => llama3_70b/nemo-gke-gcs/nemo2507/recipe}/README.md (98%) rename training/a3mega/{llama3-1-70b/nemo-pretraining-gke-gcs => llama3_70b/nemo-gke-gcs/nemo2507/recipe}/values.yaml (100%) rename training/a3mega/{llama3-1-70b/nemo-pretraining-gke-resiliency => llama3_70b/nemo-gke-resiliency/nemo2507/recipe}/README.md (99%) rename training/a3mega/{llama3-1-70b/nemo-pretraining-gke-resiliency => llama3_70b/nemo-gke-resiliency/nemo2507/recipe}/goodput-guide.md (98%) rename training/a3mega/{llama3-1-70b/nemo-pretraining-gke-resiliency => llama3_70b/nemo-gke-resiliency/nemo2507/recipe}/ksa-setup.yaml (100%) rename training/a3mega/{llama3-1-70b/nemo-pretraining-gke-resiliency => llama3_70b/nemo-gke-resiliency/nemo2507/recipe}/kueue-merge-patch.yaml (100%) rename training/a3mega/{llama3-1-70b/nemo-pretraining-gke-resiliency => llama3_70b/nemo-gke-resiliency/nemo2507/recipe}/train.py (100%) rename training/a3mega/{llama3-1-70b/nemo-pretraining-gke-resiliency => llama3_70b/nemo-gke-resiliency/nemo2507/recipe}/values-gcs.yaml (100%) rename training/a3mega/{llama3-1-70b/nemo-pretraining-gke-resiliency => llama3_70b/nemo-gke-resiliency/nemo2507/recipe}/values-supervisor.yaml (100%) rename training/a3mega/{llama3-1-70b/nemo-pretraining-gke-resiliency => llama3_70b/nemo-gke-resiliency/nemo2507/recipe}/values.yaml (100%) rename training/a3mega/{llama3-70b/nemo-pretraining-gke => llama3_70b/nemo-gke/nemo2507/128gpus-bf16/recipe}/README.md (98%) rename training/a3mega/{llama3-70b/nemo-pretraining-gke => 
llama3_70b/nemo-gke/nemo2507/128gpus-bf16/recipe}/values.yaml (100%) rename training/a3mega/{llama3-1-70b/nemo-pretraining-gke => llama3_70b/nemo-gke/nemo2507/256gpus-bf16/recipe}/README.md (98%) rename training/a3mega/{llama3-1-70b/nemo-pretraining-gke => llama3_70b/nemo-gke/nemo2507/256gpus-bf16/recipe}/values.yaml (100%) rename training/a3mega/{mixtral-8x7b/nemo-pretraining-gke => mixtral_8x7b/nemo-gke/nemo2507/recipe}/README.md (98%) rename training/a3mega/{mixtral-8x7b/nemo-pretraining-gke => mixtral_8x7b/nemo-gke/nemo2507/recipe}/values.yaml (100%) rename training/a3ultra/gpt_oss_120b/{nemo-pretraining-gke/nemo2602/64gpus_bf16_1280gbs => nemo-gke/nemo2602/64gpus-bf16-gbs1280}/recipe/Chart.yaml (100%) rename training/a3ultra/gpt_oss_120b/{nemo-pretraining-gke/nemo2602/64gpus_bf16_1280gbs => nemo-gke/nemo2602/64gpus-bf16-gbs1280}/recipe/README.md (97%) rename training/a3ultra/gpt_oss_120b/{nemo-pretraining-gke/nemo2602/64gpus_bf16_1280gbs => nemo-gke/nemo2602/64gpus-bf16-gbs1280}/recipe/custom_setup_experiment.py (100%) rename training/a3ultra/gpt_oss_120b/{nemo-pretraining-gke/nemo2602/64gpus_bf16_1280gbs => nemo-gke/nemo2602/64gpus-bf16-gbs1280}/recipe/launcher.sh (100%) rename training/a3ultra/gpt_oss_120b/{nemo-pretraining-gke/nemo2602/64gpus_bf16_1280gbs => nemo-gke/nemo2602/64gpus-bf16-gbs1280}/recipe/templates/workload-config-configmap.yaml (100%) rename training/a3ultra/gpt_oss_120b/{nemo-pretraining-gke/nemo2602/64gpus_bf16_1280gbs => nemo-gke/nemo2602/64gpus-bf16-gbs1280}/recipe/templates/workload-job.yaml (100%) rename training/a3ultra/gpt_oss_120b/{nemo-pretraining-gke/nemo2602/64gpus_bf16_1280gbs => nemo-gke/nemo2602/64gpus-bf16-gbs1280}/recipe/templates/workload-launcher-configmap.yaml (100%) rename training/a3ultra/gpt_oss_120b/{nemo-pretraining-gke/nemo2602/64gpus_bf16_1280gbs => nemo-gke/nemo2602/64gpus-bf16-gbs1280}/recipe/templates/workload-svc.yaml (100%) rename training/a3ultra/gpt_oss_120b/{nemo-pretraining-gke/nemo2602/64gpus_bf16_1280gbs 
=> nemo-gke/nemo2602/64gpus-bf16-gbs1280}/recipe/values.yaml (100%) rename training/a3ultra/{llama3-1-405b/nemo-pretraining-gke-resiliency => llama31_405b/nemo-gke-resiliency/nemo2412/recipe}/README.md (99%) rename training/a3ultra/{llama3-1-405b/nemo-pretraining-gke-resiliency => llama31_405b/nemo-gke-resiliency/nemo2412/recipe}/goodput-guide.md (98%) rename training/a3ultra/{llama3-1-405b/nemo-pretraining-gke-resiliency => llama31_405b/nemo-gke-resiliency/nemo2412/recipe}/ksa-setup.yaml (100%) rename training/a3ultra/{llama3-1-405b/nemo-pretraining-gke-resiliency => llama31_405b/nemo-gke-resiliency/nemo2412/recipe}/kueue-merge-patch.yaml (100%) rename training/a3ultra/{llama3-1-405b/nemo-pretraining-gke-resiliency => llama31_405b/nemo-gke-resiliency/nemo2412/recipe}/train.py (100%) rename training/a3ultra/{llama3-1-405b/nemo-pretraining-gke-resiliency => llama31_405b/nemo-gke-resiliency/nemo2412/recipe}/values-gcs.yaml (100%) rename training/a3ultra/{llama3-1-405b/nemo-pretraining-gke-resiliency => llama31_405b/nemo-gke-resiliency/nemo2412/recipe}/values-supervisor.yaml (100%) rename training/a3ultra/{llama3-1-405b/nemo-pretraining-gke-resiliency => llama31_405b/nemo-gke-resiliency/nemo2412/recipe}/values.yaml (100%) rename training/a3ultra/{llama3-1-405b/nemo-pretraining-gke => llama31_405b/nemo-gke/nemo2412/recipe}/README.md (99%) rename training/a3ultra/{llama3-1-405b/nemo-pretraining-gke => llama31_405b/nemo-gke/nemo2412/recipe}/values.yaml (100%) rename training/a3ultra/{llama3-1-70b/nemo-pretraining-gke => llama3_70b/nemo-gke/nemo2407/recipe}/README.md (99%) rename training/a3ultra/{llama3-1-70b/nemo-pretraining-gke => llama3_70b/nemo-gke/nemo2407/recipe}/values.yaml (100%) rename training/a3ultra/{mixtral-8x7b/nemo-pretraining-gke-resiliency => mixtral_8x7b/nemo-gke-resiliency/nemo2407/recipe}/README.md (99%) rename training/a3ultra/{mixtral-8x7b/nemo-pretraining-gke-resiliency => mixtral_8x7b/nemo-gke-resiliency/nemo2407/recipe}/goodput-guide.md (98%) 
rename training/a3ultra/{mixtral-8x7b/nemo-pretraining-gke-resiliency => mixtral_8x7b/nemo-gke-resiliency/nemo2407/recipe}/ksa-setup.yaml (100%) rename training/a3ultra/{mixtral-8x7b/nemo-pretraining-gke-resiliency => mixtral_8x7b/nemo-gke-resiliency/nemo2407/recipe}/kueue-merge-patch.yaml (100%) rename training/a3ultra/{mixtral-8x7b/nemo-pretraining-gke-resiliency => mixtral_8x7b/nemo-gke-resiliency/nemo2407/recipe}/train.py (100%) rename training/a3ultra/{mixtral-8x7b/nemo-pretraining-gke-resiliency => mixtral_8x7b/nemo-gke-resiliency/nemo2407/recipe}/values-gcs.yaml (100%) rename training/a3ultra/{mixtral-8x7b/nemo-pretraining-gke-resiliency => mixtral_8x7b/nemo-gke-resiliency/nemo2407/recipe}/values-supervisor.yaml (100%) rename training/a3ultra/{mixtral-8x7b/nemo-pretraining-gke-resiliency => mixtral_8x7b/nemo-gke-resiliency/nemo2407/recipe}/values.yaml (100%) rename training/a3ultra/{mixtral-8x7b/nemo-pretraining-gke => mixtral_8x7b/nemo-gke/nemo2407/recipe}/README.md (99%) rename training/a3ultra/{mixtral-8x7b/nemo-pretraining-gke => mixtral_8x7b/nemo-gke/nemo2407/recipe}/values.yaml (100%) rename training/a3ultra/qwen3_30b_a3b/{nemo-pretraining-gke/nemo2602/16gpus_bf16_1024gbs => nemo-gke/nemo2602/16gpus-bf16-gbs1024}/recipe/Chart.yaml (100%) rename training/a3ultra/qwen3_30b_a3b/{nemo-pretraining-gke/nemo2602/16gpus_bf16_1024gbs => nemo-gke/nemo2602/16gpus-bf16-gbs1024}/recipe/README.md (97%) rename training/a3ultra/qwen3_30b_a3b/{nemo-pretraining-gke/nemo2602/16gpus_bf16_1024gbs => nemo-gke/nemo2602/16gpus-bf16-gbs1024}/recipe/custom_setup_experiment.py (100%) rename training/a3ultra/qwen3_30b_a3b/{nemo-pretraining-gke/nemo2602/16gpus_bf16_1024gbs => nemo-gke/nemo2602/16gpus-bf16-gbs1024}/recipe/launcher.sh (100%) rename training/a3ultra/qwen3_30b_a3b/{nemo-pretraining-gke/nemo2602/16gpus_bf16_1024gbs => nemo-gke/nemo2602/16gpus-bf16-gbs1024}/recipe/templates/workload-config-configmap.yaml (100%) rename 
training/a3ultra/qwen3_30b_a3b/{nemo-pretraining-gke/nemo2602/16gpus_bf16_1024gbs => nemo-gke/nemo2602/16gpus-bf16-gbs1024}/recipe/templates/workload-job.yaml (100%) rename training/a3ultra/qwen3_30b_a3b/{nemo-pretraining-gke/nemo2602/16gpus_bf16_1024gbs => nemo-gke/nemo2602/16gpus-bf16-gbs1024}/recipe/templates/workload-launcher-configmap.yaml (100%) rename training/a3ultra/qwen3_30b_a3b/{nemo-pretraining-gke/nemo2602/16gpus_bf16_1024gbs => nemo-gke/nemo2602/16gpus-bf16-gbs1024}/recipe/templates/workload-svc.yaml (100%) rename training/a3ultra/qwen3_30b_a3b/{nemo-pretraining-gke/nemo2602/16gpus_bf16_1024gbs => nemo-gke/nemo2602/16gpus-bf16-gbs1024}/recipe/values.yaml (100%) rename training/a3ultra/qwen3_30b_a3b/{nemo-pretraining-gke/nemo2602/16gpus_fp8cs_1024gbs => nemo-gke/nemo2602/16gpus-fp8cs-gbs1024}/recipe/Chart.yaml (100%) rename training/a3ultra/qwen3_30b_a3b/{nemo-pretraining-gke/nemo2602/16gpus_fp8cs_1024gbs => nemo-gke/nemo2602/16gpus-fp8cs-gbs1024}/recipe/README.md (97%) rename training/a3ultra/qwen3_30b_a3b/{nemo-pretraining-gke/nemo2602/16gpus_fp8cs_1024gbs => nemo-gke/nemo2602/16gpus-fp8cs-gbs1024}/recipe/custom_setup_experiment.py (100%) rename training/a3ultra/qwen3_30b_a3b/{nemo-pretraining-gke/nemo2602/16gpus_fp8cs_1024gbs => nemo-gke/nemo2602/16gpus-fp8cs-gbs1024}/recipe/launcher.sh (100%) rename training/a3ultra/qwen3_30b_a3b/{nemo-pretraining-gke/nemo2602/16gpus_fp8cs_1024gbs => nemo-gke/nemo2602/16gpus-fp8cs-gbs1024}/recipe/templates/workload-config-configmap.yaml (100%) rename training/a3ultra/qwen3_30b_a3b/{nemo-pretraining-gke/nemo2602/16gpus_fp8cs_1024gbs => nemo-gke/nemo2602/16gpus-fp8cs-gbs1024}/recipe/templates/workload-job.yaml (100%) rename training/a3ultra/qwen3_30b_a3b/{nemo-pretraining-gke/nemo2602/16gpus_fp8cs_1024gbs => nemo-gke/nemo2602/16gpus-fp8cs-gbs1024}/recipe/templates/workload-launcher-configmap.yaml (100%) rename training/a3ultra/qwen3_30b_a3b/{nemo-pretraining-gke/nemo2602/16gpus_fp8cs_1024gbs => 
nemo-gke/nemo2602/16gpus-fp8cs-gbs1024}/recipe/templates/workload-svc.yaml (100%) rename training/a3ultra/qwen3_30b_a3b/{nemo-pretraining-gke/nemo2602/16gpus_fp8cs_1024gbs => nemo-gke/nemo2602/16gpus-fp8cs-gbs1024}/recipe/values.yaml (100%) rename training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/{ => recipe}/Chart.yaml (100%) rename training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/{ => recipe}/README.md (100%) rename training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/{ => recipe}/launcher.sh (100%) rename training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/{ => recipe}/llama3-1-405b-seq8192-gbs128-mbs1-gpus128.py (100%) rename training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/{ => recipe}/templates/workload-config-configmap.yaml (100%) rename training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/{ => recipe}/templates/workload-job.yaml (100%) rename training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/{ => recipe}/templates/workload-launcher-configmap.yaml (100%) rename training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/{ => recipe}/templates/workload-svc.yaml (100%) rename training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/{ => recipe}/values.yaml (100%) rename training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/{ => recipe}/Chart.yaml (100%) rename training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/{ => recipe}/README.md (100%) rename training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/{ => recipe}/launcher.sh (100%) rename training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/{ => recipe}/llama3-1-405b-seq8192-gbs256-mbs1-gpus256.py (100%) rename training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/{ => recipe}/templates/workload-config-configmap.yaml (100%) rename 
training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/{ => recipe}/templates/workload-job.yaml (100%) rename training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/{ => recipe}/templates/workload-launcher-configmap.yaml (100%) rename training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/{ => recipe}/templates/workload-svc.yaml (100%) rename training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/{ => recipe}/values.yaml (100%) rename training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/{ => recipe}/Chart.yaml (100%) rename training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/{ => recipe}/README.md (100%) rename training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/{ => recipe}/launcher.sh (100%) rename training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/{ => recipe}/llama3-1-405b-seq8192-gbs2048-mbs1-gpus64.py (100%) rename training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/{ => recipe}/templates/workload-config-configmap.yaml (100%) rename training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/{ => recipe}/templates/workload-job.yaml (100%) rename training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/{ => recipe}/templates/workload-launcher-configmap.yaml (100%) rename training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/{ => recipe}/templates/workload-svc.yaml (100%) rename training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/{ => recipe}/values.yaml (100%) rename training/a4/{mixtral-8x7b/nemo-pretraining-gke => mixtral_8x7b/nemo-gke/nemo2507/recipe}/README.md (99%) rename training/a4/{mixtral-8x7b/nemo-pretraining-gke => mixtral_8x7b/nemo-gke/nemo2507/recipe}/values.yaml (100%) rename training/g4/{llama3-1-70b/nemo-finetuning-gke/4gpu-bf16 => llama3_70b/nemo-finetuning-gke/nemo2507/4gpus-bf16-gbs32/recipe}/Chart.yaml (100%) rename 
training/g4/{llama3-1-70b/nemo-finetuning-gke/4gpu-bf16 => llama3_70b/nemo-finetuning-gke/nemo2507/4gpus-bf16-gbs32/recipe}/README.md (97%) rename training/g4/{llama3-1-70b/nemo-finetuning-gke/4gpu-bf16 => llama3_70b/nemo-finetuning-gke/nemo2507/4gpus-bf16-gbs32/recipe}/launcher.sh (100%) rename training/g4/{llama3-1-70b/nemo-finetuning-gke/4gpu-bf16 => llama3_70b/nemo-finetuning-gke/nemo2507/4gpus-bf16-gbs32/recipe}/llama3-1-70b-fine-tuning.py (100%) rename training/g4/{llama3-1-70b/nemo-finetuning-gke/4gpu-bf16 => llama3_70b/nemo-finetuning-gke/nemo2507/4gpus-bf16-gbs32/recipe}/templates/workload-config-configmap.yaml (100%) rename training/g4/{llama3-1-70b/nemo-finetuning-gke/4gpu-bf16 => llama3_70b/nemo-finetuning-gke/nemo2507/4gpus-bf16-gbs32/recipe}/templates/workload-job.yaml (100%) rename training/g4/{llama3-1-70b/nemo-finetuning-gke/4gpu-bf16 => llama3_70b/nemo-finetuning-gke/nemo2507/4gpus-bf16-gbs32/recipe}/templates/workload-launcher-configmap.yaml (100%) rename training/g4/{llama3-1-70b/nemo-finetuning-gke/4gpu-bf16 => llama3_70b/nemo-finetuning-gke/nemo2507/4gpus-bf16-gbs32/recipe}/templates/workload-svc.yaml (100%) rename training/g4/{llama3-1-70b/nemo-finetuning-gke/4gpu-bf16 => llama3_70b/nemo-finetuning-gke/nemo2507/4gpus-bf16-gbs32/recipe}/values.yaml (100%) rename training/g4/{llama3-1-70b/nemo-finetuning-gke/8gpu-bf16 => llama3_70b/nemo-finetuning-gke/nemo2507/8gpus-bf16-gbs32/recipe}/Chart.yaml (100%) rename training/g4/{llama3-1-70b/nemo-finetuning-gke/8gpu-bf16 => llama3_70b/nemo-finetuning-gke/nemo2507/8gpus-bf16-gbs32/recipe}/README.md (97%) rename training/g4/{llama3-1-70b/nemo-finetuning-gke/8gpu-bf16 => llama3_70b/nemo-finetuning-gke/nemo2507/8gpus-bf16-gbs32/recipe}/launcher.sh (100%) rename training/g4/{llama3-1-70b/nemo-finetuning-gke/8gpu-bf16 => llama3_70b/nemo-finetuning-gke/nemo2507/8gpus-bf16-gbs32/recipe}/llama3-1-70b-fine-tuning.py (100%) rename training/g4/{llama3-1-70b/nemo-finetuning-gke/8gpu-bf16 => 
llama3_70b/nemo-finetuning-gke/nemo2507/8gpus-bf16-gbs32/recipe}/templates/workload-config-configmap.yaml (100%) rename training/g4/{llama3-1-70b/nemo-finetuning-gke/8gpu-bf16 => llama3_70b/nemo-finetuning-gke/nemo2507/8gpus-bf16-gbs32/recipe}/templates/workload-job.yaml (100%) rename training/g4/{llama3-1-70b/nemo-finetuning-gke/8gpu-bf16 => llama3_70b/nemo-finetuning-gke/nemo2507/8gpus-bf16-gbs32/recipe}/templates/workload-launcher-configmap.yaml (100%) rename training/g4/{llama3-1-70b/nemo-finetuning-gke/8gpu-bf16 => llama3_70b/nemo-finetuning-gke/nemo2507/8gpus-bf16-gbs32/recipe}/templates/workload-svc.yaml (100%) rename training/g4/{llama3-1-70b/nemo-finetuning-gke/8gpu-bf16 => llama3_70b/nemo-finetuning-gke/nemo2507/8gpus-bf16-gbs32/recipe}/values.yaml (100%) diff --git a/training/a3mega/gpt3-175b/nemo-pretraining-gke/README.md b/training/a3mega/gpt3_175b/nemo-gke/nemo2507/recipe/README.md similarity index 99% rename from training/a3mega/gpt3-175b/nemo-pretraining-gke/README.md rename to training/a3mega/gpt3_175b/nemo-gke/nemo2507/recipe/README.md index 42e47e14..f9b76ff8 100644 --- a/training/a3mega/gpt3-175b/nemo-pretraining-gke/README.md +++ b/training/a3mega/gpt3_175b/nemo-gke/nemo2507/recipe/README.md @@ -115,7 +115,7 @@ recipe folder. 
git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a3mega/gpt3-175b/nemo-pretraining-gke +export RECIPE_ROOT=$REPO_ROOT/training/a3mega/gpt3_175b/nemo-gke/nemo2507/recipe ``` ### Get cluster credentials diff --git a/training/a3mega/gpt3-175b/nemo-pretraining-gke/values.yaml b/training/a3mega/gpt3_175b/nemo-gke/nemo2507/recipe/values.yaml similarity index 100% rename from training/a3mega/gpt3-175b/nemo-pretraining-gke/values.yaml rename to training/a3mega/gpt3_175b/nemo-gke/nemo2507/recipe/values.yaml diff --git a/training/a3mega/llama3-1-70b/nemo-pretraining-gke-gcs/README.md b/training/a3mega/llama3_70b/nemo-gke-gcs/nemo2507/recipe/README.md similarity index 98% rename from training/a3mega/llama3-1-70b/nemo-pretraining-gke-gcs/README.md rename to training/a3mega/llama3_70b/nemo-gke-gcs/nemo2507/recipe/README.md index eb3bb45a..928f5bc8 100644 --- a/training/a3mega/llama3-1-70b/nemo-pretraining-gke-gcs/README.md +++ b/training/a3mega/llama3_70b/nemo-gke-gcs/nemo2507/recipe/README.md @@ -111,7 +111,7 @@ From your client, clone the `gpu-recipes` repository and set a reference to the git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a3mega/llama3-1-70b/nemo-pretraining-gke-gcs +export RECIPE_ROOT=$REPO_ROOT/training/a3mega/llama3_70b/nemo-gke-gcs/nemo2507/recipe ``` ### Get cluster credentials diff --git a/training/a3mega/llama3-1-70b/nemo-pretraining-gke-gcs/values.yaml b/training/a3mega/llama3_70b/nemo-gke-gcs/nemo2507/recipe/values.yaml similarity index 100% rename from training/a3mega/llama3-1-70b/nemo-pretraining-gke-gcs/values.yaml rename to training/a3mega/llama3_70b/nemo-gke-gcs/nemo2507/recipe/values.yaml diff --git a/training/a3mega/llama3-1-70b/nemo-pretraining-gke-resiliency/README.md 
b/training/a3mega/llama3_70b/nemo-gke-resiliency/nemo2507/recipe/README.md similarity index 99% rename from training/a3mega/llama3-1-70b/nemo-pretraining-gke-resiliency/README.md rename to training/a3mega/llama3_70b/nemo-gke-resiliency/nemo2507/recipe/README.md index 8cc07aef..2a6291c6 100644 --- a/training/a3mega/llama3-1-70b/nemo-pretraining-gke-resiliency/README.md +++ b/training/a3mega/llama3_70b/nemo-gke-resiliency/nemo2507/recipe/README.md @@ -147,7 +147,7 @@ From your client, clone the `gpu-recipes` repository and set a reference to the git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a3mega/llama3-1-70b/nemo-pretraining-gke-resiliency +export RECIPE_ROOT=$REPO_ROOT/training/a3mega/llama3_70b/nemo-gke-resiliency/nemo2507/recipe ``` ### Get cluster credentials diff --git a/training/a3mega/llama3-1-70b/nemo-pretraining-gke-resiliency/goodput-guide.md b/training/a3mega/llama3_70b/nemo-gke-resiliency/nemo2507/recipe/goodput-guide.md similarity index 98% rename from training/a3mega/llama3-1-70b/nemo-pretraining-gke-resiliency/goodput-guide.md rename to training/a3mega/llama3_70b/nemo-gke-resiliency/nemo2507/recipe/goodput-guide.md index bf61708b..823d2d9d 100644 --- a/training/a3mega/llama3-1-70b/nemo-pretraining-gke-resiliency/goodput-guide.md +++ b/training/a3mega/llama3_70b/nemo-gke-resiliency/nemo2507/recipe/goodput-guide.md @@ -26,7 +26,7 @@ Achieving high GoodPut can be challenging due to several factors common in large | **Stragglers and Performance Bottlenecks** | Slower nodes delay the entire job, underutilizing resources. | 3-7% | | **Lack of Rapid Failure Detection and Diagnosis** | Longer detection/diagnosis time increases downtime. | 2-5% | -This guide provides a general overview of techniques and tools to address these common challenges and maximize ML GoodPut. 
While the principles discussed are broadly applicable, we will use the [Llama 3.1 70B pretraining recipe](https://github.com/AI-Hypercomputer/gpu-recipes/tree/main/training/a3mega/llama3-1-70b/nemo-pretraining-gke-resiliency) as a concrete case study to illustrate how these components can be implemented and customized for large-scale training workloads on Google Cloud. The goal is to showcase a "DIY" style product, where users can understand and selectively adopt these "Lego blocks" to build resilient and efficient training pipelines. +This guide provides a general overview of techniques and tools to address these common challenges and maximize ML GoodPut. While the principles discussed are broadly applicable, we will use the [Llama 3.1 70B pretraining recipe](https://github.com/AI-Hypercomputer/gpu-recipes/tree/main/training/a3mega/llama3_70b/nemo-gke-resiliency/nemo2507/recipe) as a concrete case study to illustrate how these components can be implemented and customized for large-scale training workloads on Google Cloud. The goal is to showcase a "DIY" style product, where users can understand and selectively adopt these "Lego blocks" to build resilient and efficient training pipelines. 
## TLDR: Recommended Lego Blocks for Your Deployment For customers looking to improve GoodPut on their own ML training workloads, here’s a concise guide to the key strategies discussed in this document, presented as 'Lego blocks' you can implement: diff --git a/training/a3mega/llama3-1-70b/nemo-pretraining-gke-resiliency/ksa-setup.yaml b/training/a3mega/llama3_70b/nemo-gke-resiliency/nemo2507/recipe/ksa-setup.yaml similarity index 100% rename from training/a3mega/llama3-1-70b/nemo-pretraining-gke-resiliency/ksa-setup.yaml rename to training/a3mega/llama3_70b/nemo-gke-resiliency/nemo2507/recipe/ksa-setup.yaml diff --git a/training/a3mega/llama3-1-70b/nemo-pretraining-gke-resiliency/kueue-merge-patch.yaml b/training/a3mega/llama3_70b/nemo-gke-resiliency/nemo2507/recipe/kueue-merge-patch.yaml similarity index 100% rename from training/a3mega/llama3-1-70b/nemo-pretraining-gke-resiliency/kueue-merge-patch.yaml rename to training/a3mega/llama3_70b/nemo-gke-resiliency/nemo2507/recipe/kueue-merge-patch.yaml diff --git a/training/a3mega/llama3-1-70b/nemo-pretraining-gke-resiliency/train.py b/training/a3mega/llama3_70b/nemo-gke-resiliency/nemo2507/recipe/train.py similarity index 100% rename from training/a3mega/llama3-1-70b/nemo-pretraining-gke-resiliency/train.py rename to training/a3mega/llama3_70b/nemo-gke-resiliency/nemo2507/recipe/train.py diff --git a/training/a3mega/llama3-1-70b/nemo-pretraining-gke-resiliency/values-gcs.yaml b/training/a3mega/llama3_70b/nemo-gke-resiliency/nemo2507/recipe/values-gcs.yaml similarity index 100% rename from training/a3mega/llama3-1-70b/nemo-pretraining-gke-resiliency/values-gcs.yaml rename to training/a3mega/llama3_70b/nemo-gke-resiliency/nemo2507/recipe/values-gcs.yaml diff --git a/training/a3mega/llama3-1-70b/nemo-pretraining-gke-resiliency/values-supervisor.yaml b/training/a3mega/llama3_70b/nemo-gke-resiliency/nemo2507/recipe/values-supervisor.yaml similarity index 100% rename from 
training/a3mega/llama3-1-70b/nemo-pretraining-gke-resiliency/values-supervisor.yaml rename to training/a3mega/llama3_70b/nemo-gke-resiliency/nemo2507/recipe/values-supervisor.yaml diff --git a/training/a3mega/llama3-1-70b/nemo-pretraining-gke-resiliency/values.yaml b/training/a3mega/llama3_70b/nemo-gke-resiliency/nemo2507/recipe/values.yaml similarity index 100% rename from training/a3mega/llama3-1-70b/nemo-pretraining-gke-resiliency/values.yaml rename to training/a3mega/llama3_70b/nemo-gke-resiliency/nemo2507/recipe/values.yaml diff --git a/training/a3mega/llama3-70b/nemo-pretraining-gke/README.md b/training/a3mega/llama3_70b/nemo-gke/nemo2507/128gpus-bf16/recipe/README.md similarity index 98% rename from training/a3mega/llama3-70b/nemo-pretraining-gke/README.md rename to training/a3mega/llama3_70b/nemo-gke/nemo2507/128gpus-bf16/recipe/README.md index 29decc2c..2d1a8e0c 100644 --- a/training/a3mega/llama3-70b/nemo-pretraining-gke/README.md +++ b/training/a3mega/llama3_70b/nemo-gke/nemo2507/128gpus-bf16/recipe/README.md @@ -113,7 +113,7 @@ recipe folder. 
git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a3mega/llama3-70b/nemo-pretraining-gke +export RECIPE_ROOT=$REPO_ROOT/training/a3mega/llama3_70b/nemo-gke/nemo2507/128gpus-bf16/recipe ``` ### Get cluster credentials diff --git a/training/a3mega/llama3-70b/nemo-pretraining-gke/values.yaml b/training/a3mega/llama3_70b/nemo-gke/nemo2507/128gpus-bf16/recipe/values.yaml similarity index 100% rename from training/a3mega/llama3-70b/nemo-pretraining-gke/values.yaml rename to training/a3mega/llama3_70b/nemo-gke/nemo2507/128gpus-bf16/recipe/values.yaml diff --git a/training/a3mega/llama3-1-70b/nemo-pretraining-gke/README.md b/training/a3mega/llama3_70b/nemo-gke/nemo2507/256gpus-bf16/recipe/README.md similarity index 98% rename from training/a3mega/llama3-1-70b/nemo-pretraining-gke/README.md rename to training/a3mega/llama3_70b/nemo-gke/nemo2507/256gpus-bf16/recipe/README.md index a89f4d86..5a190d99 100644 --- a/training/a3mega/llama3-1-70b/nemo-pretraining-gke/README.md +++ b/training/a3mega/llama3_70b/nemo-gke/nemo2507/256gpus-bf16/recipe/README.md @@ -109,7 +109,7 @@ recipe folder. 
git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a3mega/llama3-1-70b/nemo-pretraining-gke +export RECIPE_ROOT=$REPO_ROOT/training/a3mega/llama3_70b/nemo-gke/nemo2507/256gpus-bf16/recipe ``` ### Get cluster credentials diff --git a/training/a3mega/llama3-1-70b/nemo-pretraining-gke/values.yaml b/training/a3mega/llama3_70b/nemo-gke/nemo2507/256gpus-bf16/recipe/values.yaml similarity index 100% rename from training/a3mega/llama3-1-70b/nemo-pretraining-gke/values.yaml rename to training/a3mega/llama3_70b/nemo-gke/nemo2507/256gpus-bf16/recipe/values.yaml diff --git a/training/a3mega/mixtral-8x7b/nemo-pretraining-gke/README.md b/training/a3mega/mixtral_8x7b/nemo-gke/nemo2507/recipe/README.md similarity index 98% rename from training/a3mega/mixtral-8x7b/nemo-pretraining-gke/README.md rename to training/a3mega/mixtral_8x7b/nemo-gke/nemo2507/recipe/README.md index 3746410f..c466c7fb 100644 --- a/training/a3mega/mixtral-8x7b/nemo-pretraining-gke/README.md +++ b/training/a3mega/mixtral_8x7b/nemo-gke/nemo2507/recipe/README.md @@ -106,7 +106,7 @@ recipe folder. 
git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a3mega/mixtral-8x7b/nemo-pretraining-gke +export RECIPE_ROOT=$REPO_ROOT/training/a3mega/mixtral_8x7b/nemo-gke/nemo2507/recipe ``` ### Get cluster credentials diff --git a/training/a3mega/mixtral-8x7b/nemo-pretraining-gke/values.yaml b/training/a3mega/mixtral_8x7b/nemo-gke/nemo2507/recipe/values.yaml similarity index 100% rename from training/a3mega/mixtral-8x7b/nemo-pretraining-gke/values.yaml rename to training/a3mega/mixtral_8x7b/nemo-gke/nemo2507/recipe/values.yaml diff --git a/training/a3ultra/gpt_oss_120b/nemo-pretraining-gke/nemo2602/64gpus_bf16_1280gbs/recipe/Chart.yaml b/training/a3ultra/gpt_oss_120b/nemo-gke/nemo2602/64gpus-bf16-gbs1280/recipe/Chart.yaml similarity index 100% rename from training/a3ultra/gpt_oss_120b/nemo-pretraining-gke/nemo2602/64gpus_bf16_1280gbs/recipe/Chart.yaml rename to training/a3ultra/gpt_oss_120b/nemo-gke/nemo2602/64gpus-bf16-gbs1280/recipe/Chart.yaml diff --git a/training/a3ultra/gpt_oss_120b/nemo-pretraining-gke/nemo2602/64gpus_bf16_1280gbs/recipe/README.md b/training/a3ultra/gpt_oss_120b/nemo-gke/nemo2602/64gpus-bf16-gbs1280/recipe/README.md similarity index 97% rename from training/a3ultra/gpt_oss_120b/nemo-pretraining-gke/nemo2602/64gpus_bf16_1280gbs/recipe/README.md rename to training/a3ultra/gpt_oss_120b/nemo-gke/nemo2602/64gpus-bf16-gbs1280/recipe/README.md index f9ab62b3..1f9c2d61 100644 --- a/training/a3ultra/gpt_oss_120b/nemo-pretraining-gke/nemo2602/64gpus_bf16_1280gbs/recipe/README.md +++ b/training/a3ultra/gpt_oss_120b/nemo-gke/nemo2602/64gpus-bf16-gbs1280/recipe/README.md @@ -73,7 +73,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. 
git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a3ultra/gpt-oss-120b/megatron-bridge-pretraining-gke/8node-BF16-GBSunknown/recipe +export RECIPE_ROOT=$REPO_ROOT/training/a3ultra/gpt_oss_120b/nemo-gke/nemo2602/64gpus-bf16-gbs1280/recipe cd $RECIPE_ROOT ``` diff --git a/training/a3ultra/gpt_oss_120b/nemo-pretraining-gke/nemo2602/64gpus_bf16_1280gbs/recipe/custom_setup_experiment.py b/training/a3ultra/gpt_oss_120b/nemo-gke/nemo2602/64gpus-bf16-gbs1280/recipe/custom_setup_experiment.py similarity index 100% rename from training/a3ultra/gpt_oss_120b/nemo-pretraining-gke/nemo2602/64gpus_bf16_1280gbs/recipe/custom_setup_experiment.py rename to training/a3ultra/gpt_oss_120b/nemo-gke/nemo2602/64gpus-bf16-gbs1280/recipe/custom_setup_experiment.py diff --git a/training/a3ultra/gpt_oss_120b/nemo-pretraining-gke/nemo2602/64gpus_bf16_1280gbs/recipe/launcher.sh b/training/a3ultra/gpt_oss_120b/nemo-gke/nemo2602/64gpus-bf16-gbs1280/recipe/launcher.sh similarity index 100% rename from training/a3ultra/gpt_oss_120b/nemo-pretraining-gke/nemo2602/64gpus_bf16_1280gbs/recipe/launcher.sh rename to training/a3ultra/gpt_oss_120b/nemo-gke/nemo2602/64gpus-bf16-gbs1280/recipe/launcher.sh diff --git a/training/a3ultra/gpt_oss_120b/nemo-pretraining-gke/nemo2602/64gpus_bf16_1280gbs/recipe/templates/workload-config-configmap.yaml b/training/a3ultra/gpt_oss_120b/nemo-gke/nemo2602/64gpus-bf16-gbs1280/recipe/templates/workload-config-configmap.yaml similarity index 100% rename from training/a3ultra/gpt_oss_120b/nemo-pretraining-gke/nemo2602/64gpus_bf16_1280gbs/recipe/templates/workload-config-configmap.yaml rename to training/a3ultra/gpt_oss_120b/nemo-gke/nemo2602/64gpus-bf16-gbs1280/recipe/templates/workload-config-configmap.yaml diff --git a/training/a3ultra/gpt_oss_120b/nemo-pretraining-gke/nemo2602/64gpus_bf16_1280gbs/recipe/templates/workload-job.yaml 
b/training/a3ultra/gpt_oss_120b/nemo-gke/nemo2602/64gpus-bf16-gbs1280/recipe/templates/workload-job.yaml similarity index 100% rename from training/a3ultra/gpt_oss_120b/nemo-pretraining-gke/nemo2602/64gpus_bf16_1280gbs/recipe/templates/workload-job.yaml rename to training/a3ultra/gpt_oss_120b/nemo-gke/nemo2602/64gpus-bf16-gbs1280/recipe/templates/workload-job.yaml diff --git a/training/a3ultra/gpt_oss_120b/nemo-pretraining-gke/nemo2602/64gpus_bf16_1280gbs/recipe/templates/workload-launcher-configmap.yaml b/training/a3ultra/gpt_oss_120b/nemo-gke/nemo2602/64gpus-bf16-gbs1280/recipe/templates/workload-launcher-configmap.yaml similarity index 100% rename from training/a3ultra/gpt_oss_120b/nemo-pretraining-gke/nemo2602/64gpus_bf16_1280gbs/recipe/templates/workload-launcher-configmap.yaml rename to training/a3ultra/gpt_oss_120b/nemo-gke/nemo2602/64gpus-bf16-gbs1280/recipe/templates/workload-launcher-configmap.yaml diff --git a/training/a3ultra/gpt_oss_120b/nemo-pretraining-gke/nemo2602/64gpus_bf16_1280gbs/recipe/templates/workload-svc.yaml b/training/a3ultra/gpt_oss_120b/nemo-gke/nemo2602/64gpus-bf16-gbs1280/recipe/templates/workload-svc.yaml similarity index 100% rename from training/a3ultra/gpt_oss_120b/nemo-pretraining-gke/nemo2602/64gpus_bf16_1280gbs/recipe/templates/workload-svc.yaml rename to training/a3ultra/gpt_oss_120b/nemo-gke/nemo2602/64gpus-bf16-gbs1280/recipe/templates/workload-svc.yaml diff --git a/training/a3ultra/gpt_oss_120b/nemo-pretraining-gke/nemo2602/64gpus_bf16_1280gbs/recipe/values.yaml b/training/a3ultra/gpt_oss_120b/nemo-gke/nemo2602/64gpus-bf16-gbs1280/recipe/values.yaml similarity index 100% rename from training/a3ultra/gpt_oss_120b/nemo-pretraining-gke/nemo2602/64gpus_bf16_1280gbs/recipe/values.yaml rename to training/a3ultra/gpt_oss_120b/nemo-gke/nemo2602/64gpus-bf16-gbs1280/recipe/values.yaml diff --git a/training/a3ultra/llama3-1-405b/nemo-pretraining-gke-resiliency/README.md 
b/training/a3ultra/llama31_405b/nemo-gke-resiliency/nemo2412/recipe/README.md similarity index 99% rename from training/a3ultra/llama3-1-405b/nemo-pretraining-gke-resiliency/README.md rename to training/a3ultra/llama31_405b/nemo-gke-resiliency/nemo2412/recipe/README.md index 564b3f7e..a38f53f0 100644 --- a/training/a3ultra/llama3-1-405b/nemo-pretraining-gke-resiliency/README.md +++ b/training/a3ultra/llama31_405b/nemo-gke-resiliency/nemo2412/recipe/README.md @@ -159,7 +159,7 @@ From your client, clone the `gpu-recipes` repository and set a reference to the git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a3ultra/llama3-1-405b/nemo-pretraining-gke-resiliency +export RECIPE_ROOT=$REPO_ROOT/training/a3ultra/llama31_405b/nemo-gke-resiliency/nemo2412/recipe ``` ### Get cluster credentials diff --git a/training/a3ultra/llama3-1-405b/nemo-pretraining-gke-resiliency/goodput-guide.md b/training/a3ultra/llama31_405b/nemo-gke-resiliency/nemo2412/recipe/goodput-guide.md similarity index 98% rename from training/a3ultra/llama3-1-405b/nemo-pretraining-gke-resiliency/goodput-guide.md rename to training/a3ultra/llama31_405b/nemo-gke-resiliency/nemo2412/recipe/goodput-guide.md index 6e4a06a6..4e8c776f 100644 --- a/training/a3ultra/llama3-1-405b/nemo-pretraining-gke-resiliency/goodput-guide.md +++ b/training/a3ultra/llama31_405b/nemo-gke-resiliency/nemo2412/recipe/goodput-guide.md @@ -26,7 +26,7 @@ Achieving high GoodPut can be challenging due to several factors common in large | **Stragglers and Performance Bottlenecks** | Slower nodes delay the entire job, underutilizing resources. | 3-7% | | **Lack of Rapid Failure Detection and Diagnosis** | Longer detection/diagnosis time increases downtime. | 2-5% | -This guide provides a general overview of techniques and tools to address these common challenges and maximize ML GoodPut. 
While the principles discussed are broadly applicable, we will use the [Llama 3.1 405B pretraining recipe](https://github.com/AI-Hypercomputer/gpu-recipes/tree/main/training/a3ultra/llama3-1-405b/nemo-pretraining-gke-resiliency) as a concrete case study to illustrate how these components can be implemented and customized for large-scale training workloads on Google Cloud. The goal is to showcase a "DIY" style product, where users can understand and selectively adopt these "Lego blocks" to build resilient and efficient training pipelines. +This guide provides a general overview of techniques and tools to address these common challenges and maximize ML GoodPut. While the principles discussed are broadly applicable, we will use the [Llama 3.1 405B pretraining recipe](https://github.com/AI-Hypercomputer/gpu-recipes/tree/main/training/a3ultra/llama31_405b/nemo-gke-resiliency/nemo2412/recipe) as a concrete case study to illustrate how these components can be implemented and customized for large-scale training workloads on Google Cloud. The goal is to showcase a "DIY" style product, where users can understand and selectively adopt these "Lego blocks" to build resilient and efficient training pipelines. 
## TLDR: Recommended Lego Blocks for Your Deployment For customers looking to improve GoodPut on their own ML training workloads, here’s a concise guide to the key strategies discussed in this document, presented as 'Lego blocks' you can implement: diff --git a/training/a3ultra/llama3-1-405b/nemo-pretraining-gke-resiliency/ksa-setup.yaml b/training/a3ultra/llama31_405b/nemo-gke-resiliency/nemo2412/recipe/ksa-setup.yaml similarity index 100% rename from training/a3ultra/llama3-1-405b/nemo-pretraining-gke-resiliency/ksa-setup.yaml rename to training/a3ultra/llama31_405b/nemo-gke-resiliency/nemo2412/recipe/ksa-setup.yaml diff --git a/training/a3ultra/llama3-1-405b/nemo-pretraining-gke-resiliency/kueue-merge-patch.yaml b/training/a3ultra/llama31_405b/nemo-gke-resiliency/nemo2412/recipe/kueue-merge-patch.yaml similarity index 100% rename from training/a3ultra/llama3-1-405b/nemo-pretraining-gke-resiliency/kueue-merge-patch.yaml rename to training/a3ultra/llama31_405b/nemo-gke-resiliency/nemo2412/recipe/kueue-merge-patch.yaml diff --git a/training/a3ultra/llama3-1-405b/nemo-pretraining-gke-resiliency/train.py b/training/a3ultra/llama31_405b/nemo-gke-resiliency/nemo2412/recipe/train.py similarity index 100% rename from training/a3ultra/llama3-1-405b/nemo-pretraining-gke-resiliency/train.py rename to training/a3ultra/llama31_405b/nemo-gke-resiliency/nemo2412/recipe/train.py diff --git a/training/a3ultra/llama3-1-405b/nemo-pretraining-gke-resiliency/values-gcs.yaml b/training/a3ultra/llama31_405b/nemo-gke-resiliency/nemo2412/recipe/values-gcs.yaml similarity index 100% rename from training/a3ultra/llama3-1-405b/nemo-pretraining-gke-resiliency/values-gcs.yaml rename to training/a3ultra/llama31_405b/nemo-gke-resiliency/nemo2412/recipe/values-gcs.yaml diff --git a/training/a3ultra/llama3-1-405b/nemo-pretraining-gke-resiliency/values-supervisor.yaml b/training/a3ultra/llama31_405b/nemo-gke-resiliency/nemo2412/recipe/values-supervisor.yaml similarity index 100% rename from 
training/a3ultra/llama3-1-405b/nemo-pretraining-gke-resiliency/values-supervisor.yaml rename to training/a3ultra/llama31_405b/nemo-gke-resiliency/nemo2412/recipe/values-supervisor.yaml diff --git a/training/a3ultra/llama3-1-405b/nemo-pretraining-gke-resiliency/values.yaml b/training/a3ultra/llama31_405b/nemo-gke-resiliency/nemo2412/recipe/values.yaml similarity index 100% rename from training/a3ultra/llama3-1-405b/nemo-pretraining-gke-resiliency/values.yaml rename to training/a3ultra/llama31_405b/nemo-gke-resiliency/nemo2412/recipe/values.yaml diff --git a/training/a3ultra/llama3-1-405b/nemo-pretraining-gke/README.md b/training/a3ultra/llama31_405b/nemo-gke/nemo2412/recipe/README.md similarity index 99% rename from training/a3ultra/llama3-1-405b/nemo-pretraining-gke/README.md rename to training/a3ultra/llama31_405b/nemo-gke/nemo2412/recipe/README.md index 7a81d1c5..c7b9ea87 100644 --- a/training/a3ultra/llama3-1-405b/nemo-pretraining-gke/README.md +++ b/training/a3ultra/llama31_405b/nemo-gke/nemo2412/recipe/README.md @@ -83,7 +83,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. 
git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a3ultra/llama3-1-405b/nemo-pretraining-gke +export RECIPE_ROOT=$REPO_ROOT/training/a3ultra/llama31_405b/nemo-gke/nemo2412/recipe cd $RECIPE_ROOT ``` diff --git a/training/a3ultra/llama3-1-405b/nemo-pretraining-gke/values.yaml b/training/a3ultra/llama31_405b/nemo-gke/nemo2412/recipe/values.yaml similarity index 100% rename from training/a3ultra/llama3-1-405b/nemo-pretraining-gke/values.yaml rename to training/a3ultra/llama31_405b/nemo-gke/nemo2412/recipe/values.yaml diff --git a/training/a3ultra/llama3-1-70b/nemo-pretraining-gke/README.md b/training/a3ultra/llama3_70b/nemo-gke/nemo2407/recipe/README.md similarity index 99% rename from training/a3ultra/llama3-1-70b/nemo-pretraining-gke/README.md rename to training/a3ultra/llama3_70b/nemo-gke/nemo2407/recipe/README.md index c5e5ec0d..48756827 100644 --- a/training/a3ultra/llama3-1-70b/nemo-pretraining-gke/README.md +++ b/training/a3ultra/llama3_70b/nemo-gke/nemo2407/recipe/README.md @@ -82,7 +82,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. 
git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a3ultra/llama3-1-70b/nemo-pretraining-gke +export RECIPE_ROOT=$REPO_ROOT/training/a3ultra/llama3_70b/nemo-gke/nemo2407/recipe cd $RECIPE_ROOT ``` diff --git a/training/a3ultra/llama3-1-70b/nemo-pretraining-gke/values.yaml b/training/a3ultra/llama3_70b/nemo-gke/nemo2407/recipe/values.yaml similarity index 100% rename from training/a3ultra/llama3-1-70b/nemo-pretraining-gke/values.yaml rename to training/a3ultra/llama3_70b/nemo-gke/nemo2407/recipe/values.yaml diff --git a/training/a3ultra/mixtral-8x7b/nemo-pretraining-gke-resiliency/README.md b/training/a3ultra/mixtral_8x7b/nemo-gke-resiliency/nemo2407/recipe/README.md similarity index 99% rename from training/a3ultra/mixtral-8x7b/nemo-pretraining-gke-resiliency/README.md rename to training/a3ultra/mixtral_8x7b/nemo-gke-resiliency/nemo2407/recipe/README.md index 81f3da12..8a5b2fa5 100644 --- a/training/a3ultra/mixtral-8x7b/nemo-pretraining-gke-resiliency/README.md +++ b/training/a3ultra/mixtral_8x7b/nemo-gke-resiliency/nemo2407/recipe/README.md @@ -157,7 +157,7 @@ From your client, clone the `gpu-recipes` repository and set a reference to the git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a3ultra/mixtral-8x7b/nemo-pretraining-gke-resiliency +export RECIPE_ROOT=$REPO_ROOT/training/a3ultra/mixtral_8x7b/nemo-gke-resiliency/nemo2407/recipe ``` ### Get cluster credentials diff --git a/training/a3ultra/mixtral-8x7b/nemo-pretraining-gke-resiliency/goodput-guide.md b/training/a3ultra/mixtral_8x7b/nemo-gke-resiliency/nemo2407/recipe/goodput-guide.md similarity index 98% rename from training/a3ultra/mixtral-8x7b/nemo-pretraining-gke-resiliency/goodput-guide.md rename to 
training/a3ultra/mixtral_8x7b/nemo-gke-resiliency/nemo2407/recipe/goodput-guide.md index 518c3ad1..a8d2710f 100644 --- a/training/a3ultra/mixtral-8x7b/nemo-pretraining-gke-resiliency/goodput-guide.md +++ b/training/a3ultra/mixtral_8x7b/nemo-gke-resiliency/nemo2407/recipe/goodput-guide.md @@ -26,7 +26,7 @@ Achieving high GoodPut can be challenging due to several factors common in large | **Stragglers and Performance Bottlenecks** | Slower nodes delay the entire job, underutilizing resources. | 3-7% | | **Lack of Rapid Failure Detection and Diagnosis** | Longer detection/diagnosis time increases downtime. | 2-5% | -This guide provides a general overview of techniques and tools to address these common challenges and maximize ML GoodPut. While the principles discussed are broadly applicable, we will use the [Mixtral 8x7B pretraining recipe](https://github.com/AI-Hypercomputer/gpu-recipes/tree/main/training/a3ultra/mixtral-8x7b/nemo-pretraining-gke-resiliency) as a concrete case study to illustrate how these components can be implemented and customized for large-scale training workloads on Google Cloud. The goal is to showcase a "DIY" style product, where users can understand and selectively adopt these "Lego blocks" to build resilient and efficient training pipelines. +This guide provides a general overview of techniques and tools to address these common challenges and maximize ML GoodPut. While the principles discussed are broadly applicable, we will use the [Mixtral 8x7B pretraining recipe](https://github.com/AI-Hypercomputer/gpu-recipes/tree/main/training/a3ultra/mixtral_8x7b/nemo-gke-resiliency/nemo2407/recipe) as a concrete case study to illustrate how these components can be implemented and customized for large-scale training workloads on Google Cloud. The goal is to showcase a "DIY" style product, where users can understand and selectively adopt these "Lego blocks" to build resilient and efficient training pipelines. 
## TLDR: Recommended Lego Blocks for Your Deployment For customers looking to improve GoodPut on their own ML training workloads, here’s a concise guide to the key strategies discussed in this document, presented as 'Lego blocks' you can implement: diff --git a/training/a3ultra/mixtral-8x7b/nemo-pretraining-gke-resiliency/ksa-setup.yaml b/training/a3ultra/mixtral_8x7b/nemo-gke-resiliency/nemo2407/recipe/ksa-setup.yaml similarity index 100% rename from training/a3ultra/mixtral-8x7b/nemo-pretraining-gke-resiliency/ksa-setup.yaml rename to training/a3ultra/mixtral_8x7b/nemo-gke-resiliency/nemo2407/recipe/ksa-setup.yaml diff --git a/training/a3ultra/mixtral-8x7b/nemo-pretraining-gke-resiliency/kueue-merge-patch.yaml b/training/a3ultra/mixtral_8x7b/nemo-gke-resiliency/nemo2407/recipe/kueue-merge-patch.yaml similarity index 100% rename from training/a3ultra/mixtral-8x7b/nemo-pretraining-gke-resiliency/kueue-merge-patch.yaml rename to training/a3ultra/mixtral_8x7b/nemo-gke-resiliency/nemo2407/recipe/kueue-merge-patch.yaml diff --git a/training/a3ultra/mixtral-8x7b/nemo-pretraining-gke-resiliency/train.py b/training/a3ultra/mixtral_8x7b/nemo-gke-resiliency/nemo2407/recipe/train.py similarity index 100% rename from training/a3ultra/mixtral-8x7b/nemo-pretraining-gke-resiliency/train.py rename to training/a3ultra/mixtral_8x7b/nemo-gke-resiliency/nemo2407/recipe/train.py diff --git a/training/a3ultra/mixtral-8x7b/nemo-pretraining-gke-resiliency/values-gcs.yaml b/training/a3ultra/mixtral_8x7b/nemo-gke-resiliency/nemo2407/recipe/values-gcs.yaml similarity index 100% rename from training/a3ultra/mixtral-8x7b/nemo-pretraining-gke-resiliency/values-gcs.yaml rename to training/a3ultra/mixtral_8x7b/nemo-gke-resiliency/nemo2407/recipe/values-gcs.yaml diff --git a/training/a3ultra/mixtral-8x7b/nemo-pretraining-gke-resiliency/values-supervisor.yaml b/training/a3ultra/mixtral_8x7b/nemo-gke-resiliency/nemo2407/recipe/values-supervisor.yaml similarity index 100% rename from 
training/a3ultra/mixtral-8x7b/nemo-pretraining-gke-resiliency/values-supervisor.yaml rename to training/a3ultra/mixtral_8x7b/nemo-gke-resiliency/nemo2407/recipe/values-supervisor.yaml diff --git a/training/a3ultra/mixtral-8x7b/nemo-pretraining-gke-resiliency/values.yaml b/training/a3ultra/mixtral_8x7b/nemo-gke-resiliency/nemo2407/recipe/values.yaml similarity index 100% rename from training/a3ultra/mixtral-8x7b/nemo-pretraining-gke-resiliency/values.yaml rename to training/a3ultra/mixtral_8x7b/nemo-gke-resiliency/nemo2407/recipe/values.yaml diff --git a/training/a3ultra/mixtral-8x7b/nemo-pretraining-gke/README.md b/training/a3ultra/mixtral_8x7b/nemo-gke/nemo2407/recipe/README.md similarity index 99% rename from training/a3ultra/mixtral-8x7b/nemo-pretraining-gke/README.md rename to training/a3ultra/mixtral_8x7b/nemo-gke/nemo2407/recipe/README.md index 1187509b..c13e6c66 100644 --- a/training/a3ultra/mixtral-8x7b/nemo-pretraining-gke/README.md +++ b/training/a3ultra/mixtral_8x7b/nemo-gke/nemo2407/recipe/README.md @@ -81,7 +81,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. 
git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a3ultra/mixtral-8x7b/nemo-pretraining-gke +export RECIPE_ROOT=$REPO_ROOT/training/a3ultra/mixtral_8x7b/nemo-gke/nemo2407/recipe cd $RECIPE_ROOT ``` diff --git a/training/a3ultra/mixtral-8x7b/nemo-pretraining-gke/values.yaml b/training/a3ultra/mixtral_8x7b/nemo-gke/nemo2407/recipe/values.yaml similarity index 100% rename from training/a3ultra/mixtral-8x7b/nemo-pretraining-gke/values.yaml rename to training/a3ultra/mixtral_8x7b/nemo-gke/nemo2407/recipe/values.yaml diff --git a/training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_bf16_1024gbs/recipe/Chart.yaml b/training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-bf16-gbs1024/recipe/Chart.yaml similarity index 100% rename from training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_bf16_1024gbs/recipe/Chart.yaml rename to training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-bf16-gbs1024/recipe/Chart.yaml diff --git a/training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_bf16_1024gbs/recipe/README.md b/training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-bf16-gbs1024/recipe/README.md similarity index 97% rename from training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_bf16_1024gbs/recipe/README.md rename to training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-bf16-gbs1024/recipe/README.md index 6d883620..5630c825 100644 --- a/training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_bf16_1024gbs/recipe/README.md +++ b/training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-bf16-gbs1024/recipe/README.md @@ -73,7 +73,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. 
git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a3ultra/qwen3-30b-a3b/megatron-bridge-pretraining-gke/2node-BF16-GBSunknown/recipe +export RECIPE_ROOT=$REPO_ROOT/training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-bf16-gbs1024/recipe cd $RECIPE_ROOT ``` diff --git a/training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_bf16_1024gbs/recipe/custom_setup_experiment.py b/training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-bf16-gbs1024/recipe/custom_setup_experiment.py similarity index 100% rename from training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_bf16_1024gbs/recipe/custom_setup_experiment.py rename to training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-bf16-gbs1024/recipe/custom_setup_experiment.py diff --git a/training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_bf16_1024gbs/recipe/launcher.sh b/training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-bf16-gbs1024/recipe/launcher.sh similarity index 100% rename from training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_bf16_1024gbs/recipe/launcher.sh rename to training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-bf16-gbs1024/recipe/launcher.sh diff --git a/training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_bf16_1024gbs/recipe/templates/workload-config-configmap.yaml b/training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-bf16-gbs1024/recipe/templates/workload-config-configmap.yaml similarity index 100% rename from training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_bf16_1024gbs/recipe/templates/workload-config-configmap.yaml rename to training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-bf16-gbs1024/recipe/templates/workload-config-configmap.yaml diff --git a/training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_bf16_1024gbs/recipe/templates/workload-job.yaml 
b/training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-bf16-gbs1024/recipe/templates/workload-job.yaml similarity index 100% rename from training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_bf16_1024gbs/recipe/templates/workload-job.yaml rename to training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-bf16-gbs1024/recipe/templates/workload-job.yaml diff --git a/training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_bf16_1024gbs/recipe/templates/workload-launcher-configmap.yaml b/training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-bf16-gbs1024/recipe/templates/workload-launcher-configmap.yaml similarity index 100% rename from training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_bf16_1024gbs/recipe/templates/workload-launcher-configmap.yaml rename to training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-bf16-gbs1024/recipe/templates/workload-launcher-configmap.yaml diff --git a/training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_bf16_1024gbs/recipe/templates/workload-svc.yaml b/training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-bf16-gbs1024/recipe/templates/workload-svc.yaml similarity index 100% rename from training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_bf16_1024gbs/recipe/templates/workload-svc.yaml rename to training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-bf16-gbs1024/recipe/templates/workload-svc.yaml diff --git a/training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_bf16_1024gbs/recipe/values.yaml b/training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-bf16-gbs1024/recipe/values.yaml similarity index 100% rename from training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_bf16_1024gbs/recipe/values.yaml rename to training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-bf16-gbs1024/recipe/values.yaml diff --git a/training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_fp8cs_1024gbs/recipe/Chart.yaml 
b/training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-fp8cs-gbs1024/recipe/Chart.yaml similarity index 100% rename from training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_fp8cs_1024gbs/recipe/Chart.yaml rename to training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-fp8cs-gbs1024/recipe/Chart.yaml diff --git a/training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_fp8cs_1024gbs/recipe/README.md b/training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-fp8cs-gbs1024/recipe/README.md similarity index 97% rename from training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_fp8cs_1024gbs/recipe/README.md rename to training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-fp8cs-gbs1024/recipe/README.md index 7a6ffe86..55d75393 100644 --- a/training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_fp8cs_1024gbs/recipe/README.md +++ b/training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-fp8cs-gbs1024/recipe/README.md @@ -73,7 +73,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. 
git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a3ultra/qwen3-30b-a3b/megatron-bridge-pretraining-gke/2node-FP8CS-GBSunknown/recipe +export RECIPE_ROOT=$REPO_ROOT/training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-fp8cs-gbs1024/recipe cd $RECIPE_ROOT ``` diff --git a/training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_fp8cs_1024gbs/recipe/custom_setup_experiment.py b/training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-fp8cs-gbs1024/recipe/custom_setup_experiment.py similarity index 100% rename from training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_fp8cs_1024gbs/recipe/custom_setup_experiment.py rename to training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-fp8cs-gbs1024/recipe/custom_setup_experiment.py diff --git a/training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_fp8cs_1024gbs/recipe/launcher.sh b/training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-fp8cs-gbs1024/recipe/launcher.sh similarity index 100% rename from training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_fp8cs_1024gbs/recipe/launcher.sh rename to training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-fp8cs-gbs1024/recipe/launcher.sh diff --git a/training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_fp8cs_1024gbs/recipe/templates/workload-config-configmap.yaml b/training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-fp8cs-gbs1024/recipe/templates/workload-config-configmap.yaml similarity index 100% rename from training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_fp8cs_1024gbs/recipe/templates/workload-config-configmap.yaml rename to training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-fp8cs-gbs1024/recipe/templates/workload-config-configmap.yaml diff --git 
a/training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_fp8cs_1024gbs/recipe/templates/workload-job.yaml b/training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-fp8cs-gbs1024/recipe/templates/workload-job.yaml similarity index 100% rename from training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_fp8cs_1024gbs/recipe/templates/workload-job.yaml rename to training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-fp8cs-gbs1024/recipe/templates/workload-job.yaml diff --git a/training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_fp8cs_1024gbs/recipe/templates/workload-launcher-configmap.yaml b/training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-fp8cs-gbs1024/recipe/templates/workload-launcher-configmap.yaml similarity index 100% rename from training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_fp8cs_1024gbs/recipe/templates/workload-launcher-configmap.yaml rename to training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-fp8cs-gbs1024/recipe/templates/workload-launcher-configmap.yaml diff --git a/training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_fp8cs_1024gbs/recipe/templates/workload-svc.yaml b/training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-fp8cs-gbs1024/recipe/templates/workload-svc.yaml similarity index 100% rename from training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_fp8cs_1024gbs/recipe/templates/workload-svc.yaml rename to training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-fp8cs-gbs1024/recipe/templates/workload-svc.yaml diff --git a/training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_fp8cs_1024gbs/recipe/values.yaml b/training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-fp8cs-gbs1024/recipe/values.yaml similarity index 100% rename from training/a3ultra/qwen3_30b_a3b/nemo-pretraining-gke/nemo2602/16gpus_fp8cs_1024gbs/recipe/values.yaml rename to training/a3ultra/qwen3_30b_a3b/nemo-gke/nemo2602/16gpus-fp8cs-gbs1024/recipe/values.yaml 
diff --git a/training/a4/llama3-8b/megatron-bridge-gke/nemo2602/8gpus-fp8cs-seq8192-gbs128/recipe/README.md b/training/a4/llama3-8b/megatron-bridge-gke/nemo2602/8gpus-fp8cs-seq8192-gbs128/recipe/README.md index da0562c4..26954e2e 100644 --- a/training/a4/llama3-8b/megatron-bridge-gke/nemo2602/8gpus-fp8cs-seq8192-gbs128/recipe/README.md +++ b/training/a4/llama3-8b/megatron-bridge-gke/nemo2602/8gpus-fp8cs-seq8192-gbs128/recipe/README.md @@ -73,7 +73,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a4/llama3-8b/megatron-bridge-pretraining-gke/1node-FP8CS-GBSunknown/recipe +export RECIPE_ROOT=$REPO_ROOT/training/a4/llama3-8b/megatron-bridge-gke/nemo2602/8gpus-fp8cs-seq8192-gbs128/recipe cd $RECIPE_ROOT ``` diff --git a/training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/Chart.yaml b/training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/recipe/Chart.yaml similarity index 100% rename from training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/Chart.yaml rename to training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/recipe/Chart.yaml diff --git a/training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/README.md b/training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/recipe/README.md similarity index 100% rename from training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/README.md rename to training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/recipe/README.md diff --git a/training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/launcher.sh b/training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/recipe/launcher.sh similarity index 100% rename from training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/launcher.sh rename 
to training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/recipe/launcher.sh diff --git a/training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/llama3-1-405b-seq8192-gbs128-mbs1-gpus128.py b/training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/recipe/llama3-1-405b-seq8192-gbs128-mbs1-gpus128.py similarity index 100% rename from training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/llama3-1-405b-seq8192-gbs128-mbs1-gpus128.py rename to training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/recipe/llama3-1-405b-seq8192-gbs128-mbs1-gpus128.py diff --git a/training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/templates/workload-config-configmap.yaml b/training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/recipe/templates/workload-config-configmap.yaml similarity index 100% rename from training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/templates/workload-config-configmap.yaml rename to training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/recipe/templates/workload-config-configmap.yaml diff --git a/training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/templates/workload-job.yaml b/training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/recipe/templates/workload-job.yaml similarity index 100% rename from training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/templates/workload-job.yaml rename to training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/recipe/templates/workload-job.yaml diff --git a/training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/templates/workload-launcher-configmap.yaml b/training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/recipe/templates/workload-launcher-configmap.yaml similarity index 100% rename from training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/templates/workload-launcher-configmap.yaml rename to 
training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/recipe/templates/workload-launcher-configmap.yaml diff --git a/training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/templates/workload-svc.yaml b/training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/recipe/templates/workload-svc.yaml similarity index 100% rename from training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/templates/workload-svc.yaml rename to training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/recipe/templates/workload-svc.yaml diff --git a/training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/values.yaml b/training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/recipe/values.yaml similarity index 100% rename from training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/values.yaml rename to training/a4/llama31_405b/nemo-gke/nemo2507/128gpus-fp8-gbs128/recipe/recipe/values.yaml diff --git a/training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/Chart.yaml b/training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/recipe/Chart.yaml similarity index 100% rename from training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/Chart.yaml rename to training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/recipe/Chart.yaml diff --git a/training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/README.md b/training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/recipe/README.md similarity index 100% rename from training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/README.md rename to training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/recipe/README.md diff --git a/training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/launcher.sh b/training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/recipe/launcher.sh similarity index 100% rename from 
training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/launcher.sh rename to training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/recipe/launcher.sh diff --git a/training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/llama3-1-405b-seq8192-gbs256-mbs1-gpus256.py b/training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/recipe/llama3-1-405b-seq8192-gbs256-mbs1-gpus256.py similarity index 100% rename from training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/llama3-1-405b-seq8192-gbs256-mbs1-gpus256.py rename to training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/recipe/llama3-1-405b-seq8192-gbs256-mbs1-gpus256.py diff --git a/training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/templates/workload-config-configmap.yaml b/training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/recipe/templates/workload-config-configmap.yaml similarity index 100% rename from training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/templates/workload-config-configmap.yaml rename to training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/recipe/templates/workload-config-configmap.yaml diff --git a/training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/templates/workload-job.yaml b/training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/recipe/templates/workload-job.yaml similarity index 100% rename from training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/templates/workload-job.yaml rename to training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/recipe/templates/workload-job.yaml diff --git a/training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/templates/workload-launcher-configmap.yaml b/training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/recipe/templates/workload-launcher-configmap.yaml similarity index 100% rename from 
training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/templates/workload-launcher-configmap.yaml rename to training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/recipe/templates/workload-launcher-configmap.yaml diff --git a/training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/templates/workload-svc.yaml b/training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/recipe/templates/workload-svc.yaml similarity index 100% rename from training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/templates/workload-svc.yaml rename to training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/recipe/templates/workload-svc.yaml diff --git a/training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/values.yaml b/training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/recipe/values.yaml similarity index 100% rename from training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/values.yaml rename to training/a4/llama31_405b/nemo-gke/nemo2507/256gpus-fp8-gbs256/recipe/recipe/values.yaml diff --git a/training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/Chart.yaml b/training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/recipe/Chart.yaml similarity index 100% rename from training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/Chart.yaml rename to training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/recipe/Chart.yaml diff --git a/training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/README.md b/training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/recipe/README.md similarity index 100% rename from training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/README.md rename to training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/recipe/README.md diff --git a/training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/launcher.sh 
b/training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/recipe/launcher.sh similarity index 100% rename from training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/launcher.sh rename to training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/recipe/launcher.sh diff --git a/training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/llama3-1-405b-seq8192-gbs2048-mbs1-gpus64.py b/training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/recipe/llama3-1-405b-seq8192-gbs2048-mbs1-gpus64.py similarity index 100% rename from training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/llama3-1-405b-seq8192-gbs2048-mbs1-gpus64.py rename to training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/recipe/llama3-1-405b-seq8192-gbs2048-mbs1-gpus64.py diff --git a/training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/templates/workload-config-configmap.yaml b/training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/recipe/templates/workload-config-configmap.yaml similarity index 100% rename from training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/templates/workload-config-configmap.yaml rename to training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/recipe/templates/workload-config-configmap.yaml diff --git a/training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/templates/workload-job.yaml b/training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/recipe/templates/workload-job.yaml similarity index 100% rename from training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/templates/workload-job.yaml rename to training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/recipe/templates/workload-job.yaml diff --git a/training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/templates/workload-launcher-configmap.yaml 
b/training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/recipe/templates/workload-launcher-configmap.yaml similarity index 100% rename from training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/templates/workload-launcher-configmap.yaml rename to training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/recipe/templates/workload-launcher-configmap.yaml diff --git a/training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/templates/workload-svc.yaml b/training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/recipe/templates/workload-svc.yaml similarity index 100% rename from training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/templates/workload-svc.yaml rename to training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/recipe/templates/workload-svc.yaml diff --git a/training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/values.yaml b/training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/recipe/values.yaml similarity index 100% rename from training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/values.yaml rename to training/a4/llama31_405b/nemo-gke/nemo2507/64gpus-fp8-gbs256/recipe/recipe/values.yaml diff --git a/training/a4/llama31_405b/nemo-gke/nemo2602/256gpus-fp8cs-gbs256/recipe/README.md b/training/a4/llama31_405b/nemo-gke/nemo2602/256gpus-fp8cs-gbs256/recipe/README.md index 46a97a24..66f18b95 100644 --- a/training/a4/llama31_405b/nemo-gke/nemo2602/256gpus-fp8cs-gbs256/recipe/README.md +++ b/training/a4/llama31_405b/nemo-gke/nemo2602/256gpus-fp8cs-gbs256/recipe/README.md @@ -73,7 +73,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. 
git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a4/llama31-405b/megatron-bridge-pretraining-gke/32node-FP8CS-GBSunknown/recipe +export RECIPE_ROOT=$REPO_ROOT/training/a4/llama31_405b/nemo-gke/nemo2602/256gpus-fp8cs-gbs256/recipe cd $RECIPE_ROOT ``` diff --git a/training/a4/llama3_70b/nemo-gke/nemo2507/256gpus-bf16-gbs256/recipe/README.md b/training/a4/llama3_70b/nemo-gke/nemo2507/256gpus-bf16-gbs256/recipe/README.md index 75f2bd15..7a847251 100644 --- a/training/a4/llama3_70b/nemo-gke/nemo2507/256gpus-bf16-gbs256/recipe/README.md +++ b/training/a4/llama3_70b/nemo-gke/nemo2507/256gpus-bf16-gbs256/recipe/README.md @@ -73,7 +73,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a4/llama3_70b/nemo-gke/nemo2507/256gpus-bf16-gbs256/recipe/recipe +export RECIPE_ROOT=$REPO_ROOT/training/a4/llama3_70b/nemo-gke/nemo2507/256gpus-bf16-gbs256/recipe cd $RECIPE_ROOT ``` diff --git a/training/a4/mixtral-8x7b/nemo-pretraining-gke/README.md b/training/a4/mixtral_8x7b/nemo-gke/nemo2507/recipe/README.md similarity index 99% rename from training/a4/mixtral-8x7b/nemo-pretraining-gke/README.md rename to training/a4/mixtral_8x7b/nemo-gke/nemo2507/recipe/README.md index ef0106e0..75163de5 100644 --- a/training/a4/mixtral-8x7b/nemo-pretraining-gke/README.md +++ b/training/a4/mixtral_8x7b/nemo-gke/nemo2507/recipe/README.md @@ -83,7 +83,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. 
git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a4/mixtral-8x7b/nemo-pretraining-gke +export RECIPE_ROOT=$REPO_ROOT/training/a4/mixtral_8x7b/nemo-gke/nemo2507/recipe cd $RECIPE_ROOT ``` diff --git a/training/a4/mixtral-8x7b/nemo-pretraining-gke/values.yaml b/training/a4/mixtral_8x7b/nemo-gke/nemo2507/recipe/values.yaml similarity index 100% rename from training/a4/mixtral-8x7b/nemo-pretraining-gke/values.yaml rename to training/a4/mixtral_8x7b/nemo-gke/nemo2507/recipe/values.yaml diff --git a/training/a4/qwen3_30b_a3b/nemo-gke/nemo2602/8gpus-fp8mx-seq4096-gbs512/recipe/README.md b/training/a4/qwen3_30b_a3b/nemo-gke/nemo2602/8gpus-fp8mx-seq4096-gbs512/recipe/README.md index 63b6a6f4..b82e29e2 100644 --- a/training/a4/qwen3_30b_a3b/nemo-gke/nemo2602/8gpus-fp8mx-seq4096-gbs512/recipe/README.md +++ b/training/a4/qwen3_30b_a3b/nemo-gke/nemo2602/8gpus-fp8mx-seq4096-gbs512/recipe/README.md @@ -73,7 +73,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a4/qwen3-30b-a3b/megatron-bridge-pretraining-gke/1node-FP8MX-GBSunknown/recipe +export RECIPE_ROOT=$REPO_ROOT/training/a4/qwen3_30b_a3b/nemo-gke/nemo2602/8gpus-fp8mx-seq4096-gbs512/recipe cd $RECIPE_ROOT ``` diff --git a/training/a4x/llama3_70b/nemo-gke/nemo2602/64gpus-bf16-gbs256/README.md b/training/a4x/llama3_70b/nemo-gke/nemo2602/64gpus-bf16-gbs256/README.md index 27f9d52b..74a29c68 100644 --- a/training/a4x/llama3_70b/nemo-gke/nemo2602/64gpus-bf16-gbs256/README.md +++ b/training/a4x/llama3_70b/nemo-gke/nemo2602/64gpus-bf16-gbs256/README.md @@ -73,7 +73,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. 
git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a4x/llama3-70b/megatron-bridge-pretraining-gke/16node-FP8CS-GBSunknown/recipe +export RECIPE_ROOT=$REPO_ROOT/training/a4x/llama3_70b/nemo-gke/nemo2602/64gpus-bf16-gbs256 cd $RECIPE_ROOT ``` diff --git a/training/a4x/llama3_70b/nemo-gke/nemo2602/64gpus-fp8cs-gbs256/README.md b/training/a4x/llama3_70b/nemo-gke/nemo2602/64gpus-fp8cs-gbs256/README.md index 27f9d52b..ed1a68fe 100644 --- a/training/a4x/llama3_70b/nemo-gke/nemo2602/64gpus-fp8cs-gbs256/README.md +++ b/training/a4x/llama3_70b/nemo-gke/nemo2602/64gpus-fp8cs-gbs256/README.md @@ -73,7 +73,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a4x/llama3-70b/megatron-bridge-pretraining-gke/16node-FP8CS-GBSunknown/recipe +export RECIPE_ROOT=$REPO_ROOT/training/a4x/llama3_70b/nemo-gke/nemo2602/64gpus-fp8cs-gbs256 cd $RECIPE_ROOT ``` diff --git a/training/a4x/llama3_70b/nemo-gke/nemo2602/64gpus-fp8mx-gbs256/README.md b/training/a4x/llama3_70b/nemo-gke/nemo2602/64gpus-fp8mx-gbs256/README.md index 8bcf110b..88d46446 100644 --- a/training/a4x/llama3_70b/nemo-gke/nemo2602/64gpus-fp8mx-gbs256/README.md +++ b/training/a4x/llama3_70b/nemo-gke/nemo2602/64gpus-fp8mx-gbs256/README.md @@ -73,7 +73,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. 
git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a4x/llama3-70b/megatron-bridge-pretraining-gke/16node-FP8MX-GBSunknown/recipe +export RECIPE_ROOT=$REPO_ROOT/training/a4x/llama3_70b/nemo-gke/nemo2602/64gpus-fp8mx-gbs256 cd $RECIPE_ROOT ``` diff --git a/training/g4/llama3-1-70b/nemo-finetuning-gke/4gpu-bf16/Chart.yaml b/training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/4gpus-bf16-gbs32/recipe/Chart.yaml similarity index 100% rename from training/g4/llama3-1-70b/nemo-finetuning-gke/4gpu-bf16/Chart.yaml rename to training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/4gpus-bf16-gbs32/recipe/Chart.yaml diff --git a/training/g4/llama3-1-70b/nemo-finetuning-gke/4gpu-bf16/README.md b/training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/4gpus-bf16-gbs32/recipe/README.md similarity index 97% rename from training/g4/llama3-1-70b/nemo-finetuning-gke/4gpu-bf16/README.md rename to training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/4gpus-bf16-gbs32/recipe/README.md index 64d5dd16..41877990 100644 --- a/training/g4/llama3-1-70b/nemo-finetuning-gke/4gpu-bf16/README.md +++ b/training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/4gpus-bf16-gbs32/recipe/README.md @@ -72,7 +72,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. 
git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/g4/llama3-1-70b/nemo-pretraining-gke/4gpu-bf16 +export RECIPE_ROOT=$REPO_ROOT/training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/4gpus-bf16-gbs32/recipe cd $RECIPE_ROOT ``` diff --git a/training/g4/llama3-1-70b/nemo-finetuning-gke/4gpu-bf16/launcher.sh b/training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/4gpus-bf16-gbs32/recipe/launcher.sh similarity index 100% rename from training/g4/llama3-1-70b/nemo-finetuning-gke/4gpu-bf16/launcher.sh rename to training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/4gpus-bf16-gbs32/recipe/launcher.sh diff --git a/training/g4/llama3-1-70b/nemo-finetuning-gke/4gpu-bf16/llama3-1-70b-fine-tuning.py b/training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/4gpus-bf16-gbs32/recipe/llama3-1-70b-fine-tuning.py similarity index 100% rename from training/g4/llama3-1-70b/nemo-finetuning-gke/4gpu-bf16/llama3-1-70b-fine-tuning.py rename to training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/4gpus-bf16-gbs32/recipe/llama3-1-70b-fine-tuning.py diff --git a/training/g4/llama3-1-70b/nemo-finetuning-gke/4gpu-bf16/templates/workload-config-configmap.yaml b/training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/4gpus-bf16-gbs32/recipe/templates/workload-config-configmap.yaml similarity index 100% rename from training/g4/llama3-1-70b/nemo-finetuning-gke/4gpu-bf16/templates/workload-config-configmap.yaml rename to training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/4gpus-bf16-gbs32/recipe/templates/workload-config-configmap.yaml diff --git a/training/g4/llama3-1-70b/nemo-finetuning-gke/4gpu-bf16/templates/workload-job.yaml b/training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/4gpus-bf16-gbs32/recipe/templates/workload-job.yaml similarity index 100% rename from training/g4/llama3-1-70b/nemo-finetuning-gke/4gpu-bf16/templates/workload-job.yaml rename to 
training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/4gpus-bf16-gbs32/recipe/templates/workload-job.yaml diff --git a/training/g4/llama3-1-70b/nemo-finetuning-gke/4gpu-bf16/templates/workload-launcher-configmap.yaml b/training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/4gpus-bf16-gbs32/recipe/templates/workload-launcher-configmap.yaml similarity index 100% rename from training/g4/llama3-1-70b/nemo-finetuning-gke/4gpu-bf16/templates/workload-launcher-configmap.yaml rename to training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/4gpus-bf16-gbs32/recipe/templates/workload-launcher-configmap.yaml diff --git a/training/g4/llama3-1-70b/nemo-finetuning-gke/4gpu-bf16/templates/workload-svc.yaml b/training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/4gpus-bf16-gbs32/recipe/templates/workload-svc.yaml similarity index 100% rename from training/g4/llama3-1-70b/nemo-finetuning-gke/4gpu-bf16/templates/workload-svc.yaml rename to training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/4gpus-bf16-gbs32/recipe/templates/workload-svc.yaml diff --git a/training/g4/llama3-1-70b/nemo-finetuning-gke/4gpu-bf16/values.yaml b/training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/4gpus-bf16-gbs32/recipe/values.yaml similarity index 100% rename from training/g4/llama3-1-70b/nemo-finetuning-gke/4gpu-bf16/values.yaml rename to training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/4gpus-bf16-gbs32/recipe/values.yaml diff --git a/training/g4/llama3-1-70b/nemo-finetuning-gke/8gpu-bf16/Chart.yaml b/training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/8gpus-bf16-gbs32/recipe/Chart.yaml similarity index 100% rename from training/g4/llama3-1-70b/nemo-finetuning-gke/8gpu-bf16/Chart.yaml rename to training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/8gpus-bf16-gbs32/recipe/Chart.yaml diff --git a/training/g4/llama3-1-70b/nemo-finetuning-gke/8gpu-bf16/README.md b/training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/8gpus-bf16-gbs32/recipe/README.md similarity index 97% rename from 
training/g4/llama3-1-70b/nemo-finetuning-gke/8gpu-bf16/README.md rename to training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/8gpus-bf16-gbs32/recipe/README.md index 46b6cca9..53efb508 100644 --- a/training/g4/llama3-1-70b/nemo-finetuning-gke/8gpu-bf16/README.md +++ b/training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/8gpus-bf16-gbs32/recipe/README.md @@ -72,7 +72,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/g4/llama3-1-70b/nemo-finetuning-gke/8gpu-bf16 +export RECIPE_ROOT=$REPO_ROOT/training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/8gpus-bf16-gbs32/recipe cd $RECIPE_ROOT ``` diff --git a/training/g4/llama3-1-70b/nemo-finetuning-gke/8gpu-bf16/launcher.sh b/training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/8gpus-bf16-gbs32/recipe/launcher.sh similarity index 100% rename from training/g4/llama3-1-70b/nemo-finetuning-gke/8gpu-bf16/launcher.sh rename to training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/8gpus-bf16-gbs32/recipe/launcher.sh diff --git a/training/g4/llama3-1-70b/nemo-finetuning-gke/8gpu-bf16/llama3-1-70b-fine-tuning.py b/training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/8gpus-bf16-gbs32/recipe/llama3-1-70b-fine-tuning.py similarity index 100% rename from training/g4/llama3-1-70b/nemo-finetuning-gke/8gpu-bf16/llama3-1-70b-fine-tuning.py rename to training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/8gpus-bf16-gbs32/recipe/llama3-1-70b-fine-tuning.py diff --git a/training/g4/llama3-1-70b/nemo-finetuning-gke/8gpu-bf16/templates/workload-config-configmap.yaml b/training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/8gpus-bf16-gbs32/recipe/templates/workload-config-configmap.yaml similarity index 100% rename from training/g4/llama3-1-70b/nemo-finetuning-gke/8gpu-bf16/templates/workload-config-configmap.yaml rename to 
training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/8gpus-bf16-gbs32/recipe/templates/workload-config-configmap.yaml diff --git a/training/g4/llama3-1-70b/nemo-finetuning-gke/8gpu-bf16/templates/workload-job.yaml b/training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/8gpus-bf16-gbs32/recipe/templates/workload-job.yaml similarity index 100% rename from training/g4/llama3-1-70b/nemo-finetuning-gke/8gpu-bf16/templates/workload-job.yaml rename to training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/8gpus-bf16-gbs32/recipe/templates/workload-job.yaml diff --git a/training/g4/llama3-1-70b/nemo-finetuning-gke/8gpu-bf16/templates/workload-launcher-configmap.yaml b/training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/8gpus-bf16-gbs32/recipe/templates/workload-launcher-configmap.yaml similarity index 100% rename from training/g4/llama3-1-70b/nemo-finetuning-gke/8gpu-bf16/templates/workload-launcher-configmap.yaml rename to training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/8gpus-bf16-gbs32/recipe/templates/workload-launcher-configmap.yaml diff --git a/training/g4/llama3-1-70b/nemo-finetuning-gke/8gpu-bf16/templates/workload-svc.yaml b/training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/8gpus-bf16-gbs32/recipe/templates/workload-svc.yaml similarity index 100% rename from training/g4/llama3-1-70b/nemo-finetuning-gke/8gpu-bf16/templates/workload-svc.yaml rename to training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/8gpus-bf16-gbs32/recipe/templates/workload-svc.yaml diff --git a/training/g4/llama3-1-70b/nemo-finetuning-gke/8gpu-bf16/values.yaml b/training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/8gpus-bf16-gbs32/recipe/values.yaml similarity index 100% rename from training/g4/llama3-1-70b/nemo-finetuning-gke/8gpu-bf16/values.yaml rename to training/g4/llama3_70b/nemo-finetuning-gke/nemo2507/8gpus-bf16-gbs32/recipe/values.yaml