diff --git a/examples/mimo/scripts/run_hetero_nemotron_54l_hel_train.sh b/examples/mimo/scripts/run_hetero_nemotron_54l_hel_train.sh index 3d206961c43..70828a8c10b 100755 --- a/examples/mimo/scripts/run_hetero_nemotron_54l_hel_train.sh +++ b/examples/mimo/scripts/run_hetero_nemotron_54l_hel_train.sh @@ -96,7 +96,7 @@ LR_WSD_DECAY_SAMPLES="${LR_WSD_DECAY_SAMPLES:-}" LR_WSD_DECAY_STYLE="${LR_WSD_DECAY_STYLE:-}" TRAIN_SAMPLES="${TRAIN_SAMPLES:-}" PACKING_BUFFER_SIZE="${PACKING_BUFFER_SIZE:-128}" -NUM_WORKERS="${NUM_WORKERS:-1}" +NUM_WORKERS="${NUM_WORKERS:-2}" SHUFFLE_BUFFER_SIZE="${SHUFFLE_BUFFER_SIZE:-100}" MAX_SAMPLES_PER_SEQUENCE="${MAX_SAMPLES_PER_SEQUENCE:-100}" LOG_INTERVAL="${LOG_INTERVAL:-10}" diff --git a/examples/mimo/scripts/sbatch_hetero_prod_gbs768_100n.sh b/examples/mimo/scripts/sbatch_hetero_prod_gbs768_100n.sh index ff6d5ad1c99..ee30b406ecd 100755 --- a/examples/mimo/scripts/sbatch_hetero_prod_gbs768_100n.sh +++ b/examples/mimo/scripts/sbatch_hetero_prod_gbs768_100n.sh @@ -8,7 +8,7 @@ # LR_WSD_DECAY_SAMPLES=18310547, LR_WSD_DECAY_STYLE=minus_sqrt # * PACKING_BUFFER_SIZE=128 # * SAVE_INTERVAL=1000 (LOG_INTERVAL=1 for per-iter visibility) -# * NUM_WORKERS=1 +# * NUM_WORKERS=2 # Deviations: LLM_EP=8 (vs 16), hetero TP=2 (vs 4), force-LB=0, no MTP. #SBATCH -A nemotron_n4_pre @@ -72,7 +72,7 @@ TRAINING_STAGE=stage2 MODEL_PROVIDER=nemotron-moe-vlm-54l ENABLE_EXPERIMENTAL=1 MOE_ROUTER_FORCE_LOAD_BALANCING=0 -NUM_WORKERS=1 +NUM_WORKERS=2 PACKING_BUFFER_SIZE=128 SHUFFLE_BUFFER_SIZE=100 MAX_SAMPLES_PER_SEQUENCE=100 diff --git a/examples/mimo/scripts/sbatch_hetero_prod_gbs768_33n_ep8.sh b/examples/mimo/scripts/sbatch_hetero_prod_gbs768_33n_ep8.sh index 1caf4221f79..6ec8336fa0b 100755 --- a/examples/mimo/scripts/sbatch_hetero_prod_gbs768_33n_ep8.sh +++ b/examples/mimo/scripts/sbatch_hetero_prod_gbs768_33n_ep8.sh @@ -8,7 +8,7 @@ # LR_WSD_DECAY_SAMPLES=18310547, LR_WSD_DECAY_STYLE=minus_sqrt # * PACKING_BUFFER_SIZE=128 # * SAVE_INTERVAL=1000 (LOG_INTERVAL=1 for per-iter visibility) -# * NUM_WORKERS=1 +# * NUM_WORKERS=2 # Deviations from Sanjeev's baseline: # * LLM_EP=8 (vs Sanjeev's EP=16) — kept from our scaling study # * Hetero topology TP=2 (vs Sanjeev's TP=4) @@ -77,7 +77,7 @@ TRAINING_STAGE=stage2 MODEL_PROVIDER=nemotron-moe-vlm-54l ENABLE_EXPERIMENTAL=1 MOE_ROUTER_FORCE_LOAD_BALANCING=0 -NUM_WORKERS=1 +NUM_WORKERS=2 PACKING_BUFFER_SIZE=128 SHUFFLE_BUFFER_SIZE=100 MAX_SAMPLES_PER_SEQUENCE=100 diff --git a/examples/mimo/scripts/sbatch_hetero_prod_gbs768_68n_ep8.sh b/examples/mimo/scripts/sbatch_hetero_prod_gbs768_68n_ep8.sh index c6676343901..3f178cfad6f 100755 --- a/examples/mimo/scripts/sbatch_hetero_prod_gbs768_68n_ep8.sh +++ b/examples/mimo/scripts/sbatch_hetero_prod_gbs768_68n_ep8.sh @@ -8,7 +8,7 @@ # LR_WSD_DECAY_SAMPLES=18310547, LR_WSD_DECAY_STYLE=minus_sqrt # * PACKING_BUFFER_SIZE=128 # * SAVE_INTERVAL=1000 (LOG_INTERVAL=1 for per-iter visibility) -# * NUM_WORKERS=1 +# * NUM_WORKERS=2 # Deviations: LLM_EP=8 (vs 16), hetero TP=2 (vs 4), force-LB=0, no MTP. #SBATCH -A nemotron_n4_pre @@ -72,7 +72,7 @@ TRAINING_STAGE=stage2 MODEL_PROVIDER=nemotron-moe-vlm-54l ENABLE_EXPERIMENTAL=1 MOE_ROUTER_FORCE_LOAD_BALANCING=0 -NUM_WORKERS=1 +NUM_WORKERS=2 PACKING_BUFFER_SIZE=128 SHUFFLE_BUFFER_SIZE=100 MAX_SAMPLES_PER_SEQUENCE=100