From acf1f8f50e93c9717bfb3c73179ad9e99044f252 Mon Sep 17 00:00:00 2001 From: gurusai-voleti Date: Wed, 18 Feb 2026 12:40:31 +0000 Subject: [PATCH] chore: Migrate gsutil usage to gcloud storage --- .../prepare-model/templates/model-serve-downloader.yaml | 2 +- .../prepare-model/templates/model-serve-downloader.yaml | 2 +- inference/trillium/JetStream-Maxtext/Llama2-7B/README.md | 4 ++-- inference/v5e/JetStream-Maxtext/Llama2-7B/README.md | 4 ++-- microbenchmarks/trillium/collectives/README.md | 6 +++--- .../archive/trillium/Llama3.0-70B-PyTorch/XPK/README.md | 5 +---- training/archive/trillium/Llama3.0-8B-PyTorch/XPK/README.md | 5 +---- .../archive/trillium/Llama3.1-405B-PyTorch/XPK/README.md | 2 +- .../archive/trillium/Mixtral-8x7B-Pytorch/XPK/README.md | 5 +---- training/ironwood/wan2.1-14b/bf16-tpu7x-4x4x4/xpk/README.md | 2 +- 10 files changed, 14 insertions(+), 23 deletions(-) diff --git a/inference/trillium/JetStream-Maxtext/Llama-4-Maverick-17B-128E/prepare-model/templates/model-serve-downloader.yaml b/inference/trillium/JetStream-Maxtext/Llama-4-Maverick-17B-128E/prepare-model/templates/model-serve-downloader.yaml index d7ef84c7..cfbadd49 100644 --- a/inference/trillium/JetStream-Maxtext/Llama-4-Maverick-17B-128E/prepare-model/templates/model-serve-downloader.yaml +++ b/inference/trillium/JetStream-Maxtext/Llama-4-Maverick-17B-128E/prepare-model/templates/model-serve-downloader.yaml @@ -95,7 +95,7 @@ spec: --model-size ${MODEL_SIZE} \ --huggingface-checkpoint True - gsutil -m cp -r ${CHECKPOINT_TPU_UNSCANNED} /gcs/{{ .Values.model.name }}/output/unscanned_ckpt/checkpoints/ + gcloud storage cp --recursive ${CHECKPOINT_TPU_UNSCANNED} /gcs/{{ .Values.model.name }}/output/unscanned_ckpt/checkpoints/ echo "Conversion Job Complete. Unscanned checkpoints should be at ${CHECKPOINT_TPU_UNSCANNED}" diff --git a/inference/trillium/JetStream-Maxtext/Llama-4-Scout-17B-16E/prepare-model/templates/model-serve-downloader.yaml b/inference/trillium/JetStream-Maxtext/Llama-4-Scout-17B-16E/prepare-model/templates/model-serve-downloader.yaml index b6b7de87..79ca655b 100644 --- a/inference/trillium/JetStream-Maxtext/Llama-4-Scout-17B-16E/prepare-model/templates/model-serve-downloader.yaml +++ b/inference/trillium/JetStream-Maxtext/Llama-4-Scout-17B-16E/prepare-model/templates/model-serve-downloader.yaml @@ -106,7 +106,7 @@ spec: echo "Conversion Job Complete. Unscanned checkpoints should be at ${CHECKPOINT_TPU_UNSCANNED}" echo "Copying unscanned checkpoints to GCS bucket..." - gsutil -m cp -r ${CHECKPOINT_TPU_UNSCANNED} gs://${GCS_FUSE_BUCKET}/{{ .Values.model.name }}/output/unscanned_ckpt/ + gcloud storage cp --recursive ${CHECKPOINT_TPU_UNSCANNED} gs://${GCS_FUSE_BUCKET}/{{ .Values.model.name }}/output/unscanned_ckpt/ echo "Finished copying unscanned checkpoints to gs://${GCS_FUSE_BUCKET}/{{ .Values.model.name }}/output/unscanned_ckpt/" volumeMounts: diff --git a/inference/trillium/JetStream-Maxtext/Llama2-7B/README.md b/inference/trillium/JetStream-Maxtext/Llama2-7B/README.md index a74b4ede..87f07d02 100644 --- a/inference/trillium/JetStream-Maxtext/Llama2-7B/README.md +++ b/inference/trillium/JetStream-Maxtext/Llama2-7B/README.md @@ -42,7 +42,7 @@ bash download.sh # When prompted, choose 7B. This should create a directory llam export CHKPT_BUCKET=gs://... export MAXTEXT_BUCKET_SCANNED=gs://... export MAXTEXT_BUCKET_UNSCANNED=gs://... -gsutil cp -r llama/llama-2-7b/* ${CHKPT_BUCKET} +gcloud storage cp --recursive llama/llama-2-7b/* ${CHKPT_BUCKET} # Checkpoint conversion @@ -117,4 +117,4 @@ Mean TPOT: 5052.76 ms Median TPOT: 164.01 ms P99 TPOT: 112171.56 ms -``` +``` \ No newline at end of file diff --git a/inference/v5e/JetStream-Maxtext/Llama2-7B/README.md b/inference/v5e/JetStream-Maxtext/Llama2-7B/README.md index c0a40bef..4bf289af 100644 --- a/inference/v5e/JetStream-Maxtext/Llama2-7B/README.md +++ b/inference/v5e/JetStream-Maxtext/Llama2-7B/README.md @@ -42,7 +42,7 @@ bash download.sh # When prompted, choose 7B. This should create a directory llam export CHKPT_BUCKET=gs://... export MAXTEXT_BUCKET_SCANNED=gs://... export MAXTEXT_BUCKET_UNSCANNED=gs://... -gsutil cp -r llama/llama-2-7b ${CHKPT_BUCKET} +gcloud storage cp --recursive llama/llama-2-7b ${CHKPT_BUCKET} # Checkpoint conversion @@ -117,4 +117,4 @@ Mean TPOT: 5052.76 ms Median TPOT: 164.01 ms P99 TPOT: 112171.56 ms -``` +``` \ No newline at end of file diff --git a/microbenchmarks/trillium/collectives/README.md b/microbenchmarks/trillium/collectives/README.md index 3b1089e5..00ed6118 100644 --- a/microbenchmarks/trillium/collectives/README.md +++ b/microbenchmarks/trillium/collectives/README.md @@ -33,7 +33,7 @@ psum_ici: Matrix size: 17408x17408, dtype=, matrix_s Results will be printed out and also stored at `/tmp/microbenchmarks/collectives`. You can save the stored results to GCS by adding the following to `--command` in the XPK command: ``` -gsutil cp -r /tmp/microbenchmarks/collectives gs:// +gcloud storage cp --recursive /tmp/microbenchmarks/collectives gs:// ``` ### Run with a custom yaml config @@ -41,7 +41,7 @@ If you would like to run with a custom defined yaml with modified configurations Start by creating a yaml file `your_config.yaml`. Take a look at [1x_v6e_256.yaml](https://github.com/AI-Hypercomputer/accelerator-microbenchmarks/blob/35c10a42e8cfab7593157327dd3ad3150e4c001d/configs/1x_v6e_256.yaml) for an example yaml config. Then upload it to your GCS bucket: ``` -gsutil cp your_config.yaml gs:// +gcloud storage cp your_config.yaml gs:// ``` Then use a modified launch command that pulls the yaml file from GCS and references it in the benchmark command: @@ -51,7 +51,7 @@ python3 ~/xpk/xpk.py workload create \ --project=${PROJECT} \ --zone=${ZONE} \ --device-type=v6e-256 \ - --command="git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git && cd accelerator-microbenchmarks && git checkout trillium-collectives && pip install -r requirements.txt && echo '4096 41943040 314572800' > /proc/sys/net/ipv4/tcp_rmem && export LIBTPU_INIT_ARGS='--megascale_grpc_premap_memory_bytes=17179869184 --xla_tpu_enable_sunk_dcn_allreduce_done_with_host_reduction=true' && gsutil cp gs:///your_config.yaml configs/ && python src/run_benchmark.py --config=configs/your_config.yaml" \ + --command="git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git && cd accelerator-microbenchmarks && git checkout trillium-collectives && pip install -r requirements.txt && echo '4096 41943040 314572800' > /proc/sys/net/ipv4/tcp_rmem && export LIBTPU_INIT_ARGS='--megascale_grpc_premap_memory_bytes=17179869184 --xla_tpu_enable_sunk_dcn_allreduce_done_with_host_reduction=true' && gcloud storage cp gs:///your_config.yaml configs/ && python src/run_benchmark.py --config=configs/your_config.yaml" \ --num-slices=1 \ --docker-image=us-docker.pkg.dev/cloud-tpu-images/jax-stable-stack/tpu:jax0.5.2-rev1 \ --workload=${WORKLOAD_NAME} diff --git a/training/archive/trillium/Llama3.0-70B-PyTorch/XPK/README.md b/training/archive/trillium/Llama3.0-70B-PyTorch/XPK/README.md index c480aa85..c512be82 100644 --- a/training/archive/trillium/Llama3.0-70B-PyTorch/XPK/README.md +++ b/training/archive/trillium/Llama3.0-70B-PyTorch/XPK/README.md @@ -1,5 +1,3 @@ - - # Instructions for training Llama 3.0 70B on Trillium TPU on multipod using XPK ## Environment Steup @@ -88,7 +86,7 @@ You can use the profile export PROFILE_SCRIPT_PATH=../../../../utils/ # download the profile from gcp bucket to local -gsutil cp -r $PROFILE_LOG_DIR ./ +gcloud storage cp --recursive $PROFILE_LOG_DIR ./ # locate the xplane.pb file and process PYTHONPATH==$PROFILE_SCRIPT_PATH:$PYTHONPATH python $PROFILE_SCRIPT_PATH/profile_convert.py xplane.pb @@ -112,4 +110,3 @@ Plane ID: 2, Name: /device:TPU:0 Got 10 iterations 1.8454 ``` - diff --git a/training/archive/trillium/Llama3.0-8B-PyTorch/XPK/README.md b/training/archive/trillium/Llama3.0-8B-PyTorch/XPK/README.md index 901c2ef4..e5eb9602 100644 --- a/training/archive/trillium/Llama3.0-8B-PyTorch/XPK/README.md +++ b/training/archive/trillium/Llama3.0-8B-PyTorch/XPK/README.md @@ -1,5 +1,3 @@ - - # Instructions for training Llama 3.0 8B on Trillium TPU on multipod using XPK ## Environment Steup @@ -87,7 +85,7 @@ You can use the profile export PROFILE_SCRIPT_PATH=../../../../utils/ # download the profile from gcp bucket to local -gsutil cp -r $PROFILE_LOG_DIR ./ +gcloud storage cp --recursive $PROFILE_LOG_DIR ./ # locate the xplane.pb file and process PYTHONPATH==$PROFILE_SCRIPT_PATH:$PYTHONPATH python $PROFILE_SCRIPT_PATH/profile_convert.py xplane.pb @@ -111,4 +109,3 @@ Plane ID: 2, Name: /device:TPU:0 Got 10 iterations 1.8454 ``` - diff --git a/training/archive/trillium/Llama3.1-405B-PyTorch/XPK/README.md b/training/archive/trillium/Llama3.1-405B-PyTorch/XPK/README.md index a27ce13c..872dbce7 100644 --- a/training/archive/trillium/Llama3.1-405B-PyTorch/XPK/README.md +++ b/training/archive/trillium/Llama3.1-405B-PyTorch/XPK/README.md @@ -99,7 +99,7 @@ You can use the profile export PROFILE_SCRIPT_PATH=../../../../utils/ # download the profile from gcp bucket to local -gsutil cp -r $PROFILE_LOG_DIR ./ +gcloud storage cp --recursive $PROFILE_LOG_DIR ./ # locate the profile output ending with ".pb". # Name it xplane.pb file, and process it diff --git a/training/archive/trillium/Mixtral-8x7B-Pytorch/XPK/README.md b/training/archive/trillium/Mixtral-8x7B-Pytorch/XPK/README.md index f2946bd0..f6be87fd 100644 --- a/training/archive/trillium/Mixtral-8x7B-Pytorch/XPK/README.md +++ b/training/archive/trillium/Mixtral-8x7B-Pytorch/XPK/README.md @@ -1,5 +1,3 @@ - - # Instructions for training Mixtral 8x7B on Trillium TPU on multipod using XPK ## Environment Steup @@ -87,7 +85,7 @@ You can use the profile # this is the place we place the profile processing script export PROFILE_SCRIPT_PATH=../../../../utils/ # download the profile from gcp bucket to local -gsutil cp -r $PROFILE_LOG_DIR ./ +gcloud storage cp --recursive $PROFILE_LOG_DIR ./ # locate the xplane.pb file and process PYTHONPATH==$PROFILE_SCRIPT_PATH:$PYTHONPATH python $PROFILE_SCRIPT_PATH/profile_convert.py xplane.pb ``` @@ -110,4 +108,3 @@ Plane ID: 2, Name: /device:TPU:0 Got 10 iterations 1.8454 ``` - diff --git a/training/ironwood/wan2.1-14b/bf16-tpu7x-4x4x4/xpk/README.md b/training/ironwood/wan2.1-14b/bf16-tpu7x-4x4x4/xpk/README.md index 98559794..a8ae5960 100644 --- a/training/ironwood/wan2.1-14b/bf16-tpu7x-4x4x4/xpk/README.md +++ b/training/ironwood/wan2.1-14b/bf16-tpu7x-4x4x4/xpk/README.md @@ -243,7 +243,7 @@ huggingface-cli download RaphaelLiu/PusaV1_training --repo-type dataset --local- python src/maxdiffusion/data_preprocessing/wan_pusav1_to_tfrecords.py src/maxdiffusion/configs/base_wan_14b.yml train_data_dir=${HF_DATASET_DIR} tfrecords_dir=${TFRECORDS_DATASET_DIR} no_records_per_shard=10 skip_jax_distributed_system=True # Upload to gcs -gsutil -m cp -r ${TFRECORDS_DATASET_DIR} ${DATASET_DIR} +gcloud storage cp --recursive ${TFRECORDS_DATASET_DIR} ${DATASET_DIR} ``` ## Run the recipe