From 0600c360010605aaba9fb0353707ba2b3aafb2eb Mon Sep 17 00:00:00 2001
From: Bryan Wu
Date: Mon, 9 Mar 2026 20:20:07 +0000
Subject: [PATCH] Add Qwen3-Coder-480B-A35B-Instruct recipe for vLLM on
 ironwood

---
 .../Qwen3-Coder-480B-A35B-Instruct/Chart.yaml |   6 +
 .../Qwen3-Coder-480B-A35B-Instruct/README.md  | 108 +++++++
 .../templates/benchmark.yaml                  | 261 ++++++++++++++++++
 .../values.yaml                               |  59 ++++
 4 files changed, 434 insertions(+)
 create mode 100644 inference/ironwood/vLLM/Qwen3-Coder-480B-A35B-Instruct/Chart.yaml
 create mode 100644 inference/ironwood/vLLM/Qwen3-Coder-480B-A35B-Instruct/README.md
 create mode 100644 inference/ironwood/vLLM/Qwen3-Coder-480B-A35B-Instruct/templates/benchmark.yaml
 create mode 100644 inference/ironwood/vLLM/Qwen3-Coder-480B-A35B-Instruct/values.yaml

diff --git a/inference/ironwood/vLLM/Qwen3-Coder-480B-A35B-Instruct/Chart.yaml b/inference/ironwood/vLLM/Qwen3-Coder-480B-A35B-Instruct/Chart.yaml
new file mode 100644
index 00000000..980ac6d1
--- /dev/null
+++ b/inference/ironwood/vLLM/Qwen3-Coder-480B-A35B-Instruct/Chart.yaml
@@ -0,0 +1,6 @@
+apiVersion: v2
+name: tpu7x-benchmark
+description: tpu7x-benchmark
+type: application
+version: 0.1.0
+appVersion: "1.16.0"
diff --git a/inference/ironwood/vLLM/Qwen3-Coder-480B-A35B-Instruct/README.md b/inference/ironwood/vLLM/Qwen3-Coder-480B-A35B-Instruct/README.md
new file mode 100644
index 00000000..b75a6118
--- /dev/null
+++ b/inference/ironwood/vLLM/Qwen3-Coder-480B-A35B-Instruct/README.md
@@ -0,0 +1,108 @@
+# Run vLLM Qwen3-Coder-480B-A35B-Instruct on tpu7x on GKE
+
+This recipe covers running a vLLM inference workload on tpu7x on
+GKE.
+
+## Create the GKE Cluster
+Create your tpu7x cluster using [XPK](https://github.com/AI-Hypercomputer/xpk).
+The next sections assume you have created a cluster with tpu7x
+nodes.
+
+## Deploy vLLM Workload on GKE
+
+### Configure kubectl to communicate with your cluster
+
+```
+gcloud container clusters get-credentials ${CLUSTER_NAME} --location=${LOCATION}
+```
+
+### Generate a new Hugging Face token if you don't already have one
+
+On the Hugging Face website, create an account if necessary, and go to Your
+Profile > Settings > Access Tokens.
+Select Create new token.
+Specify a name of your choice and a role with at least Read permissions.
+Select Generate a token, follow the prompts, and save your token.
+
+(NOTE: Also ensure that your account has access to the model on Hugging Face.
+Some models, such as Llama 3, are gated, and you will need to explicitly
+request permission.)
+
+### Run the benchmark
+In this directory, run:
+
+```
+helm install ${RUN_NAME} . --set hf_token=${HF_TOKEN}
+```
+
+The benchmark will launch a client pod and a server pod.
+
+On the server pod (in the default multi-host mode, the leader pod
+`${RUN_NAME}-tpu-server-0`), you’ll see logs such as these at the end of
+server startup:
+
+```
+$ kubectl logs -f ${RUN_NAME}-tpu-server-0
+
+(APIServer pid=1) INFO:     Started server process [1]
+(APIServer pid=1) INFO:     Waiting for application startup.
+(APIServer pid=1) INFO:     Application startup complete.
+```
+
+The client pod will wait until the server is up and then start the benchmark.
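+
+While you wait, you can confirm that both pods have been scheduled (pod names
+are derived from the Helm release name; the client pod, for example, is named
+`${RUN_NAME}-client`):
+
+```
+kubectl get pods
+```
+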
+On the client pod, you'll see logs such as:
+
+```
+$ kubectl logs -f ${RUN_NAME}-client
+
+============ Serving Benchmark Result ============
+Successful requests:                     10
+Failed requests:                         0
+Benchmark duration (s):                  xx
+Total input tokens:                      xxx
+Total generated tokens:                  xxx
+Request throughput (req/s):              xx
+Output token throughput (tok/s):         xxx
+Peak output token throughput (tok/s):    xxx
+Peak concurrent requests:                10.00
+Total Token throughput (tok/s):          xxx
+---------------Time to First Token----------------
+Mean TTFT (ms):                          xxx
+Median TTFT (ms):                        xxx
+P99 TTFT (ms):                           xxx
+-----Time per Output Token (excl. 1st token)------
+Mean TPOT (ms):                          xxx
+Median TPOT (ms):                        xxx
+P99 TPOT (ms):                           xxx
+---------------Inter-token Latency----------------
+Mean ITL (ms):                           xxx
+Median ITL (ms):                         xxx
+P99 ITL (ms):                            xxx
+==================================================
+```
+
+### Customizing the benchmark
+
+To change benchmark parameters, such as the model or the number of prompts,
+modify the settings in `values.yaml`.
+
+**IMPORTANT:** Before running the recipe, you must edit `values.yaml` and fill
+in the empty `gcp_service_account` value and the empty `--model=` flag in
+`server.bash_command` with your actual GCP service account and model path/ID.
+
+### Cleanup
+When you are done running the benchmark, the server will still be running. You
+can clean up by running:
+
+```
+helm uninstall ${RUN_NAME}
+```
diff --git a/inference/ironwood/vLLM/Qwen3-Coder-480B-A35B-Instruct/templates/benchmark.yaml b/inference/ironwood/vLLM/Qwen3-Coder-480B-A35B-Instruct/templates/benchmark.yaml
new file mode 100644
index 00000000..ae4b46e2
--- /dev/null
+++ b/inference/ironwood/vLLM/Qwen3-Coder-480B-A35B-Instruct/templates/benchmark.yaml
@@ -0,0 +1,261 @@
+# yamllint disable
+{{- if not .Values.is_multi_host }}
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+  name: "{{ .Release.Name }}-hyperdisk-balanced-tpu"
+provisioner: pd.csi.storage.gke.io
+parameters:
+  type: hyperdisk-balanced
+  # provisioned-iops: "3000" # Optional: Adjust IOPS as needed
+  # provisioned-throughput: "140" # Optional: Adjust Throughput (MiB/s) as needed
+reclaimPolicy: Delete
+volumeBindingMode: WaitForFirstConsumer
+allowVolumeExpansion: true
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: "{{ .Release.Name }}-hd-claim"
+spec:
+  storageClassName: "{{ .Release.Name }}-hyperdisk-balanced-tpu"
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: {{.Values.server.data_disk_size}}
+---
+{{- end }}
+apiVersion: v1
+kind: Secret
+metadata:
+  name: "{{ .Release.Name }}-hf-secret"
+type: Opaque
+stringData:
+  hf_api_token: {{.Values.hf_token}}
+---
+{{- if .Values.gcp_service_account }}
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: {{ .Values.k8s_service_account | quote }}
+  namespace: default
+  annotations:
+    iam.gke.io/gcp-service-account: {{ .Values.gcp_service_account }}
+{{- end }}
+---
+{{- if not .Values.is_multi_host }}
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: "{{ .Release.Name }}-tpu-server"
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: "{{ .Release.Name }}-tpu-pod"
+  template:
+    metadata:
+      labels:
+        app: "{{ .Release.Name }}-tpu-pod"
+    spec:
+      {{- if .Values.gcp_service_account }}
+      serviceAccountName: {{ .Values.k8s_service_account | quote }}
+      {{- end }}
+      nodeSelector:
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: {{.Values.server.topology}}
+      containers:
+      - name: vllm-tpu
+        image: {{.Values.server.image}}
+        command: ["/bin/bash", "-c"]
+        args:
+        - {{ .Values.server.bash_command | quote }}
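+        # This command string is rendered from .Values.server.bash_command in
+        # values.yaml; it starts the vLLM OpenAI-compatible API server on port
+        # 8000 and downloads model weights to /data (--download-dir).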
+        env:
+        - name: HF_HOME
+          value: /data
+        - name: HUGGING_FACE_HUB_TOKEN
+          valueFrom:
+            secretKeyRef:
+              name: "{{ .Release.Name }}-hf-secret"
+              key: hf_api_token
+        {{- if .Values.server.model_impl_type }}
+        - name: MODEL_IMPL_TYPE
+          value: {{.Values.server.model_impl_type}}
+        {{- end }}
+        ports:
+        - containerPort: 8000
+        resources:
+          limits:
+            google.com/tpu: "{{.Values.server.num_chips_per_node}}"
+          requests:
+            google.com/tpu: "{{.Values.server.num_chips_per_node}}"
+        readinessProbe:
+          tcpSocket:
+            port: 8000
+          initialDelaySeconds: 15
+          periodSeconds: 10
+        volumeMounts:
+        - mountPath: "/data"
+          name: data-volume
+        - mountPath: /dev/shm
+          name: dshm
+      volumes:
+      - emptyDir:
+          medium: Memory
+        name: dshm
+      - name: data-volume
+        persistentVolumeClaim:
+          claimName: "{{ .Release.Name }}-hd-claim"
+{{- else }}
+apiVersion: leaderworkerset.x-k8s.io/v1
+kind: LeaderWorkerSet
+metadata:
+  name: "{{ .Release.Name }}-tpu-server"
+  annotations:
+    leaderworkerset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool
+spec:
+  replicas: 1
+  leaderWorkerTemplate:
+    size: {{ .Values.server.num_nodes }}
+    restartPolicy: RecreateGroupOnPodRestart
+    leaderTemplate:
+      metadata:
+        labels:
+          role: leader
+          app: "{{ .Release.Name }}-tpu-pod"
+      spec:
+        {{- if .Values.gcp_service_account }}
+        serviceAccountName: {{ .Values.k8s_service_account | quote }}
+        {{- end }}
+        nodeSelector:
+          cloud.google.com/gke-tpu-accelerator: tpu7x
+          cloud.google.com/gke-tpu-topology: {{.Values.server.topology}}
+        containers:
+        - name: vllm-leader
+          image: {{.Values.server.image}}
+          env:
+          - name: HUGGING_FACE_HUB_TOKEN
+            valueFrom:
+              secretKeyRef:
+                name: "{{ .Release.Name }}-hf-secret"
+                key: hf_api_token
+          - name: TPU_MULTIHOST_BACKEND
+            value: ray
+          - name: TPU_BACKEND_TYPE
+            value: jax
+          - name: JAX_PLATFORMS
+            value: ""
+          {{- if .Values.server.model_impl_type }}
+          - name: MODEL_IMPL_TYPE
+            value: {{.Values.server.model_impl_type}}
+          {{- end }}
+          command: ["/bin/bash", "-c"]
+          args:
+          - {{ .Values.server.bash_command | quote }}
+          resources:
+            limits:
+              google.com/tpu: "{{.Values.server.num_chips_per_node}}"
+            requests:
+              google.com/tpu: "{{.Values.server.num_chips_per_node}}"
+          ports:
+          - containerPort: 8000
+          readinessProbe:
+            tcpSocket:
+              port: 8000
+            initialDelaySeconds: 15
+            periodSeconds: 10
+          volumeMounts:
+          - mountPath: /dev/shm
+            name: dshm
+        volumes:
+        - emptyDir:
+            medium: Memory
+          name: dshm
+    workerTemplate:
+      spec:
+        {{- if .Values.gcp_service_account }}
+        serviceAccountName: {{ .Values.k8s_service_account | quote }}
+        {{- end }}
+        nodeSelector:
+          cloud.google.com/gke-tpu-accelerator: tpu7x
+          cloud.google.com/gke-tpu-topology: {{.Values.server.topology}}
+        containers:
+        - name: vllm-worker
+          image: {{.Values.server.image}}
+          command:
+          - sh
+          - -c
+          - {{ .Values.server.worker_bash_command | quote }}
+          resources:
+            limits:
+              google.com/tpu: "{{.Values.server.num_chips_per_node}}"
+            requests:
+              google.com/tpu: "{{.Values.server.num_chips_per_node}}"
+          env:
+          - name: HUGGING_FACE_HUB_TOKEN
+            valueFrom:
+              secretKeyRef:
+                name: "{{ .Release.Name }}-hf-secret"
+                key: hf_api_token
+          - name: TPU_MULTIHOST_BACKEND
+            value: ray
+          - name: TPU_BACKEND_TYPE
+            value: jax
+          - name: JAX_PLATFORMS
+            value: ""
+          volumeMounts:
+          - mountPath: /dev/shm
+            name: dshm
+        volumes:
+        - emptyDir:
+            medium: Memory
+          name: dshm
+
+{{- end }}
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: "{{ .Release.Name }}-vllm-service"
+spec:
+  selector:
+    app: "{{ .Release.Name }}-tpu-pod"
+    {{- if .Values.is_multi_host }}
+    role: leader
+    {{- end }}
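+  # In multi-host mode only the leader pod runs the vLLM API server, so the
+  # selector above is narrowed to the pods labeled role=leader.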
+  type: LoadBalancer
+  ports:
+  - name: http
+    protocol: TCP
+    port: 8000
+    targetPort: 8000
+---
+apiVersion: v1
+kind: Pod
+metadata:
+  name: "{{ .Release.Name }}-client"
+spec:
+  {{- if .Values.gcp_service_account }}
+  serviceAccountName: {{ .Values.k8s_service_account | quote }}
+  {{- end }}
+  terminationGracePeriodSeconds: 60
+  containers:
+  - name: vllm-bench
+    image: {{.Values.client.image}}
+    command: ["/bin/bash", "-c"]
+    args: ["{{.Values.client.bash_command}}"]
+    env:
+    - name: HUGGING_FACE_HUB_TOKEN
+      valueFrom:
+        secretKeyRef:
+          key: hf_api_token
+          name: "{{ .Release.Name }}-hf-secret"
+    - name: SERVER_HOSTNAME
+      value: "{{ .Release.Name }}-vllm-service.default.svc.cluster.local"
+  restartPolicy: Never
diff --git a/inference/ironwood/vLLM/Qwen3-Coder-480B-A35B-Instruct/values.yaml b/inference/ironwood/vLLM/Qwen3-Coder-480B-A35B-Instruct/values.yaml
new file mode 100644
index 00000000..d5ba8bae
--- /dev/null
+++ b/inference/ironwood/vLLM/Qwen3-Coder-480B-A35B-Instruct/values.yaml
@@ -0,0 +1,59 @@
+"client":
+  "bash_command": |-
+    while ! curl http://${SERVER_HOSTNAME}:8000/ping; do sleep 30 && echo 'Waiting for server...'; done
+
+    vllm bench serve \
+      --dataset-name=random \
+      --random-input-len=512 \
+      --random-output-len=1024 \
+      --num-prompts=1000 \
+      --host=${SERVER_HOSTNAME} \
+      --port=8000 \
+      --model=Qwen/Qwen3-Coder-480B-A35B-Instruct \
+      --max-concurrency=64
+  "image": |-
+    vllm/vllm-tpu:nightly-ironwood-20260224-487a8c1-f91808a
+"gcp_service_account": |-
+
+"hf_token": !!null |-
+  null
+"is_multi_host": !!bool |-
+  true
+"k8s_service_account": |-
+  vllm-sa
+"server":
+  "bash_command": |-
+    bash /workspace/vllm/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
+    VLLM_DISABLE_SHARED_EXPERTS_STREAM=1 python3 -m vllm.entrypoints.openai.api_server \
+      --host=0.0.0.0 \
+      --port=8000 \
+      --download-dir=/data \
+      --tensor-parallel-size=16 \
+      --max-model-len=4096 \
+      --load-format=runai_streamer \
+      --kv-cache-dtype=fp8 \
+      --gpu-memory-utilization=0.93 \
+      --data-parallel-size=1 \
+      --max-num-batched-tokens=2048 \
+      --max-num-seqs=512 \
+      --model= \
+      --served-model-name=Qwen/Qwen3-Coder-480B-A35B-Instruct \
+      --no-enable-prefix-caching \
+      --no-async-scheduling \
+      --enable-expert-parallel
+  "data_disk_size": |-
+    100Gi
+  "image": |-
+    vllm/vllm-tpu:nightly-ironwood-20260224-487a8c1-f91808a
+  "model_impl_type": |-
+    vllm
+  "num_chips_per_node": !!int |-
+    4
+  "num_nodes": !!int |-
+    2
+  "topology": |-
+    2x2x2
+  "worker_bash_command": |-
+    bash /workspace/vllm/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)
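+# NOTE: the empty --model= flag in server.bash_command above must be filled in
+# with your model path or ID before deploying; see the README for details.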