From 0600c360010605aaba9fb0353707ba2b3aafb2eb Mon Sep 17 00:00:00 2001
From: Bryan Wu
Date: Mon, 9 Mar 2026 20:20:07 +0000
Subject: [PATCH] Add Qwen3-Coder-480B-A35B-Instruct recipe for vLLM on
 ironwood

---
 .../Qwen3-Coder-480B-A35B-Instruct/Chart.yaml |   6 +
 .../Qwen3-Coder-480B-A35B-Instruct/README.md  | 108 +++++++
 .../templates/benchmark.yaml                  | 261 ++++++++++++++++++
 .../values.yaml                               |  59 ++++
 4 files changed, 434 insertions(+)
 create mode 100644 inference/ironwood/vLLM/Qwen3-Coder-480B-A35B-Instruct/Chart.yaml
 create mode 100644 inference/ironwood/vLLM/Qwen3-Coder-480B-A35B-Instruct/README.md
 create mode 100644 inference/ironwood/vLLM/Qwen3-Coder-480B-A35B-Instruct/templates/benchmark.yaml
 create mode 100644 inference/ironwood/vLLM/Qwen3-Coder-480B-A35B-Instruct/values.yaml

diff --git a/inference/ironwood/vLLM/Qwen3-Coder-480B-A35B-Instruct/Chart.yaml b/inference/ironwood/vLLM/Qwen3-Coder-480B-A35B-Instruct/Chart.yaml
new file mode 100644
index 00000000..980ac6d1
--- /dev/null
+++ b/inference/ironwood/vLLM/Qwen3-Coder-480B-A35B-Instruct/Chart.yaml
@@ -0,0 +1,6 @@
+apiVersion: v2
+name: tpu7x-benchmark
+description: tpu7x-benchmark
+type: application
+version: 0.1.0
+appVersion: "1.16.0"
diff --git a/inference/ironwood/vLLM/Qwen3-Coder-480B-A35B-Instruct/README.md b/inference/ironwood/vLLM/Qwen3-Coder-480B-A35B-Instruct/README.md
new file mode 100644
index 00000000..b75a6118
--- /dev/null
+++ b/inference/ironwood/vLLM/Qwen3-Coder-480B-A35B-Instruct/README.md
@@ -0,0 +1,108 @@
+# Run vLLM Qwen3-Coder-480B-A35B-Instruct on tpu7x on GKE
+
+This recipe covers running a vLLM inference workload on tpu7x on
+GKE.
+
+## Create the GKE Cluster
+Create your tpu7x cluster using [XPK](https://github.com/AI-Hypercomputer/xpk).
+The next sections assume you have created a cluster with tpu7x
+nodes.
+
+## Deploy vLLM Workload on GKE
+
+### Configure kubectl to communicate with your cluster
+
+```
+gcloud container clusters get-credentials ${CLUSTER_NAME} --location=${LOCATION}
+```
+
+### Generate a new Hugging Face token if you don't already have one
+
+On the Hugging Face website, create an account if necessary, and go to Your
+Profile > Settings > Access Tokens.
+Select Create new token.
+Specify a name of your choice and a role with at least Read permissions.
+Select Generate a token, follow the prompts, and save your token.
+
+(NOTE: Also ensure that your account has access to the model on Hugging Face.
+Some models, such as Llama 3, are gated, and you will need to explicitly
+request permission.)
+
+### Run the benchmark
+In this directory, run:
+
+```
+helm install ${RUN_NAME} . --set hf_token=${HF_TOKEN}
+```
+
+The benchmark will launch a client pod and a server pod.
+
+On the server pod (in the default multi-host mode, the leader pod
+`${RUN_NAME}-tpu-server-0`), you’ll see logs such as these at the end of
+server startup:
+
+```
+$ kubectl logs -f ${RUN_NAME}-tpu-server-0
+
+(APIServer pid=1) INFO:     Started server process [1]
+(APIServer pid=1) INFO:     Waiting for application startup.
+(APIServer pid=1) INFO:     Application startup complete.
+```
+
+The client pod will wait until the server is up and then start the benchmark.
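+
+While you wait, you can confirm that both pods have been scheduled (pod names
+are derived from the Helm release name; the client pod, for example, is named
+`${RUN_NAME}-client`):
+
+```
+kubectl get pods
+```
+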
+On the client pod, you'll see logs such as:
+
+```
+$ kubectl logs -f ${RUN_NAME}-client
+
+============ Serving Benchmark Result ============
+Successful requests:                     10
+Failed requests:                         0
+Benchmark duration (s):                  xx
+Total input tokens:                      xxx
+Total generated tokens:                  xxx
+Request throughput (req/s):              xx
+Output token throughput (tok/s):         xxx
+Peak output token throughput (tok/s):    xxx
+Peak concurrent requests:                10.00
+Total Token throughput (tok/s):          xxx
+---------------Time to First Token----------------
+Mean TTFT (ms):                          xxx
+Median TTFT (ms):                        xxx
+P99 TTFT (ms):                           xxx
+-----Time per Output Token (excl. 1st token)------
+Mean TPOT (ms):                          xxx
+Median TPOT (ms):                        xxx
+P99 TPOT (ms):                           xxx
+---------------Inter-token Latency----------------
+Mean ITL (ms):                           xxx
+Median ITL (ms):                         xxx
+P99 ITL (ms):                            xxx
+==================================================
+```
+
+### Customizing the benchmark
+
+To change benchmark parameters, such as the model or the number of prompts,
+modify the settings in `values.yaml`.
+
+**IMPORTANT:** Before running the recipe, you must edit `values.yaml` and fill
+in the empty `gcp_service_account` value and the empty `--model=` flag in
+`server.bash_command` with your actual GCP service account and model path/ID.
+
+### Cleanup
+When you are done running the benchmark, the server will still be running. You
+can clean up by running:
+
+```
+helm uninstall ${RUN_NAME}
+```
diff --git a/inference/ironwood/vLLM/Qwen3-Coder-480B-A35B-Instruct/templates/benchmark.yaml b/inference/ironwood/vLLM/Qwen3-Coder-480B-A35B-Instruct/templates/benchmark.yaml
new file mode 100644
index 00000000..ae4b46e2
--- /dev/null
+++ b/inference/ironwood/vLLM/Qwen3-Coder-480B-A35B-Instruct/templates/benchmark.yaml
@@ -0,0 +1,261 @@
+# yamllint disable
+{{- if not .Values.is_multi_host }}
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+  name: "{{ .Release.Name }}-hyperdisk-balanced-tpu"
+provisioner: pd.csi.storage.gke.io
+parameters:
+  type: hyperdisk-balanced
+  # provisioned-iops: "3000" # Optional: Adjust IOPS as needed
+  # provisioned-throughput: "140" # Optional: Adjust Throughput (MiB/s) as needed
+reclaimPolicy: Delete
+volumeBindingMode: WaitForFirstConsumer
+allowVolumeExpansion: true
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: "{{ .Release.Name }}-hd-claim"
+spec:
+  storageClassName: "{{ .Release.Name }}-hyperdisk-balanced-tpu"
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: {{.Values.server.data_disk_size}}
+---
+{{- end }}
+apiVersion: v1
+kind: Secret
+metadata:
+  name: "{{ .Release.Name }}-hf-secret"
+type: Opaque
+stringData:
+  hf_api_token: {{.Values.hf_token}}
+---
+{{- if .Values.gcp_service_account }}
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: {{ .Values.k8s_service_account | quote }}
+  namespace: default
+  annotations:
+    iam.gke.io/gcp-service-account: {{ .Values.gcp_service_account }}
+{{- end }}
+---
+{{- if not .Values.is_multi_host }}
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: "{{ .Release.Name }}-tpu-server"
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: "{{ .Release.Name }}-tpu-pod"
+  template:
+    metadata:
+      labels:
+        app: "{{ .Release.Name }}-tpu-pod"
+    spec:
+      {{- if .Values.gcp_service_account }}
+      serviceAccountName: {{ .Values.k8s_service_account | quote }}
+      {{- end }}
+      nodeSelector:
+        cloud.google.com/gke-tpu-accelerator: tpu7x
+        cloud.google.com/gke-tpu-topology: {{.Values.server.topology}}
+      containers:
+      - name: vllm-tpu
+        image: {{.Values.server.image}}
+        command: ["/bin/bash", "-c"]
+        args:
+        - {{ .Values.server.bash_command | quote }}
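+        # This command string is rendered from .Values.server.bash_command in
+        # values.yaml; it starts the vLLM OpenAI-compatible API server on port
+        # 8000 and downloads model weights to /data (--download-dir).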
+        env:
+        - name: HF_HOME
+          value: /data
+        - name: HUGGING_FACE_HUB_TOKEN
+          valueFrom:
+            secretKeyRef:
+              name: "{{ .Release.Name }}-hf-secret"
+              key: hf_api_token
+        {{- if .Values.server.model_impl_type }}
+        - name: MODEL_IMPL_TYPE
+          value: {{.Values.server.model_impl_type}}
+        {{- end }}
+        ports:
+        - containerPort: 8000
+        resources:
+          limits:
+            google.com/tpu: "{{.Values.server.num_chips_per_node}}"
+          requests:
+            google.com/tpu: "{{.Values.server.num_chips_per_node}}"
+        readinessProbe:
+          tcpSocket:
+            port: 8000
+          initialDelaySeconds: 15
+          periodSeconds: 10
+        volumeMounts:
+        - mountPath: "/data"
+          name: data-volume
+        - mountPath: /dev/shm
+          name: dshm
+      volumes:
+      - emptyDir:
+          medium: Memory
+        name: dshm
+      - name: data-volume
+        persistentVolumeClaim:
+          claimName: "{{ .Release.Name }}-hd-claim"
+{{- else }}
+apiVersion: leaderworkerset.x-k8s.io/v1
+kind: LeaderWorkerSet
+metadata:
+  name: "{{ .Release.Name }}-tpu-server"
+  annotations:
+    leaderworkerset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool
+spec:
+  replicas: 1
+  leaderWorkerTemplate:
+    size: {{ .Values.server.num_nodes }}
+    restartPolicy: RecreateGroupOnPodRestart
+    leaderTemplate:
+      metadata:
+        labels:
+          role: leader
+          app: "{{ .Release.Name }}-tpu-pod"
+      spec:
+        {{- if .Values.gcp_service_account }}
+        serviceAccountName: {{ .Values.k8s_service_account | quote }}
+        {{- end }}
+        nodeSelector:
+          cloud.google.com/gke-tpu-accelerator: tpu7x
+          cloud.google.com/gke-tpu-topology: {{.Values.server.topology}}
+        containers:
+        - name: vllm-leader
+          image: {{.Values.server.image}}
+          env:
+          - name: HUGGING_FACE_HUB_TOKEN
+            valueFrom:
+              secretKeyRef:
+                name: "{{ .Release.Name }}-hf-secret"
+                key: hf_api_token
+          - name: TPU_MULTIHOST_BACKEND
+            value: ray
+          - name: TPU_BACKEND_TYPE
+            value: jax
+          - name: JAX_PLATFORMS
+            value: ""
+          {{- if .Values.server.model_impl_type }}
+          - name: MODEL_IMPL_TYPE
+            value: {{.Values.server.model_impl_type}}
+          {{- end }}
+          command: ["/bin/bash", "-c"]
+          args:
+          - {{ .Values.server.bash_command | quote }}
+          resources:
+            limits:
+              google.com/tpu: "{{.Values.server.num_chips_per_node}}"
+            requests:
+              google.com/tpu: "{{.Values.server.num_chips_per_node}}"
+          ports:
+          - containerPort: 8000
+          readinessProbe:
+            tcpSocket:
+              port: 8000
+            initialDelaySeconds: 15
+            periodSeconds: 10
+          volumeMounts:
+          - mountPath: /dev/shm
+            name: dshm
+        volumes:
+        - emptyDir:
+            medium: Memory
+          name: dshm
+    workerTemplate:
+      spec:
+        {{- if .Values.gcp_service_account }}
+        serviceAccountName: {{ .Values.k8s_service_account | quote }}
+        {{- end }}
+        nodeSelector:
+          cloud.google.com/gke-tpu-accelerator: tpu7x
+          cloud.google.com/gke-tpu-topology: {{.Values.server.topology}}
+        containers:
+        - name: vllm-worker
+          image: {{.Values.server.image}}
+          command:
+          - sh
+          - -c
+          - {{ .Values.server.worker_bash_command | quote }}
+          resources:
+            limits:
+              google.com/tpu: "{{.Values.server.num_chips_per_node}}"
+            requests:
+              google.com/tpu: "{{.Values.server.num_chips_per_node}}"
+          env:
+          - name: HUGGING_FACE_HUB_TOKEN
+            valueFrom:
+              secretKeyRef:
+                name: "{{ .Release.Name }}-hf-secret"
+                key: hf_api_token
+          - name: TPU_MULTIHOST_BACKEND
+            value: ray
+          - name: TPU_BACKEND_TYPE
+            value: jax
+          - name: JAX_PLATFORMS
+            value: ""
+          volumeMounts:
+          - mountPath: /dev/shm
+            name: dshm
+        volumes:
+        - emptyDir:
+            medium: Memory
+          name: dshm
+
+{{- end }}
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: "{{ .Release.Name }}-vllm-service"
+spec:
+  selector:
+    app: "{{ .Release.Name }}-tpu-pod"
+    {{- if .Values.is_multi_host }}
+    role: leader
+    {{- end }}
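+  # In multi-host mode only the leader pod runs the vLLM API server, so the
+  # selector above is narrowed to the pods labeled role=leader.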
+  type: LoadBalancer
+  ports:
+  - name: http
+    protocol: TCP
+    port: 8000
+    targetPort: 8000
+---
+apiVersion: v1
+kind: Pod
+metadata:
+  name: "{{ .Release.Name }}-client"
+spec:
+  {{- if .Values.gcp_service_account }}
+  serviceAccountName: {{ .Values.k8s_service_account | quote }}
+  {{- end }}
+  terminationGracePeriodSeconds: 60
+  containers:
+  - name: vllm-bench
+    image: {{.Values.client.image}}
+    command: ["/bin/bash", "-c"]
+    args: ["{{.Values.client.bash_command}}"]
+    env:
+    - name: HUGGING_FACE_HUB_TOKEN
+      valueFrom:
+        secretKeyRef:
+          key: hf_api_token
+          name: "{{ .Release.Name }}-hf-secret"
+    - name: SERVER_HOSTNAME
+      value: "{{ .Release.Name }}-vllm-service.default.svc.cluster.local"
+  restartPolicy: Never
diff --git a/inference/ironwood/vLLM/Qwen3-Coder-480B-A35B-Instruct/values.yaml b/inference/ironwood/vLLM/Qwen3-Coder-480B-A35B-Instruct/values.yaml
new file mode 100644
index 00000000..d5ba8bae
--- /dev/null
+++ b/inference/ironwood/vLLM/Qwen3-Coder-480B-A35B-Instruct/values.yaml
@@ -0,0 +1,59 @@
+"client":
+  "bash_command": |-
+    while ! curl http://${SERVER_HOSTNAME}:8000/ping; do sleep 30 && echo 'Waiting for server...'; done
+
+    vllm bench serve \
+      --dataset-name=random \
+      --random-input-len=512 \
+      --random-output-len=1024 \
+      --num-prompts=1000 \
+      --host=${SERVER_HOSTNAME} \
+      --port=8000 \
+      --model=Qwen/Qwen3-Coder-480B-A35B-Instruct \
+      --max-concurrency=64
+  "image": |-
+    vllm/vllm-tpu:nightly-ironwood-20260224-487a8c1-f91808a
+"gcp_service_account": |-
+
+"hf_token": !!null |-
+  null
+"is_multi_host": !!bool |-
+  true
+"k8s_service_account": |-
+  vllm-sa
+"server":
+  "bash_command": |-
+    bash /workspace/vllm/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
+    VLLM_DISABLE_SHARED_EXPERTS_STREAM=1 python3 -m vllm.entrypoints.openai.api_server \
+      --host=0.0.0.0 \
+      --port=8000 \
+      --download-dir=/data \
+      --tensor-parallel-size=16 \
+      --max-model-len=4096 \
+      --load-format=runai_streamer \
+      --kv-cache-dtype=fp8 \
+      --gpu-memory-utilization=0.93 \
+      --data-parallel-size=1 \
+      --max-num-batched-tokens=2048 \
+      --max-num-seqs=512 \
+      --model= \
+      --served-model-name=Qwen/Qwen3-Coder-480B-A35B-Instruct \
+      --no-enable-prefix-caching \
+      --no-async-scheduling \
+      --enable-expert-parallel
+  "data_disk_size": |-
+    100Gi
+  "image": |-
+    vllm/vllm-tpu:nightly-ironwood-20260224-487a8c1-f91808a
+  "model_impl_type": |-
+    vllm
+  "num_chips_per_node": !!int |-
+    4
+  "num_nodes": !!int |-
+    2
+  "topology": |-
+    2x2x2
+  "worker_bash_command": |-
+    bash /workspace/vllm/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)
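+# NOTE: the empty --model= flag in server.bash_command above must be filled in
+# with your model path or ID before deploying; see the README for details.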