From 7a3bc942eb86992f811a17f832aa254a272b261b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20Sok=C3=B3=C5=82?= <mat646@gmail.com>
Date: Tue, 19 May 2026 16:44:09 +0200
Subject: [PATCH] Include documented TPU recipes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Mateusz Sokół <mat646@gmail.com>
---
 .../Qwen/Qwen3-Coder-480B-A35B-Instruct.yaml   |  4 ++++
 models/meta-llama/Llama-3.3-70B-Instruct.yaml  | 18 ++++++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/models/Qwen/Qwen3-Coder-480B-A35B-Instruct.yaml b/models/Qwen/Qwen3-Coder-480B-A35B-Instruct.yaml
index e70f6590..5d6d0ff0 100644
--- a/models/Qwen/Qwen3-Coder-480B-A35B-Instruct.yaml
+++ b/models/Qwen/Qwen3-Coder-480B-A35B-Instruct.yaml
@@ -14,6 +14,7 @@ meta:
     mi300x: verified
     mi325x: verified
     mi355x: verified
+    ironwood: verified
 
 model:
   model_id: "Qwen/Qwen3-Coder-480B-A35B-Instruct"
@@ -92,6 +93,8 @@ guide: |
 
   [Qwen3-Coder](https://github.com/QwenLM/Qwen3-Coder) is an advanced large language model created by the Qwen team. `Qwen3-Coder-480B-A35B-Instruct` is the flagship coder MoE with 480B total / 35B active parameters. vLLM supports it including tool calling; the guide below covers BF16 and FP8 serving on NVIDIA and AMD GPUs.
 
+  TPU support is provided through [vLLM TPU](https://github.com/vllm-project/tpu-inference) with a recipe for [Ironwood](https://github.com/AI-Hypercomputer/tpu-recipes/tree/main/inference/ironwood/vLLM/Qwen3-Coder-480B-A35B). The Ironwood Docker command rendered by the hardware picker uses the `vllm/vllm-tpu` image; pin it to the tag specified by the upstream recipe.
+
   ## Prerequisites
 
   ### CUDA
@@ -188,3 +191,4 @@ guide: |
   - [FP8 checkpoint](https://huggingface.co/Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8)
   - [Qwen3-Coder GitHub](https://github.com/QwenLM/Qwen3-Coder)
   - [EvalPlus](https://github.com/evalplus/evalplus)
+  - [TPU recipes: Ironwood](https://github.com/AI-Hypercomputer/tpu-recipes/tree/main/inference/ironwood/vLLM/Qwen3-Coder-480B-A35B)
diff --git a/models/meta-llama/Llama-3.3-70B-Instruct.yaml b/models/meta-llama/Llama-3.3-70B-Instruct.yaml
index 4bac6dbf..3de01858 100644
--- a/models/meta-llama/Llama-3.3-70B-Instruct.yaml
+++ b/models/meta-llama/Llama-3.3-70B-Instruct.yaml
@@ -15,6 +15,7 @@ meta:
     h200: verified
     b200: verified
     gb200: verified
+    trillium: verified
 
 model:
   model_id: "meta-llama/Llama-3.3-70B-Instruct"
@@ -92,6 +93,8 @@ guide: |
   and Blackwell (B200/GB200) GPUs. FP4 is Blackwell-only and provides the best
   VRAM efficiency.
 
+  TPU support is provided through [vLLM TPU](https://github.com/vllm-project/tpu-inference) with a recipe for [Trillium](https://github.com/AI-Hypercomputer/tpu-recipes/tree/main/inference/trillium/vLLM/Llama3.x).
+
   ## Prerequisites
 
   - Hardware: 1x H100/H200 (FP8), 1x B200 (FP4), or 2x GPUs for BF16
@@ -99,6 +102,20 @@ guide: |
   - CUDA Driver >= 575
   - Docker with NVIDIA Container Toolkit (recommended)
 
+  ### Docker (Cloud TPU — Trillium)
+  TPU uses the separate `vllm/vllm-tpu` image (no pip wheel). Pull the tag specified by the upstream [Trillium recipe](https://github.com/AI-Hypercomputer/tpu-recipes/tree/main/inference/trillium/vLLM/Llama3.x), substitute that tag for `latest` in the command below, then run:
+  ```bash
+  docker run -itd --name llama33-tpu \
+    --privileged --network host --shm-size 16G \
+    -v /dev/shm:/dev/shm -e HF_TOKEN=$HF_TOKEN \
+    vllm/vllm-tpu:latest \
+      --model meta-llama/Llama-3.3-70B-Instruct \
+      --tensor-parallel-size 8 \
+      --max-model-len 16384 \
+      --host 0.0.0.0 --port 8000
+  ```
+  Trillium requires a 4-chip slice minimum; the command above sets `--tensor-parallel-size 8` and therefore needs a slice with 8 chips.
+
   ## Client Usage
 
   ```python
@@ -125,3 +142,4 @@ guide: |
   - [Llama-3.3-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct)
   - [NVIDIA FP8 variant](https://huggingface.co/nvidia/Llama-3.3-70B-Instruct-FP8)
   - [NVIDIA FP4 variant](https://huggingface.co/nvidia/Llama-3.3-70B-Instruct-FP4)
+  - [TPU recipes: Trillium](https://github.com/AI-Hypercomputer/tpu-recipes/tree/main/inference/trillium/vLLM/Llama3.x)