From 7a3bc942eb86992f811a17f832aa254a272b261b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Sok=C3=B3=C5=82?= Date: Tue, 19 May 2026 16:44:09 +0200 Subject: [PATCH] Include documented TPU recipes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Mateusz Sokół --- .../Qwen/Qwen3-Coder-480B-A35B-Instruct.yaml | 4 ++++ models/meta-llama/Llama-3.3-70B-Instruct.yaml | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/models/Qwen/Qwen3-Coder-480B-A35B-Instruct.yaml b/models/Qwen/Qwen3-Coder-480B-A35B-Instruct.yaml index e70f6590..5d6d0ff0 100644 --- a/models/Qwen/Qwen3-Coder-480B-A35B-Instruct.yaml +++ b/models/Qwen/Qwen3-Coder-480B-A35B-Instruct.yaml @@ -14,6 +14,7 @@ meta: mi300x: verified mi325x: verified mi355x: verified + ironwood: verified model: model_id: "Qwen/Qwen3-Coder-480B-A35B-Instruct" @@ -92,6 +93,8 @@ guide: | [Qwen3-Coder](https://github.com/QwenLM/Qwen3-Coder) is an advanced large language model created by the Qwen team. `Qwen3-Coder-480B-A35B-Instruct` is the flagship coder MoE with 480B total / 35B active parameters. vLLM supports it including tool calling; the guide below covers BF16 and FP8 serving on NVIDIA and AMD GPUs. + TPU support is provided through [vLLM TPU](https://github.com/vllm-project/tpu-inference) with a recipe for [Ironwood](https://github.com/AI-Hypercomputer/tpu-recipes/tree/main/inference/ironwood/vLLM/Qwen3-Coder-480B-A35B). The Ironwood docker command rendered by the hardware picker uses the `vllm/vllm-tpu` image; pin to the tag specified by the upstream recipe. 
+ ## Prerequisites ### CUDA @@ -188,3 +191,4 @@ guide: | - [FP8 checkpoint](https://huggingface.co/Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8) - [Qwen3-Coder GitHub](https://github.com/QwenLM/Qwen3-Coder) - [EvalPlus](https://github.com/evalplus/evalplus) + - [TPU recipes: Ironwood](https://github.com/AI-Hypercomputer/tpu-recipes/tree/main/inference/ironwood/vLLM/Qwen3-Coder-480B-A35B) diff --git a/models/meta-llama/Llama-3.3-70B-Instruct.yaml b/models/meta-llama/Llama-3.3-70B-Instruct.yaml index 4bac6dbf..3de01858 100644 --- a/models/meta-llama/Llama-3.3-70B-Instruct.yaml +++ b/models/meta-llama/Llama-3.3-70B-Instruct.yaml @@ -15,6 +15,7 @@ meta: h200: verified b200: verified gb200: verified + trillium: verified model: model_id: "meta-llama/Llama-3.3-70B-Instruct" @@ -92,6 +93,8 @@ guide: | and Blackwell (B200/GB200) GPUs. FP4 is Blackwell-only and provides the best VRAM efficiency. + TPU support is provided through [vLLM TPU](https://github.com/vllm-project/tpu-inference) with a recipe for [Trillium](https://github.com/AI-Hypercomputer/tpu-recipes/tree/main/inference/trillium/vLLM/Llama3.x). + ## Prerequisites - Hardware: 1x H100/H200 (FP8), 1x B200 (FP4), or 2x GPUs for BF16 @@ -99,6 +102,20 @@ guide: | - CUDA Driver >= 575 - Docker with NVIDIA Container Toolkit (recommended) + ### Docker (Cloud TPU — Trillium) + TPU uses the separate `vllm/vllm-tpu` image (no pip wheel). The command below uses the floating `latest` tag as a placeholder; replace it with the tag specified by the upstream [Trillium recipe](https://github.com/AI-Hypercomputer/tpu-recipes/tree/main/inference/trillium/vLLM/Llama3.x) so the deployment is reproducible, then run: + ```bash + docker run -itd --name llama33-tpu \ + --privileged --network host --shm-size 16G \ + -v /dev/shm:/dev/shm -e HF_TOKEN=$HF_TOKEN \ + vllm/vllm-tpu:latest \ + --model meta-llama/Llama-3.3-70B-Instruct \ + --tensor-parallel-size 8 \ + --max-model-len 16384 \ + --host 0.0.0.0 --port 8000 + ``` + Trillium requires a 4-chip slice minimum. The command above sets `--tensor-parallel-size 8`, so it expects a slice with 8 chips; `--tensor-parallel-size` must not exceed the number of chips in your slice. 
+ ## Client Usage ```python @@ -125,3 +142,4 @@ guide: | - [Llama-3.3-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) - [NVIDIA FP8 variant](https://huggingface.co/nvidia/Llama-3.3-70B-Instruct-FP8) - [NVIDIA FP4 variant](https://huggingface.co/nvidia/Llama-3.3-70B-Instruct-FP4) + - [TPU recipes: Trillium](https://github.com/AI-Hypercomputer/tpu-recipes/tree/main/inference/trillium/vLLM/Llama3.x)