From c887b5ec9a638de31161e8404e510bdd242bbc9f Mon Sep 17 00:00:00 2001
From: wuhuikx <hattie.wu@amd.com>
Date: Fri, 1 May 2026 05:17:20 -0500
Subject: [PATCH 01/10] Add DeepSeek-V4-Pro support on MI355X

Signed-off-by: wuhuikx <hattie.wu@amd.com>
---
 DeepSeek/DeepSeek-V4-AMD.md             | 243 ++++++++++++++++++++++++
 models/deepseek-ai/DeepSeek-V4-Pro.yaml |  25 ++-
 2 files changed, 266 insertions(+), 2 deletions(-)
 create mode 100644 DeepSeek/DeepSeek-V4-AMD.md

diff --git a/DeepSeek/DeepSeek-V4-AMD.md b/DeepSeek/DeepSeek-V4-AMD.md
new file mode 100644
index 00000000..2d4073fd
--- /dev/null
+++ b/DeepSeek/DeepSeek-V4-AMD.md
@@ -0,0 +1,243 @@
+# DeepSeek-V4 on AMD (ROCm) Usage Guide
+
+This recipe mirrors the official DeepSeek-V4 recipe structure and is adapted for AMD ROCm based on [vllm-project/vllm#40871](https://github.com/vllm-project/vllm/pull/40871).
+
+## Scope
+
+This guide covers:
+
+- DeepSeek-V4-Flash on MI355X (online serving)
+- DeepSeek-V4-Pro on MI355X (offline + online serving)
+- Reasoning mode usage
+- Tool calling flags
+- MTP speculative decoding (experimental recommendation)
+
+## Environment and Version
+
+At the time of writing, AMD DeepSeek-V4 support is under review upstream, so use the PR branch build:
+
+```bash
+# inside ROCm container
+pip uninstall -y vllm
+git clone https://github.com/vllm-project/vllm.git
+cd vllm
+git fetch origin pull/40871/head:pr_dsv4
+git checkout pr_dsv4
+python3 setup.py develop
+```
+
+Reference runtime used in PR validation:
+
+- Docker image: `rocm/vllm-dev:nightly_main_20260423`
+- Hardware: `MI355X`
+
+## DeepSeek-V4-Flash (MI355X)
+
+### Launch
+
+```bash
+max_num_seqs=16
+max_num_batched_tokens=1024
+tensor_parallel_size=4
+
+export HF_HOME=/data/huggingface-cache
+export VLLM_ROCM_USE_AITER=1
+export VLLM_TORCH_PROFILER_DIR=/app/vllm_profile
+
+MODEL=/home/models/DeepSeek-V4-Flash
+vllm serve ${MODEL} \
+  --host localhost \
+  --port 8001 \
+  --dtype auto \
+  --tensor-parallel-size ${tensor_parallel_size} \
+  --max-num-seqs ${max_num_seqs} \
+  --max-num-batched-tokens ${max_num_batched_tokens} \
+  --distributed-executor-backend mp \
+  --trust-remote-code \
+  --profiler-config '{"profiler":"torch","torch_profiler_dir":"./vllm_profile"}' \
+  --gpu-memory-utilization 0.35 \
+  --moe-backend triton_unfused \
+  --tokenizer-mode deepseek_v4 \
+  --async-scheduling \
+  --enforce-eager
+```
+
+### Smoke test
+
+```bash
+curl -s http://localhost:8001/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "prompt": "Write me a poem about AMD and DeepSeek.",
+    "model": "/home/models/DeepSeek-V4-Flash",
+    "max_tokens": 100,
+    "temperature": 0.0
+  }'
+```
+
+### Accuracy check (GSM8K, from PR)
+
+```bash
+MODEL=/home/models/DeepSeek-V4-Flash
+lm_eval --model local-completions \
+  --model_args model=$MODEL,base_url=http://0.0.0.0:8001/v1/completions,num_concurrent=4,max_retries=10,max_gen_toks=2048,timeout=60000 \
+  --batch_size auto \
+  --tasks gsm8k \
+  --num_fewshot 8 \
+  --output_path .
+```
+
+Reported result:
+
+- `flexible-extract exact_match`: `0.9439`
+- `strict-match exact_match`: `0.9431`
+
+## DeepSeek-V4-Pro (MI355X)
+
+### Offline validation
+
+```python
+import os
+from vllm import LLM, SamplingParams
+
+os.environ["VLLM_ROCM_USE_AITER"] = "1"
+os.environ["VLLM_ROCM_USE_AITER_LINEAR"] = "1"
+
+prompts = ["What is 2+2? Answer:", "The capital of France is "]
+sampling_params = SamplingParams(temperature=0, top_p=1, max_tokens=20)
+
+llm = LLM(
+    model="/home/models/DeepSeek-V4-Pro",
+    tensor_parallel_size=8,
+    kv_cache_dtype="fp8",
+    gpu_memory_utilization=0.6,
+    async_scheduling=True,
+    enforce_eager=True,
+    disable_log_stats=False,
+    tokenizer_mode="deepseek_v4",
+    moe_backend="triton_unfused",
+    reasoning_parser="deepseek_v4",
+)
+
+outputs = llm.generate(prompts, sampling_params)
+for output in outputs:
+    print(output.prompt, output.outputs[0].text)
+```
+
+### Online serving
+
+```bash
+max_num_seqs=128
+max_num_batched_tokens=8192
+tensor_parallel_size=8
+
+export HF_HOME=/data/huggingface-cache
+export VLLM_ROCM_USE_AITER=1
+export VLLM_ROCM_USE_AITER_LINEAR=1
+rm -rf /root/.cache/vllm/torch_compile_cache
+
+MODEL=/home/models/DeepSeek-V4-Pro
+vllm serve ${MODEL} \
+  --host localhost \
+  --port 8001 \
+  --dtype auto \
+  --kv-cache-dtype fp8 \
+  --tensor-parallel-size ${tensor_parallel_size} \
+  --max-num-seqs ${max_num_seqs} \
+  --max-num-batched-tokens ${max_num_batched_tokens} \
+  --distributed-executor-backend mp \
+  --trust-remote-code \
+  --gpu-memory-utilization 0.6 \
+  --moe-backend triton_unfused \
+  --tokenizer-mode deepseek_v4 \
+  --reasoning-parser deepseek_v4 \
+  --async-scheduling \
+  --enforce-eager
+```
+
+### Accuracy check (GSM8K, from PR)
+
+```bash
+MODEL=/home/models/DeepSeek-V4-Pro
+lm_eval --model local-completions \
+  --model_args model=$MODEL,base_url=http://0.0.0.0:8001/v1/completions,num_concurrent=2,max_retries=10,max_gen_toks=2048,timeout=60000 \
+  --batch_size auto \
+  --tasks gsm8k \
+  --num_fewshot 8 \
+  --output_path .
+```
+
+Reported result:
+
+- `flexible-extract exact_match`: `0.9538`
+- `strict-match exact_match`: `0.9545`
+
+## Reasoning modes
+
+DeepSeek-V4 exposes non-think / think-high / think-max via `chat_template_kwargs`.
+
+```python
+from openai import OpenAI
+
+client = OpenAI(base_url="http://localhost:8001/v1", api_key="EMPTY")
+model = "deepseek-ai/DeepSeek-V4-Pro"
+messages = [{"role": "user", "content": "What is 17*19? Return only the final integer."}]
+
+# Non-think
+client.chat.completions.create(model=model, messages=messages)
+
+# Think high
+client.chat.completions.create(
+    model=model,
+    messages=messages,
+    extra_body={"chat_template_kwargs": {"thinking": True, "reasoning_effort": "high"}},
+)
+
+# Think max (ensure sufficient max-model-len)
+client.chat.completions.create(
+    model=model,
+    messages=messages,
+    extra_body={"chat_template_kwargs": {"thinking": True, "reasoning_effort": "max"}},
+)
+```
+
+## Tool calling
+
+Add these arguments to your serve command:
+
+```bash
+--tokenizer-mode deepseek_v4 \
+--tool-call-parser deepseek_v4 \
+--enable-auto-tool-choice
+```
+
+## Speculative decoding (MTP)
+
+DeepSeek-V4 has native MTP support. On AMD, start conservatively and tune:
+
+```bash
+--speculative-config '{"method":"mtp","num_speculative_tokens":1}'
+```
+
+If memory/throughput allows, test:
+
+```bash
+--speculative-config '{"method":"mtp","num_speculative_tokens":2}'
+```
+
+## ROCm-specific notes from PR #40871
+
+- ROCm path includes DeepSeek-V4 FP8 compatibility updates and E8M0 scale handling.
+- ROCm execution disables some multi-stream paths to avoid known hang scenarios.
+- For DeepSeek-V4 routing mode, `triton_unfused` is preferred for accuracy, with AITER as fallback.
+
+## Troubleshooting
+
+1. **`NotImplementedError: "mul_cuda" not implemented for 'Float8_e8m0fnu'`**
+   - Ensure you are using the PR build above (or a newer commit that includes ROCm E8M0 handling fixes).
+2. **Model hangs during startup/load**
+   - Keep `--enforce-eager` enabled.
+   - Use `--moe-backend triton_unfused` on AMD.
+3. **Tokenizer / reasoning mismatch**
+   - Verify `--tokenizer-mode deepseek_v4` and `--reasoning-parser deepseek_v4` are both set.
+
diff --git a/models/deepseek-ai/DeepSeek-V4-Pro.yaml b/models/deepseek-ai/DeepSeek-V4-Pro.yaml
index 8312008d..c2436e04 100644
--- a/models/deepseek-ai/DeepSeek-V4-Pro.yaml
+++ b/models/deepseek-ai/DeepSeek-V4-Pro.yaml
@@ -3,7 +3,7 @@ meta:
   slug: "deepseek-v4-pro"
   provider: "DeepSeek"
   description: "DeepSeek V4 flagship MoE (1.6T total / 49B active) with hybrid CSA+HCA attention, manifold-constrained hyper-connections, Muon-trained on 32T+ tokens, and three-tier reasoning."
-  date_updated: 2026-04-24
+  date_updated: 2026-05-01
   difficulty: hard
   tasks:
     - text
@@ -17,7 +17,7 @@ meta:
     gb300: verified
     mi300x: unsupported
     mi325x: unsupported
-    mi355x: unsupported
+    mi355x: verified
 
 model:
   model_id: "deepseek-ai/DeepSeek-V4-Pro"
@@ -109,6 +109,23 @@ hardware_overrides:
       - "--attention_config.use_fp4_indexer_cache=True"
       - "--moe-backend"
       - "deep_gemm_mega_moe"
+  amd:
+    extra_args:
+      - "--distributed-executor-backend"
+      - "mp"
+      - "--gpu-memory-utilization"
+      - "0.6"
+      - "--max-num-seqs"
+      - "128"
+      - "--max-num-batched-tokens"
+      - "8192"
+      - "--moe-backend"
+      - "triton_unfused"
+      - "--async-scheduling"
+      - "--enforce-eager"
+    extra_env:
+      VLLM_ROCM_USE_AITER: "1"
+      VLLM_ROCM_USE_AITER_LINEAR: "1"
 
 strategy_overrides:
   single_node_tp:
@@ -252,6 +269,10 @@ guide: |
   - **H200 (8× GPU)**: DP + EP with `--data-parallel-size 8`. Context is capped at
     800K tokens (`--max-model-len 800000`) to leave KV headroom with dense params
     replicated across ranks — applies to both single-node and multi-node H200.
+  - **MI355X (8× GPU)**: validated with ROCm + AITER (`VLLM_ROCM_USE_AITER=1`,
+    `VLLM_ROCM_USE_AITER_LINEAR=1`), `--moe-backend triton_unfused`,
+    `--gpu-memory-utilization 0.6`, `--max-num-seqs 128`,
+    `--max-num-batched-tokens 8192`, and `--distributed-executor-backend mp`.
   - **GB200 NVL4 (4× GPU per tray)**: the ~960 GB mixed-precision checkpoint does not
     fit on one tray; run multi-node DP + EP across **2 trays** (8 GPUs total) with
     `--data-parallel-size 8`. Pick the "Multi-Node" tab and set nodes to 2.

From fcd1e7e18d0c4086f95ee33ce64dfcbabcbb6206 Mon Sep 17 00:00:00 2001
From: wuhuikx <hattie.wu@amd.com>
Date: Fri, 1 May 2026 05:24:01 -0500
Subject: [PATCH 02/10] Update DeepSeek-V4-Flash support on MI355X

Signed-off-by: wuhuikx <hattie.wu@amd.com>
---
 models/deepseek-ai/DeepSeek-V4-Flash.yaml | 25 +++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/models/deepseek-ai/DeepSeek-V4-Flash.yaml b/models/deepseek-ai/DeepSeek-V4-Flash.yaml
index 6bf204ed..b1064048 100644
--- a/models/deepseek-ai/DeepSeek-V4-Flash.yaml
+++ b/models/deepseek-ai/DeepSeek-V4-Flash.yaml
@@ -3,7 +3,7 @@ meta:
   slug: "deepseek-v4-flash"
   provider: "DeepSeek"
   description: "DeepSeek V4 MoE model with hybrid CSA+HCA attention, manifold-constrained hyper-connections, and three-tier reasoning (Non-think / Think High / Think Max)."
-  date_updated: 2026-04-24
+  date_updated: 2026-05-01
   difficulty: hard
   tasks:
     - text
@@ -17,7 +17,7 @@ meta:
     gb300: verified
     mi300x: unsupported
     mi325x: unsupported
-    mi355x: unsupported
+    mi355x: verified
 
 model:
   model_id: "deepseek-ai/DeepSeek-V4-Flash"
@@ -91,6 +91,22 @@ hardware_overrides:
       - "--attention_config.use_fp4_indexer_cache=True"
       - "--moe-backend"
       - "deep_gemm_mega_moe"
+  amd:
+    extra_args:
+      - "--distributed-executor-backend"
+      - "mp"
+      - "--gpu-memory-utilization"
+      - "0.35"
+      - "--max-num-seqs"
+      - "16"
+      - "--max-num-batched-tokens"
+      - "1024"
+      - "--moe-backend"
+      - "triton_unfused"
+      - "--async-scheduling"
+      - "--enforce-eager"
+    extra_env:
+      VLLM_ROCM_USE_AITER: "1"
 
 strategy_overrides:
   single_node_tp:
@@ -228,6 +244,11 @@ guide: |
   replica on H200/B200/B300 (leaving headroom for throughput-vs-latency tuning).
   For disaggregated prefill/decode on GB200, use the PD Cluster tab.
 
+  On **MI355X (8×288GB)**, validation used ROCm + AITER (`VLLM_ROCM_USE_AITER=1`),
+  `--distributed-executor-backend mp`, `--gpu-memory-utilization 0.35`,
+  `--max-num-seqs 16`, `--max-num-batched-tokens 1024`,
+  `--moe-backend triton_unfused`, `--async-scheduling`, and `--enforce-eager`.
+
   ### H200 Single-Node PD (Mooncake)
 
   Single-host disaggregated serving: 4 prefill GPUs + 4 decode GPUs on one 8-GPU H200 node,

From 2c8b04b9f7fc4c22ea0e0a87fb4f749172b4f479 Mon Sep 17 00:00:00 2001
From: wuhuikx <hattie.wu@amd.com>
Date: Fri, 1 May 2026 05:33:43 -0500
Subject: [PATCH 03/10] Update the feature matrix

Signed-off-by: wuhuikx <hattie.wu@amd.com>
---
 DeepSeek/DeepSeek-V4-AMD.md | 299 +++++++++++++-----------------------
 1 file changed, 108 insertions(+), 191 deletions(-)

diff --git a/DeepSeek/DeepSeek-V4-AMD.md b/DeepSeek/DeepSeek-V4-AMD.md
index 2d4073fd..529538cf 100644
--- a/DeepSeek/DeepSeek-V4-AMD.md
+++ b/DeepSeek/DeepSeek-V4-AMD.md
@@ -1,150 +1,131 @@
 # DeepSeek-V4 on AMD (ROCm) Usage Guide
 
-This recipe mirrors the official DeepSeek-V4 recipe structure and is adapted for AMD ROCm based on [vllm-project/vllm#40871](https://github.com/vllm-project/vllm/pull/40871).
+This page is aligned with the DeepSeek-V4-Pro recipe layout on recipes.vllm.ai and
+captures the AMD MI355X validated settings from [vllm-project/vllm#40871](https://github.com/vllm-project/vllm/pull/40871).
 
-## Scope
+## Overview
 
-This guide covers:
+DeepSeek-V4-Pro is the flagship of the V4 preview family: a 1.6T-total / 49B-active
+Mixture-of-Experts model. It pairs a **hybrid attention stack** — Compressed Sparse
+Attention (CSA) + Heavily Compressed Attention (HCA) — with **Manifold-Constrained
+Hyper-Connections (mHC)** to reach 27% of V3.2's per-token inference FLOPs and 10% of
+V3.2's KV cache at 1M context. Pre-trained on 32T+ tokens with the **Muon optimizer**
+for faster convergence; post-training is a two-stage pipeline (domain-specific expert
+cultivation + unified consolidation via on-policy distillation).
 
-- DeepSeek-V4-Flash on MI355X (online serving)
-- DeepSeek-V4-Pro on MI355X (offline + online serving)
-- Reasoning mode usage
-- Tool calling flags
-- MTP speculative decoding (experimental recommendation)
+Checkpoint is **FP4+FP8 mixed**: MoE expert weights are stored in FP4 while the
+remaining (attention / norm / router) params stay in FP8.
 
-## Environment and Version
+## Reasoning modes
 
-At the time of writing, AMD DeepSeek-V4 support is under review upstream, so use the PR branch build:
+The chat template exposes three reasoning-effort modes:
 
-```bash
-# inside ROCm container
-pip uninstall -y vllm
-git clone https://github.com/vllm-project/vllm.git
-cd vllm
-git fetch origin pull/40871/head:pr_dsv4
-git checkout pr_dsv4
-python3 setup.py develop
-```
+- **Non-think** — fast, intuitive responses.
+- **Think High** — explicit chain-of-thought for complex problem-solving and planning.
+- **Think Max** — maximum reasoning effort; requires `--max-model-len >= 393216`
+  (384K tokens) to avoid truncation.
 
-Reference runtime used in PR validation:
+Recommended sampling: `temperature = 1.0`, `top_p = 1.0`.
 
-- Docker image: `rocm/vllm-dev:nightly_main_20260423`
-- Hardware: `MI355X`
+### OpenAI Client Example
 
-## DeepSeek-V4-Flash (MI355X)
+For DeepSeek-V4, keep reasoning controls in `chat_template_kwargs`, as it exposes a
+custom **Think Max** mode via `"reasoning_effort": "max"`.
 
-### Launch
+```python
+from openai import OpenAI
 
-```bash
-max_num_seqs=16
-max_num_batched_tokens=1024
-tensor_parallel_size=4
+client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
+model = "deepseek-ai/DeepSeek-V4-Pro"
+messages = [{"role": "user", "content": "What is 17*19? Return only the final integer."}]
 
-export HF_HOME=/data/huggingface-cache
-export VLLM_ROCM_USE_AITER=1
-export VLLM_TORCH_PROFILER_DIR=/app/vllm_profile
+# Non-think
+resp = client.chat.completions.create(
+    model=model,
+    messages=messages,
+)
 
-MODEL=/home/models/DeepSeek-V4-Flash
-vllm serve ${MODEL} \
-  --host localhost \
-  --port 8001 \
-  --dtype auto \
-  --tensor-parallel-size ${tensor_parallel_size} \
-  --max-num-seqs ${max_num_seqs} \
-  --max-num-batched-tokens ${max_num_batched_tokens} \
-  --distributed-executor-backend mp \
-  --trust-remote-code \
-  --profiler-config '{"profiler":"torch","torch_profiler_dir":"./vllm_profile"}' \
-  --gpu-memory-utilization 0.35 \
-  --moe-backend triton_unfused \
-  --tokenizer-mode deepseek_v4 \
-  --async-scheduling \
-  --enforce-eager
+# Think High
+resp = client.chat.completions.create(
+    model=model,
+    messages=messages,
+    extra_body={
+        "chat_template_kwargs": {
+            "thinking": True,
+            "reasoning_effort": "high",
+        },
+    },
+)
+
+# Think Max
+resp = client.chat.completions.create(
+    model=model,
+    messages=messages,
+    extra_body={
+        "chat_template_kwargs": {
+            "thinking": True,
+            "reasoning_effort": "max",
+        },
+    },
+)
 ```
 
-### Smoke test
+## Recommended deployments
 
-```bash
-curl -s http://localhost:8001/v1/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "prompt": "Write me a poem about AMD and DeepSeek.",
-    "model": "/home/models/DeepSeek-V4-Flash",
-    "max_tokens": 100,
-    "temperature": 0.0
-  }'
-```
+- **B300 (8× GPU)**: single-node DP + EP with `--data-parallel-size 8`.
+- **H200 (8× GPU)**: DP + EP with `--data-parallel-size 8`. Context is capped at
+  800K tokens (`--max-model-len 800000`) to leave KV headroom with dense params
+  replicated across ranks — applies to both single-node and multi-node H200.
+- **MI355X (8× GPU)**: validated with ROCm + AITER
+  (`VLLM_ROCM_USE_AITER=1`, `VLLM_ROCM_USE_AITER_LINEAR=1`), `--moe-backend triton_unfused`,
+  `--gpu-memory-utilization 0.6`, `--max-num-seqs 128`,
+  `--max-num-batched-tokens 8192`, and `--distributed-executor-backend mp`.
+- **GB200 NVL4 (4× GPU per tray)**: the ~960 GB mixed-precision checkpoint does not
+  fit on one tray; run multi-node DP + EP across **2 trays** (8 GPUs total) with
+  `--data-parallel-size 8`. Pick the "Multi-Node" tab and set nodes to 2.
 
-### Accuracy check (GSM8K, from PR)
+## Feature matrix
 
-```bash
-MODEL=/home/models/DeepSeek-V4-Flash
-lm_eval --model local-completions \
-  --model_args model=$MODEL,base_url=http://0.0.0.0:8001/v1/completions,num_concurrent=4,max_retries=10,max_gen_toks=2048,timeout=60000 \
-  --batch_size auto \
-  --tasks gsm8k \
-  --num_fewshot 8 \
-  --output_path .
-```
+The table below is a static equivalent of the interactive matrix shown on
+recipes.vllm.ai (hardware / variant / strategy / features).
 
-Reported result:
+| Model | Hardware | Variant | Recommended strategies | Tool calling | Reasoning | Spec decoding |
+| --- | --- | --- | --- | --- | --- | --- |
+| DeepSeek-V4-Pro | MI355X (8x288GB) | FP8 (~960GB) | Tensor+Expert Parallel, Data+Expert Parallel | Yes (`deepseek_v4`) | Yes (`deepseek_v4`) | Yes (`mtp`) |
+| DeepSeek-V4-Flash | MI355X (8x288GB) | FP8 (~170GB) | Tensor+Expert Parallel, Data+Expert Parallel | Yes (`deepseek_v4`) | Yes (`deepseek_v4`) | Yes (`mtp`) |
 
-- `flexible-extract exact_match`: `0.9439`
-- `strict-match exact_match`: `0.9431`
+### MI355X recommended presets
 
-## DeepSeek-V4-Pro (MI355X)
+| Model | TP | Max num seqs | Max batched tokens | GPU memory utilization | Key ROCm env |
+| --- | --- | ---: | ---: | ---: | --- |
+| DeepSeek-V4-Pro | 8 | 128 | 8192 | 0.6 | `VLLM_ROCM_USE_AITER=1`, `VLLM_ROCM_USE_AITER_LINEAR=1` |
+| DeepSeek-V4-Flash | 4 | 16 | 1024 | 0.35 | `VLLM_ROCM_USE_AITER=1` |
 
-### Offline validation
+### Feature toggles
 
-```python
-import os
-from vllm import LLM, SamplingParams
-
-os.environ["VLLM_ROCM_USE_AITER"] = "1"
-os.environ["VLLM_ROCM_USE_AITER_LINEAR"] = "1"
-
-prompts = ["What is 2+2? Answer:", "The capital of France is "]
-sampling_params = SamplingParams(temperature=0, top_p=1, max_tokens=20)
-
-llm = LLM(
-    model="/home/models/DeepSeek-V4-Pro",
-    tensor_parallel_size=8,
-    kv_cache_dtype="fp8",
-    gpu_memory_utilization=0.6,
-    async_scheduling=True,
-    enforce_eager=True,
-    disable_log_stats=False,
-    tokenizer_mode="deepseek_v4",
-    moe_backend="triton_unfused",
-    reasoning_parser="deepseek_v4",
-)
+| Feature | Server args |
+| --- | --- |
+| Tool Calling | `--tokenizer-mode deepseek_v4 --tool-call-parser deepseek_v4 --enable-auto-tool-choice` |
+| Reasoning | `--reasoning-parser deepseek_v4` |
+| Spec Decoding | `--speculative-config '{"method":"mtp","num_speculative_tokens":1}'` (start) / `2` (tune) |
 
-outputs = llm.generate(prompts, sampling_params)
-for output in outputs:
-    print(output.prompt, output.outputs[0].text)
-```
+## AMD validation command snippets
 
-### Online serving
+### DeepSeek-V4-Pro (MI355X, TP=8)
 
 ```bash
-max_num_seqs=128
-max_num_batched_tokens=8192
-tensor_parallel_size=8
-
 export HF_HOME=/data/huggingface-cache
 export VLLM_ROCM_USE_AITER=1
 export VLLM_ROCM_USE_AITER_LINEAR=1
-rm -rf /root/.cache/vllm/torch_compile_cache
 
-MODEL=/home/models/DeepSeek-V4-Pro
-vllm serve ${MODEL} \
+vllm serve /home/models/DeepSeek-V4-Pro \
   --host localhost \
   --port 8001 \
   --dtype auto \
   --kv-cache-dtype fp8 \
-  --tensor-parallel-size ${tensor_parallel_size} \
-  --max-num-seqs ${max_num_seqs} \
-  --max-num-batched-tokens ${max_num_batched_tokens} \
+  --tensor-parallel-size 8 \
+  --max-num-seqs 128 \
+  --max-num-batched-tokens 8192 \
   --distributed-executor-backend mp \
   --trust-remote-code \
   --gpu-memory-utilization 0.6 \
@@ -155,89 +136,25 @@ vllm serve ${MODEL} \
   --enforce-eager
 ```
 
-### Accuracy check (GSM8K, from PR)
-
-```bash
-MODEL=/home/models/DeepSeek-V4-Pro
-lm_eval --model local-completions \
-  --model_args model=$MODEL,base_url=http://0.0.0.0:8001/v1/completions,num_concurrent=2,max_retries=10,max_gen_toks=2048,timeout=60000 \
-  --batch_size auto \
-  --tasks gsm8k \
-  --num_fewshot 8 \
-  --output_path .
-```
-
-Reported result:
-
-- `flexible-extract exact_match`: `0.9538`
-- `strict-match exact_match`: `0.9545`
-
-## Reasoning modes
-
-DeepSeek-V4 exposes non-think / think-high / think-max via `chat_template_kwargs`.
-
-```python
-from openai import OpenAI
-
-client = OpenAI(base_url="http://localhost:8001/v1", api_key="EMPTY")
-model = "deepseek-ai/DeepSeek-V4-Pro"
-messages = [{"role": "user", "content": "What is 17*19? Return only the final integer."}]
-
-# Non-think
-client.chat.completions.create(model=model, messages=messages)
-
-# Think high
-client.chat.completions.create(
-    model=model,
-    messages=messages,
-    extra_body={"chat_template_kwargs": {"thinking": True, "reasoning_effort": "high"}},
-)
-
-# Think max (ensure sufficient max-model-len)
-client.chat.completions.create(
-    model=model,
-    messages=messages,
-    extra_body={"chat_template_kwargs": {"thinking": True, "reasoning_effort": "max"}},
-)
-```
-
-## Tool calling
-
-Add these arguments to your serve command:
+### DeepSeek-V4-Flash (MI355X, TP=4)
 
 ```bash
---tokenizer-mode deepseek_v4 \
---tool-call-parser deepseek_v4 \
---enable-auto-tool-choice
-```
-
-## Speculative decoding (MTP)
-
-DeepSeek-V4 has native MTP support. On AMD, start conservatively and tune:
-
-```bash
---speculative-config '{"method":"mtp","num_speculative_tokens":1}'
-```
-
-If memory/throughput allows, test:
+export HF_HOME=/data/huggingface-cache
+export VLLM_ROCM_USE_AITER=1
 
-```bash
---speculative-config '{"method":"mtp","num_speculative_tokens":2}'
+vllm serve /home/models/DeepSeek-V4-Flash \
+  --host localhost \
+  --port 8001 \
+  --dtype auto \
+  --tensor-parallel-size 4 \
+  --max-num-seqs 16 \
+  --max-num-batched-tokens 1024 \
+  --distributed-executor-backend mp \
+  --trust-remote-code \
+  --gpu-memory-utilization 0.35 \
+  --moe-backend triton_unfused \
+  --tokenizer-mode deepseek_v4 \
+  --async-scheduling \
+  --enforce-eager
 ```
 
-## ROCm-specific notes from PR #40871
-
-- ROCm path includes DeepSeek-V4 FP8 compatibility updates and E8M0 scale handling.
-- ROCm execution disables some multi-stream paths to avoid known hang scenarios.
-- For DeepSeek-V4 routing mode, `triton_unfused` is preferred for accuracy, with AITER as fallback.
-
-## Troubleshooting
-
-1. **`NotImplementedError: "mul_cuda" not implemented for 'Float8_e8m0fnu'`**
-   - Ensure you are using the PR build above (or a newer commit that includes ROCm E8M0 handling fixes).
-2. **Model hangs during startup/load**
-   - Keep `--enforce-eager` enabled.
-   - Use `--moe-backend triton_unfused` on AMD.
-3. **Tokenizer / reasoning mismatch**
-   - Verify `--tokenizer-mode deepseek_v4` and `--reasoning-parser deepseek_v4` are both set.
-

From 038809e332e646453d777dcd325eb2b294de094d Mon Sep 17 00:00:00 2001
From: wuhuikx <hattie.wu@amd.com>
Date: Fri, 1 May 2026 05:39:41 -0500
Subject: [PATCH 04/10] Update the recipe

Signed-off-by: wuhuikx <hattie.wu@amd.com>
---
 DeepSeek/DeepSeek-V4-AMD.md | 160 +++++++++++++++++++++---------------
 1 file changed, 94 insertions(+), 66 deletions(-)

diff --git a/DeepSeek/DeepSeek-V4-AMD.md b/DeepSeek/DeepSeek-V4-AMD.md
index 529538cf..a3d514a8 100644
--- a/DeepSeek/DeepSeek-V4-AMD.md
+++ b/DeepSeek/DeepSeek-V4-AMD.md
@@ -16,73 +16,12 @@ cultivation + unified consolidation via on-policy distillation).
 Checkpoint is **FP4+FP8 mixed**: MoE expert weights are stored in FP4 while the
 remaining (attention / norm / router) params stay in FP8.
 
-## Reasoning modes
-
-The chat template exposes three reasoning-effort modes:
-
-- **Non-think** — fast, intuitive responses.
-- **Think High** — explicit chain-of-thought for complex problem-solving and planning.
-- **Think Max** — maximum reasoning effort; requires `--max-model-len >= 393216`
-  (384K tokens) to avoid truncation.
-
-Recommended sampling: `temperature = 1.0`, `top_p = 1.0`.
-
-### OpenAI Client Example
-
-For DeepSeek-V4, keep reasoning controls in `chat_template_kwargs`, as it exposes a
-custom **Think Max** mode via `"reasoning_effort": "max"`.
-
-```python
-from openai import OpenAI
-
-client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
-model = "deepseek-ai/DeepSeek-V4-Pro"
-messages = [{"role": "user", "content": "What is 17*19? Return only the final integer."}]
-
-# Non-think
-resp = client.chat.completions.create(
-    model=model,
-    messages=messages,
-)
-
-# Think High
-resp = client.chat.completions.create(
-    model=model,
-    messages=messages,
-    extra_body={
-        "chat_template_kwargs": {
-            "thinking": True,
-            "reasoning_effort": "high",
-        },
-    },
-)
-
-# Think Max
-resp = client.chat.completions.create(
-    model=model,
-    messages=messages,
-    extra_body={
-        "chat_template_kwargs": {
-            "thinking": True,
-            "reasoning_effort": "max",
-        },
-    },
-)
-```
-
 ## Recommended deployments
 
-- **B300 (8× GPU)**: single-node DP + EP with `--data-parallel-size 8`.
-- **H200 (8× GPU)**: DP + EP with `--data-parallel-size 8`. Context is capped at
-  800K tokens (`--max-model-len 800000`) to leave KV headroom with dense params
-  replicated across ranks — applies to both single-node and multi-node H200.
 - **MI355X (8× GPU)**: validated with ROCm + AITER
   (`VLLM_ROCM_USE_AITER=1`, `VLLM_ROCM_USE_AITER_LINEAR=1`), `--moe-backend triton_unfused`,
   `--gpu-memory-utilization 0.6`, `--max-num-seqs 128`,
   `--max-num-batched-tokens 8192`, and `--distributed-executor-backend mp`.
-- **GB200 NVL4 (4× GPU per tray)**: the ~960 GB mixed-precision checkpoint does not
-  fit on one tray; run multi-node DP + EP across **2 trays** (8 GPUs total) with
-  `--data-parallel-size 8`. Pick the "Multi-Node" tab and set nodes to 2.
 
 ## Feature matrix
 
@@ -91,15 +30,15 @@ recipes.vllm.ai (hardware / variant / strategy / features).
 
 | Model | Hardware | Variant | Recommended strategies | Tool calling | Reasoning | Spec decoding |
 | --- | --- | --- | --- | --- | --- | --- |
-| DeepSeek-V4-Pro | MI355X (8x288GB) | FP8 (~960GB) | Tensor+Expert Parallel, Data+Expert Parallel | Yes (`deepseek_v4`) | Yes (`deepseek_v4`) | Yes (`mtp`) |
-| DeepSeek-V4-Flash | MI355X (8x288GB) | FP8 (~170GB) | Tensor+Expert Parallel, Data+Expert Parallel | Yes (`deepseek_v4`) | Yes (`deepseek_v4`) | Yes (`mtp`) |
+| [DeepSeek-V4-Pro](https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro) | MI355X (8x288GB) | FP8 (~960GB) | Tensor Parallel (TP) | Yes (`deepseek_v4`) | Yes (`deepseek_v4`) | No (`false`) |
+| [DeepSeek-V4-Flash](https://huggingface.co/deepseek-ai/DeepSeek-V4-Flash) | MI355X (8x288GB) | FP8 (~170GB) | Tensor Parallel (TP) | Yes (`deepseek_v4`) | Yes (`deepseek_v4`) | No (`false`) |
 
 ### MI355X recommended presets
 
 | Model | TP | Max num seqs | Max batched tokens | GPU memory utilization | Key ROCm env |
 | --- | --- | ---: | ---: | ---: | --- |
-| DeepSeek-V4-Pro | 8 | 128 | 8192 | 0.6 | `VLLM_ROCM_USE_AITER=1`, `VLLM_ROCM_USE_AITER_LINEAR=1` |
-| DeepSeek-V4-Flash | 4 | 16 | 1024 | 0.35 | `VLLM_ROCM_USE_AITER=1` |
+| [DeepSeek-V4-Pro](https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro) | 8 | 128 | 8192 | 0.6 | `VLLM_ROCM_USE_AITER=1`, `VLLM_ROCM_USE_AITER_LINEAR=1` |
+| [DeepSeek-V4-Flash](https://huggingface.co/deepseek-ai/DeepSeek-V4-Flash) | 4 | 16 | 1024 | 0.35 | `VLLM_ROCM_USE_AITER=1` |
 
 ### Feature toggles
 
@@ -107,7 +46,7 @@ recipes.vllm.ai (hardware / variant / strategy / features).
 | --- | --- |
 | Tool Calling | `--tokenizer-mode deepseek_v4 --tool-call-parser deepseek_v4 --enable-auto-tool-choice` |
 | Reasoning | `--reasoning-parser deepseek_v4` |
-| Spec Decoding | `--speculative-config '{"method":"mtp","num_speculative_tokens":1}'` (start) / `2` (tune) |
+| Spec Decoding | Disabled (`false`) |
 
 ## AMD validation command snippets
 
@@ -158,3 +97,92 @@ vllm serve /home/models/DeepSeek-V4-Flash \
   --enforce-eager
 ```
 
+## Smoke test (single request)
+
+### DeepSeek-V4-Flash
+
+```bash
+curl -s http://localhost:8001/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "prompt": "Write me a poem about AMD and DeepSeek",
+    "model": "/home/models/DeepSeek-V4-Flash",
+    "max_tokens": 100,
+    "temperature": 0.0
+  }'
+```
+
+Sample result from PR validation (truncated):
+
+```json
+{
+  "object": "text_completion",
+  "model": "/home/models/DeepSeek-V4-Flash",
+  "choices": [
+    {
+      "finish_reason": "length",
+      "text": "\"... Here's a poem about AMD and DeepSeek: ...\""
+    }
+  ],
+  "usage": {
+    "prompt_tokens": 9,
+    "completion_tokens": 100,
+    "total_tokens": 109
+  }
+}
+```
+
+### DeepSeek-V4-Pro
+
+```bash
+curl -s http://localhost:8001/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "prompt": "What is 2+2? Return only the final integer.",
+    "model": "/home/models/DeepSeek-V4-Pro",
+    "max_tokens": 16,
+    "temperature": 0.0
+  }'
+```
+
+Smoke-test success criteria:
+
+- HTTP status is `200`
+- `choices[0].text` is non-empty
+
+## GSM8K validation (command + result)
+
+### DeepSeek-V4-Flash
+
+```bash
+MODEL=/home/models/DeepSeek-V4-Flash
+lm_eval --model local-completions \
+  --model_args model=$MODEL,base_url=http://0.0.0.0:8001/v1/completions,num_concurrent=4,max_retries=10,max_gen_toks=2048,timeout=60000 \
+  --batch_size auto \
+  --tasks gsm8k \
+  --num_fewshot 8 \
+  --output_path .
+```
+
+Reported result from PR #40871:
+
+- `flexible-extract exact_match`: `0.9439`
+- `strict-match exact_match`: `0.9431`
+
+### DeepSeek-V4-Pro
+
+```bash
+MODEL=/home/models/DeepSeek-V4-Pro
+lm_eval --model local-completions \
+  --model_args model=$MODEL,base_url=http://0.0.0.0:8001/v1/completions,num_concurrent=2,max_retries=10,max_gen_toks=2048,timeout=60000 \
+  --batch_size auto \
+  --tasks gsm8k \
+  --num_fewshot 8 \
+  --output_path .
+```
+
+Reported result from PR #40871:
+
+- `flexible-extract exact_match`: `0.9538`
+- `strict-match exact_match`: `0.9545`
+

From 3d41f2ee5b1a7ecb9e5bbd7d6bc2e395557b6af3 Mon Sep 17 00:00:00 2001
From: wuhuikx <hattie.wu@amd.com>
Date: Fri, 1 May 2026 07:41:30 -0500
Subject: [PATCH 05/10] Update the smoke test result

Signed-off-by: wuhuikx <hattie.wu@amd.com>
---
 DeepSeek/DeepSeek-V4-AMD.md               | 108 +++++++++-------------
 models/deepseek-ai/DeepSeek-V4-Flash.yaml |  46 +++++++++
 models/deepseek-ai/DeepSeek-V4-Pro.yaml   |  49 ++++++++++
 3 files changed, 141 insertions(+), 62 deletions(-)

diff --git a/DeepSeek/DeepSeek-V4-AMD.md b/DeepSeek/DeepSeek-V4-AMD.md
index a3d514a8..9ce74980 100644
--- a/DeepSeek/DeepSeek-V4-AMD.md
+++ b/DeepSeek/DeepSeek-V4-AMD.md
@@ -48,9 +48,9 @@ recipes.vllm.ai (hardware / variant / strategy / features).
 | Reasoning | `--reasoning-parser deepseek_v4` |
 | Spec Decoding | Disabled (`false`) |
 
-## AMD validation command snippets
+## DeepSeek-V4-Pro validation (MI355X, TP=8)
 
-### DeepSeek-V4-Pro (MI355X, TP=8)
+### 1) Serve command
 
 ```bash
 export HF_HOME=/data/huggingface-cache
@@ -75,7 +75,44 @@ vllm serve /home/models/DeepSeek-V4-Pro \
   --enforce-eager
 ```
 
-### DeepSeek-V4-Flash (MI355X, TP=4)
+### 2) Smoke test (single request)
+
+```bash
+curl -s http://localhost:8001/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "prompt": "What is 2+2? Return only the final integer.",
+    "model": "/home/models/DeepSeek-V4-Pro",
+    "max_tokens": 16,
+    "temperature": 0.0
+  }'
+```
+
+Smoke-test success criteria:
+
+- HTTP status is `200`
+- `choices[0].text` is non-empty
+
+### 3) GSM8K validation
+
+```bash
+MODEL=/home/models/DeepSeek-V4-Pro
+lm_eval --model local-completions \
+  --model_args model=$MODEL,base_url=http://0.0.0.0:8001/v1/completions,num_concurrent=2,max_retries=10,max_gen_toks=2048,timeout=60000 \
+  --batch_size auto \
+  --tasks gsm8k \
+  --num_fewshot 8 \
+  --output_path .
+```
+
+Reported result from PR #40871:
+
+- `flexible-extract exact_match`: `0.9538`
+- `strict-match exact_match`: `0.9545`
+
+## DeepSeek-V4-Flash validation (MI355X, TP=4)
+
+### 1) Serve command
 
 ```bash
 export HF_HOME=/data/huggingface-cache
@@ -97,62 +134,26 @@ vllm serve /home/models/DeepSeek-V4-Flash \
   --enforce-eager
 ```
 
-## Smoke test (single request)
-
-### DeepSeek-V4-Flash
+### 2) Smoke test (single request)
 
 ```bash
 curl -s http://localhost:8001/v1/completions \
   -H "Content-Type: application/json" \
   -d '{
-    "prompt": "Write me a poem about AMD and DeepSeek",
-    "model": "/home/models/DeepSeek-V4-Flash",
+    "prompt": "Introduce the capital of US",
+    "model": "/models/DeepSeek-V4-Flash",
     "max_tokens": 100,
     "temperature": 0.0
   }'
 ```
 
-Sample result from PR validation (truncated):
+Sample result:
 
 ```json
-{
-  "object": "text_completion",
-  "model": "/home/models/DeepSeek-V4-Flash",
-  "choices": [
-    {
-      "finish_reason": "length",
-      "text": "\"... Here's a poem about AMD and DeepSeek: ...\""
-    }
-  ],
-  "usage": {
-    "prompt_tokens": 9,
-    "completion_tokens": 100,
-    "total_tokens": 109
-  }
-}
+{"id":"cmpl-86e0959d4415d914","object":"text_completion","created":1777638722,"model":"/models/DeepSeek-V4-Flash","choices":[{"index":0,"text":"\",\"answer\":\"Washington, D.C.\",\"type\":\"text\"},{\"question\":\"Introduce the capital of Canada.\",\"answer\":\"Ottawa\",\"type\":\"text\"},{\"question\":\"Introduce the capital of Mexico.\",\"answer\":\"Mexico City\",\"type\":\"text\"},{\"question\":\"Introduce the capital of Brazil.\",\"answer\":\"Brasília\",\"type\":\"text\"},{\"question\":\"Introduce the capital of Argentina.\",\"answer\":\"Buenos Aires\",\"type\":\"text\"},{\"question\":\"Introdu","logprobs":null,"finish_reason":"length","stop_reason":null,"token_ids":null,"prompt_logprobs":null,"prompt_token_ids":null}],"service_tier":null,"system_fingerprint":"vllm-0.20.1rc1.dev135+ge786a2dfc-tp4-015676fd","usage":{"prompt_tokens":7,"total_tokens":107,"completion_tokens":100,"prompt_tokens_details":null},"kv_transfer_params":null}
 ```
 
-### DeepSeek-V4-Pro
-
-```bash
-curl -s http://localhost:8001/v1/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "prompt": "What is 2+2? Return only the final integer.",
-    "model": "/home/models/DeepSeek-V4-Pro",
-    "max_tokens": 16,
-    "temperature": 0.0
-  }'
-```
-
-Smoke-test success criteria:
-
-- HTTP status is `200`
-- `choices[0].text` is non-empty
-
-## GSM8K validation (command + result)
-
-### DeepSeek-V4-Flash
+### 3) GSM8K validation
 
 ```bash
 MODEL=/home/models/DeepSeek-V4-Flash
@@ -169,20 +170,3 @@ Reported result from PR #40871:
 - `flexible-extract exact_match`: `0.9439`
 - `strict-match exact_match`: `0.9431`
 
-### DeepSeek-V4-Pro
-
-```bash
-MODEL=/home/models/DeepSeek-V4-Pro
-lm_eval --model local-completions \
-  --model_args model=$MODEL,base_url=http://0.0.0.0:8001/v1/completions,num_concurrent=2,max_retries=10,max_gen_toks=2048,timeout=60000 \
-  --batch_size auto \
-  --tasks gsm8k \
-  --num_fewshot 8 \
-  --output_path .
-```
-
-Reported result from PR #40871:
-
-- `flexible-extract exact_match`: `0.9538`
-- `strict-match exact_match`: `0.9545`
-
diff --git a/models/deepseek-ai/DeepSeek-V4-Flash.yaml b/models/deepseek-ai/DeepSeek-V4-Flash.yaml
index b1064048..235c1435 100644
--- a/models/deepseek-ai/DeepSeek-V4-Flash.yaml
+++ b/models/deepseek-ai/DeepSeek-V4-Flash.yaml
@@ -249,6 +249,52 @@ guide: |
   `--max-num-seqs 16`, `--max-num-batched-tokens 1024`,
   `--moe-backend triton_unfused`, `--async-scheduling`, and `--enforce-eager`.
 
+  ## GSM8K validation (MI355X)
+
+  Launch command (TP=4):
+
+  ```bash
+  export HF_HOME=/data/huggingface-cache
+  export VLLM_ROCM_USE_AITER=1
+
+  vllm serve /home/models/DeepSeek-V4-Flash \
+    --host localhost \
+    --port 8001 \
+    --dtype auto \
+    --tensor-parallel-size 4 \
+    --max-num-seqs 16 \
+    --max-num-batched-tokens 1024 \
+    --distributed-executor-backend mp \
+    --trust-remote-code \
+    --gpu-memory-utilization 0.35 \
+    --moe-backend triton_unfused \
+    --tokenizer-mode deepseek_v4 \
+    --async-scheduling \
+    --enforce-eager
+  ```
+
+  GSM8K command:
+
+  ```bash
+  MODEL=/home/models/DeepSeek-V4-Flash
+  lm_eval --model local-completions \
+    --model_args model=$MODEL,base_url=http://0.0.0.0:8001/v1/completions,num_concurrent=4,max_retries=10,max_gen_toks=2048,timeout=60000 \
+    --batch_size auto \
+    --tasks gsm8k \
+    --num_fewshot 8 \
+    --output_path . 2>&1 | tee -a eval.log
+  ```
+
+  Reported result from PR #40871:
+
+  ```text
+  local-completions ({'model': '/home/models/DeepSeek-V4-Flash', 'base_url': 'http://0.0.0.0:8001/v1/completions', 'num_concurrent': 4, 'max_retries': 10, 'max_gen_toks': 2048, 'timeout': 60000}), gen_kwargs: ({}), limit: None, num_fewshot: 8, batch_size: auto
+  |Tasks|Version|     Filter     |n-shot|  Metric   |   |Value |   |Stderr|
+  |-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
+  |gsm8k|      3|flexible-extract|     8|exact_match|↑  |0.9439|±  |0.0063|
+  |     |       |strict-match    |     8|exact_match|↑  |0.9431|±  |0.0064|
+  ```
+
   ### H200 Single-Node PD (Mooncake)
 
   Single-host disaggregated serving: 4 prefill GPUs + 4 decode GPUs on one 8-GPU H200 node,
diff --git a/models/deepseek-ai/DeepSeek-V4-Pro.yaml b/models/deepseek-ai/DeepSeek-V4-Pro.yaml
index c2436e04..a68354ea 100644
--- a/models/deepseek-ai/DeepSeek-V4-Pro.yaml
+++ b/models/deepseek-ai/DeepSeek-V4-Pro.yaml
@@ -276,3 +276,52 @@ guide: |
   - **GB200 NVL4 (4× GPU per tray)**: the ~960 GB mixed-precision checkpoint does not
     fit on one tray; run multi-node DP + EP across **2 trays** (8 GPUs total) with
     `--data-parallel-size 8`. Pick the "Multi-Node" tab and set nodes to 2.
+
+  ## GSM8K validation (MI355X)
+
+  Launch command (TP=8):
+
+  ```bash
+  export HF_HOME=/data/huggingface-cache
+  export VLLM_ROCM_USE_AITER=1
+  export VLLM_ROCM_USE_AITER_LINEAR=1
+
+  vllm serve /home/models/DeepSeek-V4-Pro \
+    --host localhost \
+    --port 8001 \
+    --dtype auto \
+    --kv-cache-dtype fp8 \
+    --tensor-parallel-size 8 \
+    --max-num-seqs 128 \
+    --max-num-batched-tokens 8192 \
+    --distributed-executor-backend mp \
+    --trust-remote-code \
+    --gpu-memory-utilization 0.6 \
+    --moe-backend triton_unfused \
+    --tokenizer-mode deepseek_v4 \
+    --reasoning-parser deepseek_v4 \
+    --async-scheduling \
+    --enforce-eager
+  ```
+
+  GSM8K command:
+
+  ```bash
+  MODEL=/home/models/DeepSeek-V4-Pro
+  lm_eval --model local-completions \
+    --model_args model=$MODEL,base_url=http://0.0.0.0:8001/v1/completions,num_concurrent=2,max_retries=10,max_gen_toks=2048,timeout=60000 \
+    --batch_size auto \
+    --tasks gsm8k \
+    --num_fewshot 8 \
+    --output_path . 2>&1 | tee -a eval.log
+  ```
+
+  Reported result from PR #40871:
+
+  ```text
+  local-completions ({'model': '/home/models/DeepSeek-V4-Pro', 'base_url': 'http://0.0.0.0:8001/v1/completions', 'num_concurrent': 2, 'max_retries': 10, 'max_gen_toks': 2048, 'timeout': 60000}), gen_kwargs: ({}), limit: None, num_fewshot: 8, batch_size: auto
+  |Tasks|Version|     Filter     |n-shot|  Metric   |   |Value |   |Stderr|
+  |-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
+  |gsm8k|      3|flexible-extract|     8|exact_match|↑  |0.9538|±  |0.0058|
+  |     |       |strict-match    |     8|exact_match|↑  |0.9545|±  |0.0057|
+  ```

From 9271152da524b69bee81e95c2c93e4374ffd2ee1 Mon Sep 17 00:00:00 2001
From: wuhuikx <hattie.wu@amd.com>
Date: Fri, 1 May 2026 08:12:25 -0500
Subject: [PATCH 06/10] Update the GSM8K result

Signed-off-by: wuhuikx <hattie.wu@amd.com>
---
 DeepSeek/DeepSeek-V4-AMD.md | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/DeepSeek/DeepSeek-V4-AMD.md b/DeepSeek/DeepSeek-V4-AMD.md
index 9ce74980..60925974 100644
--- a/DeepSeek/DeepSeek-V4-AMD.md
+++ b/DeepSeek/DeepSeek-V4-AMD.md
@@ -88,6 +88,12 @@ curl -s http://localhost:8001/v1/completions \
   }'
 ```
 
+Sample result:
+
+```json
+{"id":"cmpl-973e09361657d259","object":"text_completion","created":1777640598,"model":"/models/DeepSeek-V4-Pro","choices":[{"index":0,"text":" Do not include any other text or explanation. The answer is 4.\nWhat","logprobs":null,"finish_reason":"length","stop_reason":null,"token_ids":null,"prompt_logprobs":null,"prompt_token_ids":null}],"service_tier":null,"system_fingerprint":"vllm-0.20.1rc1.dev135+ge786a2dfc-tp8-868a6cb7","usage":{"prompt_tokens":13,"total_tokens":29,"completion_tokens":16,"prompt_tokens_details":null},"kv_transfer_params":null}
+```
+
 Smoke-test success criteria:
 
 - HTTP status is `200`
@@ -107,8 +113,12 @@ lm_eval --model local-completions \
 
 Reported result from PR #40871:
 
-- `flexible-extract exact_match`: `0.9538`
-- `strict-match exact_match`: `0.9545`
+```text
+|Tasks|Version|     Filter     |n-shot|  Metric   |   |Value |   |Stderr|
+|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
+|gsm8k|      3|flexible-extract|     8|exact_match|↑  |0.9538|±  |0.0058|
+|     |       |strict-match    |     8|exact_match|↑  |0.9545|±  |0.0057|
+```
 
 ## DeepSeek-V4-Flash validation (MI355X, TP=4)
 
@@ -167,6 +177,10 @@ lm_eval --model local-completions \
 
 Reported result from PR #40871:
 
-- `flexible-extract exact_match`: `0.9439`
-- `strict-match exact_match`: `0.9431`
+```text
+|Tasks|Version|     Filter     |n-shot|  Metric   |   |Value |   |Stderr|
+|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
+|gsm8k|      3|flexible-extract|     8|exact_match|↑  |0.9439|±  |0.0063|
+|     |       |strict-match    |     8|exact_match|↑  |0.9431|±  |0.0064|
+```
 

From 0e0ada6c90d1b1515ecb3027cf76bbaba2509560 Mon Sep 17 00:00:00 2001
From: wuhuikx <hattie.wu@amd.com>
Date: Fri, 1 May 2026 08:19:11 -0500
Subject: [PATCH 07/10] Remove the smoke result

Signed-off-by: wuhuikx <hattie.wu@amd.com>
---
 DeepSeek/DeepSeek-V4-AMD.md | 47 ++-----------------------------------
 1 file changed, 2 insertions(+), 45 deletions(-)

diff --git a/DeepSeek/DeepSeek-V4-AMD.md b/DeepSeek/DeepSeek-V4-AMD.md
index 60925974..cd133bfa 100644
--- a/DeepSeek/DeepSeek-V4-AMD.md
+++ b/DeepSeek/DeepSeek-V4-AMD.md
@@ -75,31 +75,7 @@ vllm serve /home/models/DeepSeek-V4-Pro \
   --enforce-eager
 ```
 
-### 2) Smoke test (single request)
-
-```bash
-curl -s http://localhost:8001/v1/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "prompt": "What is 2+2? Return only the final integer.",
-    "model": "/home/models/DeepSeek-V4-Pro",
-    "max_tokens": 16,
-    "temperature": 0.0
-  }'
-```
-
-Sample result:
-
-```json
-{"id":"cmpl-973e09361657d259","object":"text_completion","created":1777640598,"model":"/models/DeepSeek-V4-Pro","choices":[{"index":0,"text":" Do not include any other text or explanation. The answer is 4.\nWhat","logprobs":null,"finish_reason":"length","stop_reason":null,"token_ids":null,"prompt_logprobs":null,"prompt_token_ids":null}],"service_tier":null,"system_fingerprint":"vllm-0.20.1rc1.dev135+ge786a2dfc-tp8-868a6cb7","usage":{"prompt_tokens":13,"total_tokens":29,"completion_tokens":16,"prompt_tokens_details":null},"kv_transfer_params":null}
-```
-
-Smoke-test success criteria:
-
-- HTTP status is `200`
-- `choices[0].text` is non-empty
-
-### 3) GSM8K validation
+### 2) GSM8K validation
 
 ```bash
 MODEL=/home/models/DeepSeek-V4-Pro
@@ -144,26 +120,7 @@ vllm serve /home/models/DeepSeek-V4-Flash \
   --enforce-eager
 ```
 
-### 2) Smoke test (single request)
-
-```bash
-curl -s http://localhost:8001/v1/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "prompt": "Introduce the capital of US",
-    "model": "/models/DeepSeek-V4-Flash",
-    "max_tokens": 100,
-    "temperature": 0.0
-  }'
-```
-
-Sample result:
-
-```json
-{"id":"cmpl-86e0959d4415d914","object":"text_completion","created":1777638722,"model":"/models/DeepSeek-V4-Flash","choices":[{"index":0,"text":"\",\"answer\":\"Washington, D.C.\",\"type\":\"text\"},{\"question\":\"Introduce the capital of Canada.\",\"answer\":\"Ottawa\",\"type\":\"text\"},{\"question\":\"Introduce the capital of Mexico.\",\"answer\":\"Mexico City\",\"type\":\"text\"},{\"question\":\"Introduce the capital of Brazil.\",\"answer\":\"Brasília\",\"type\":\"text\"},{\"question\":\"Introduce the capital of Argentina.\",\"answer\":\"Buenos Aires\",\"type\":\"text\"},{\"question\":\"Introdu","logprobs":null,"finish_reason":"length","stop_reason":null,"token_ids":null,"prompt_logprobs":null,"prompt_token_ids":null}],"service_tier":null,"system_fingerprint":"vllm-0.20.1rc1.dev135+ge786a2dfc-tp4-015676fd","usage":{"prompt_tokens":7,"total_tokens":107,"completion_tokens":100,"prompt_tokens_details":null},"kv_transfer_params":null}
-```
-
-### 3) GSM8K validation
+### 2) GSM8K validation
 
 ```bash
 MODEL=/home/models/DeepSeek-V4-Flash

From a6f74b805767ff6442823183e34ca39d40ee0cc1 Mon Sep 17 00:00:00 2001
From: wuhuikx <hattie.wu@amd.com>
Date: Fri, 1 May 2026 11:36:00 -0500
Subject: [PATCH 08/10] Add docker info

Signed-off-by: wuhuikx <hattie.wu@amd.com>
---
 DeepSeek/DeepSeek-V4-AMD.md               | 16 ++++++++++++++++
 models/deepseek-ai/DeepSeek-V4-Flash.yaml |  2 ++
 models/deepseek-ai/DeepSeek-V4-Pro.yaml   |  2 ++
 3 files changed, 20 insertions(+)

diff --git a/DeepSeek/DeepSeek-V4-AMD.md b/DeepSeek/DeepSeek-V4-AMD.md
index cd133bfa..25455237 100644
--- a/DeepSeek/DeepSeek-V4-AMD.md
+++ b/DeepSeek/DeepSeek-V4-AMD.md
@@ -16,6 +16,12 @@ cultivation + unified consolidation via on-policy distillation).
 Checkpoint is **FP4+FP8 mixed**: MoE expert weights are stored in FP4 while the
 remaining (attention / norm / router) params stay in FP8.
 
+## Docker image (AMD ROCm)
+
+```bash
+docker pull rocm/vllm-dev:deepseek-v4-latest
+```
+
 ## Recommended deployments
 
 - **MI355X (8× GPU)**: validated with ROCm + AITER
@@ -141,3 +147,13 @@ Reported result from PR #40871:
 |     |       |strict-match    |     8|exact_match|↑  |0.9431|±  |0.0064|
 ```
 
+## Related PR links
+
+- [Functionality] Base PR, functionally complete and accuracy-validated on MI35x
+  for both DeepSeek-V4-Pro and DeepSeek-V4-Flash (lm_eval passes on full GSM8K):
+  [Ready to merge, #40871](https://github.com/vllm-project/vllm/pull/40871)
+- [Functionality] MI300 support PR:
+  [#41451](https://github.com/vllm-project/vllm/pull/41451)
+- [Performance] MLA Indexer optimization for DeepSeek-V4 and DeepSeek-V3.2 (ROCm):
+  [#41217](https://github.com/vllm-project/vllm/pull/41217)
+
diff --git a/models/deepseek-ai/DeepSeek-V4-Flash.yaml b/models/deepseek-ai/DeepSeek-V4-Flash.yaml
index 235c1435..ac8bb855 100644
--- a/models/deepseek-ai/DeepSeek-V4-Flash.yaml
+++ b/models/deepseek-ai/DeepSeek-V4-Flash.yaml
@@ -22,6 +22,8 @@ meta:
 model:
   model_id: "deepseek-ai/DeepSeek-V4-Flash"
   min_vllm_version: "0.20.0"
+  docker_image:
+    amd: "rocm/vllm-dev:deepseek-v4-latest"
   architecture: moe
   parameter_count: "284B"
   active_parameters: "13B"
diff --git a/models/deepseek-ai/DeepSeek-V4-Pro.yaml b/models/deepseek-ai/DeepSeek-V4-Pro.yaml
index a68354ea..adfdcc76 100644
--- a/models/deepseek-ai/DeepSeek-V4-Pro.yaml
+++ b/models/deepseek-ai/DeepSeek-V4-Pro.yaml
@@ -22,6 +22,8 @@ meta:
 model:
   model_id: "deepseek-ai/DeepSeek-V4-Pro"
   min_vllm_version: "0.20.0"
+  docker_image:
+    amd: "rocm/vllm-dev:deepseek-v4-latest"
   architecture: moe
   parameter_count: "1600B"
   active_parameters: "49B"

From 13db1970e2b11e9426937e4b80344b665531ad2f Mon Sep 17 00:00:00 2001
From: wuhuikx <hattie.wu@amd.com>
Date: Wed, 6 May 2026 10:10:00 -0500
Subject: [PATCH 09/10] Update the docker to vllm/vllm-open-rocm:nightly

Signed-off-by: wuhuikx <hattie.wu@amd.com>
---
 DeepSeek/DeepSeek-V4-AMD.md               | 2 +-
 models/deepseek-ai/DeepSeek-V4-Flash.yaml | 2 +-
 models/deepseek-ai/DeepSeek-V4-Pro.yaml   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/DeepSeek/DeepSeek-V4-AMD.md b/DeepSeek/DeepSeek-V4-AMD.md
index 25455237..5c2933ca 100644
--- a/DeepSeek/DeepSeek-V4-AMD.md
+++ b/DeepSeek/DeepSeek-V4-AMD.md
@@ -19,7 +19,7 @@ remaining (attention / norm / router) params stay in FP8.
 ## Docker image (AMD ROCm)
 
 ```bash
-docker pull rocm/vllm-dev:deepseek-v4-latest
+docker pull vllm/vllm-open-rocm:nightly
 ```
 
 ## Recommended deployments
diff --git a/models/deepseek-ai/DeepSeek-V4-Flash.yaml b/models/deepseek-ai/DeepSeek-V4-Flash.yaml
index ac8bb855..ac4cf33f 100644
--- a/models/deepseek-ai/DeepSeek-V4-Flash.yaml
+++ b/models/deepseek-ai/DeepSeek-V4-Flash.yaml
@@ -23,7 +23,7 @@ model:
   model_id: "deepseek-ai/DeepSeek-V4-Flash"
   min_vllm_version: "0.20.0"
   docker_image:
-    amd: "rocm/vllm-dev:deepseek-v4-latest"
+    amd: "vllm/vllm-open-rocm:nightly"
   architecture: moe
   parameter_count: "284B"
   active_parameters: "13B"
diff --git a/models/deepseek-ai/DeepSeek-V4-Pro.yaml b/models/deepseek-ai/DeepSeek-V4-Pro.yaml
index adfdcc76..d65e8940 100644
--- a/models/deepseek-ai/DeepSeek-V4-Pro.yaml
+++ b/models/deepseek-ai/DeepSeek-V4-Pro.yaml
@@ -23,7 +23,7 @@ model:
   model_id: "deepseek-ai/DeepSeek-V4-Pro"
   min_vllm_version: "0.20.0"
   docker_image:
-    amd: "rocm/vllm-dev:deepseek-v4-latest"
+    amd: "vllm/vllm-open-rocm:nightly"
   architecture: moe
   parameter_count: "1600B"
   active_parameters: "49B"

From d6dc5cc57037974047caa029f4f0d3e08402005b Mon Sep 17 00:00:00 2001
From: wuhuikx <hattie.wu@amd.com>
Date: Wed, 6 May 2026 21:44:22 -0500
Subject: [PATCH 10/10] Update the docker to vllm/vllm-openai-rocm:nightly

Signed-off-by: wuhuikx <hattie.wu@amd.com>
---
 DeepSeek/DeepSeek-V4-AMD.md               | 2 +-
 models/deepseek-ai/DeepSeek-V4-Flash.yaml | 2 +-
 models/deepseek-ai/DeepSeek-V4-Pro.yaml   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/DeepSeek/DeepSeek-V4-AMD.md b/DeepSeek/DeepSeek-V4-AMD.md
index 5c2933ca..2830f09f 100644
--- a/DeepSeek/DeepSeek-V4-AMD.md
+++ b/DeepSeek/DeepSeek-V4-AMD.md
@@ -19,7 +19,7 @@ remaining (attention / norm / router) params stay in FP8.
 ## Docker image (AMD ROCm)
 
 ```bash
-docker pull vllm/vllm-open-rocm:nightly
+docker pull vllm/vllm-openai-rocm:nightly
 ```
 
 ## Recommended deployments
diff --git a/models/deepseek-ai/DeepSeek-V4-Flash.yaml b/models/deepseek-ai/DeepSeek-V4-Flash.yaml
index ac4cf33f..067fb1fe 100644
--- a/models/deepseek-ai/DeepSeek-V4-Flash.yaml
+++ b/models/deepseek-ai/DeepSeek-V4-Flash.yaml
@@ -23,7 +23,7 @@ model:
   model_id: "deepseek-ai/DeepSeek-V4-Flash"
   min_vllm_version: "0.20.0"
   docker_image:
-    amd: "vllm/vllm-open-rocm:nightly"
+    amd: "vllm/vllm-openai-rocm:nightly"
   architecture: moe
   parameter_count: "284B"
   active_parameters: "13B"
diff --git a/models/deepseek-ai/DeepSeek-V4-Pro.yaml b/models/deepseek-ai/DeepSeek-V4-Pro.yaml
index d65e8940..1c6e6adc 100644
--- a/models/deepseek-ai/DeepSeek-V4-Pro.yaml
+++ b/models/deepseek-ai/DeepSeek-V4-Pro.yaml
@@ -23,7 +23,7 @@ model:
   model_id: "deepseek-ai/DeepSeek-V4-Pro"
   min_vllm_version: "0.20.0"
   docker_image:
-    amd: "vllm/vllm-open-rocm:nightly"
+    amd: "vllm/vllm-openai-rocm:nightly"
   architecture: moe
   parameter_count: "1600B"
   active_parameters: "49B"