From c887b5ec9a638de31161e8404e510bdd242bbc9f Mon Sep 17 00:00:00 2001 From: wuhuikx Date: Fri, 1 May 2026 05:17:20 -0500 Subject: [PATCH 01/10] Add DeepSeek-V4-Pro support on MI355X Signed-off-by: wuhuikx --- DeepSeek/DeepSeek-V4-AMD.md | 243 ++++++++++++++++++++++++ models/deepseek-ai/DeepSeek-V4-Pro.yaml | 25 ++- 2 files changed, 266 insertions(+), 2 deletions(-) create mode 100644 DeepSeek/DeepSeek-V4-AMD.md diff --git a/DeepSeek/DeepSeek-V4-AMD.md b/DeepSeek/DeepSeek-V4-AMD.md new file mode 100644 index 00000000..2d4073fd --- /dev/null +++ b/DeepSeek/DeepSeek-V4-AMD.md @@ -0,0 +1,243 @@ +# DeepSeek-V4 on AMD (ROCm) Usage Guide + +This recipe mirrors the official DeepSeek-V4 recipe structure and is adapted for AMD ROCm based on [vllm-project/vllm#40871](https://github.com/vllm-project/vllm/pull/40871). + +## Scope + +This guide covers: + +- DeepSeek-V4-Flash on MI355X (online serving) +- DeepSeek-V4-Pro on MI355X (offline + online serving) +- Reasoning mode usage +- Tool calling flags +- MTP speculative decoding (experimental recommendation) + +## Environment and Version + +At the time of writing, AMD DeepSeek-V4 support is under review upstream, so use the PR branch build: + +```bash +# inside ROCm container +pip uninstall -y vllm +git clone https://github.com/vllm-project/vllm.git +cd vllm +git fetch origin pull/40871/head:pr_dsv4 +git checkout pr_dsv4 +python3 setup.py develop +``` + +Reference runtime used in PR validation: + +- Docker image: `rocm/vllm-dev:nightly_main_20260423` +- Hardware: `MI355X` + +## DeepSeek-V4-Flash (MI355X) + +### Launch + +```bash +max_num_seqs=16 +max_num_batched_tokens=1024 +tensor_parallel_size=4 + +export HF_HOME=/data/huggingface-cache +export VLLM_ROCM_USE_AITER=1 +export VLLM_TORCH_PROFILER_DIR=/app/vllm_profile + +MODEL=/home/models/DeepSeek-V4-Flash +vllm serve ${MODEL} \ + --host localhost \ + --port 8001 \ + --dtype auto \ + --tensor-parallel-size ${tensor_parallel_size} \ + --max-num-seqs ${max_num_seqs} \ + 
--max-num-batched-tokens ${max_num_batched_tokens} \ + --distributed-executor-backend mp \ + --trust-remote-code \ + --profiler-config '{"profiler":"torch","torch_profiler_dir":"./vllm_profile"}' \ + --gpu-memory-utilization 0.35 \ + --moe-backend triton_unfused \ + --tokenizer-mode deepseek_v4 \ + --async-scheduling \ + --enforce-eager +``` + +### Smoke test + +```bash +curl -s http://localhost:8001/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "prompt": "Write me a poem about AMD and DeepSeek.", + "model": "/home/models/DeepSeek-V4-Flash", + "max_tokens": 100, + "temperature": 0.0 + }' +``` + +### Accuracy check (GSM8K, from PR) + +```bash +MODEL=/home/models/DeepSeek-V4-Flash +lm_eval --model local-completions \ + --model_args model=$MODEL,base_url=http://0.0.0.0:8001/v1/completions,num_concurrent=4,max_retries=10,max_gen_toks=2048,timeout=60000 \ + --batch_size auto \ + --tasks gsm8k \ + --num_fewshot 8 \ + --output_path . +``` + +Reported result: + +- `flexible-extract exact_match`: `0.9439` +- `strict-match exact_match`: `0.9431` + +## DeepSeek-V4-Pro (MI355X) + +### Offline validation + +```python +import os +from vllm import LLM, SamplingParams + +os.environ["VLLM_ROCM_USE_AITER"] = "1" +os.environ["VLLM_ROCM_USE_AITER_LINEAR"] = "1" + +prompts = ["What is 2+2? 
Answer:", "The capital of France is "] +sampling_params = SamplingParams(temperature=0, top_p=1, max_tokens=20) + +llm = LLM( + model="/home/models/DeepSeek-V4-Pro", + tensor_parallel_size=8, + kv_cache_dtype="fp8", + gpu_memory_utilization=0.6, + async_scheduling=True, + enforce_eager=True, + disable_log_stats=False, + tokenizer_mode="deepseek_v4", + moe_backend="triton_unfused", + reasoning_parser="deepseek_v4", +) + +outputs = llm.generate(prompts, sampling_params) +for output in outputs: + print(output.prompt, output.outputs[0].text) +``` + +### Online serving + +```bash +max_num_seqs=128 +max_num_batched_tokens=8192 +tensor_parallel_size=8 + +export HF_HOME=/data/huggingface-cache +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_USE_AITER_LINEAR=1 +rm -rf /root/.cache/vllm/torch_compile_cache + +MODEL=/home/models/DeepSeek-V4-Pro +vllm serve ${MODEL} \ + --host localhost \ + --port 8001 \ + --dtype auto \ + --kv-cache-dtype fp8 \ + --tensor-parallel-size ${tensor_parallel_size} \ + --max-num-seqs ${max_num_seqs} \ + --max-num-batched-tokens ${max_num_batched_tokens} \ + --distributed-executor-backend mp \ + --trust-remote-code \ + --gpu-memory-utilization 0.6 \ + --moe-backend triton_unfused \ + --tokenizer-mode deepseek_v4 \ + --reasoning-parser deepseek_v4 \ + --async-scheduling \ + --enforce-eager +``` + +### Accuracy check (GSM8K, from PR) + +```bash +MODEL=/home/models/DeepSeek-V4-Pro +lm_eval --model local-completions \ + --model_args model=$MODEL,base_url=http://0.0.0.0:8001/v1/completions,num_concurrent=2,max_retries=10,max_gen_toks=2048,timeout=60000 \ + --batch_size auto \ + --tasks gsm8k \ + --num_fewshot 8 \ + --output_path . +``` + +Reported result: + +- `flexible-extract exact_match`: `0.9538` +- `strict-match exact_match`: `0.9545` + +## Reasoning modes + +DeepSeek-V4 exposes non-think / think-high / think-max via `chat_template_kwargs`. 
+ +```python +from openai import OpenAI + +client = OpenAI(base_url="http://localhost:8001/v1", api_key="EMPTY") +model = "deepseek-ai/DeepSeek-V4-Pro" +messages = [{"role": "user", "content": "What is 17*19? Return only the final integer."}] + +# Non-think +client.chat.completions.create(model=model, messages=messages) + +# Think high +client.chat.completions.create( + model=model, + messages=messages, + extra_body={"chat_template_kwargs": {"thinking": True, "reasoning_effort": "high"}}, +) + +# Think max (ensure sufficient max-model-len) +client.chat.completions.create( + model=model, + messages=messages, + extra_body={"chat_template_kwargs": {"thinking": True, "reasoning_effort": "max"}}, +) +``` + +## Tool calling + +Add these arguments to your serve command: + +```bash +--tokenizer-mode deepseek_v4 \ +--tool-call-parser deepseek_v4 \ +--enable-auto-tool-choice +``` + +## Speculative decoding (MTP) + +DeepSeek-V4 has native MTP support. On AMD, start conservatively and tune: + +```bash +--speculative-config '{"method":"mtp","num_speculative_tokens":1}' +``` + +If memory/throughput allows, test: + +```bash +--speculative-config '{"method":"mtp","num_speculative_tokens":2}' +``` + +## ROCm-specific notes from PR #40871 + +- ROCm path includes DeepSeek-V4 FP8 compatibility updates and E8M0 scale handling. +- ROCm execution disables some multi-stream paths to avoid known hang scenarios. +- For DeepSeek-V4 routing mode, `triton_unfused` is preferred for accuracy, with AITER as fallback. + +## Troubleshooting + +1. **`NotImplementedError: "mul_cuda" not implemented for 'Float8_e8m0fnu'`** + - Ensure you are using the PR build above (or a newer commit that includes ROCm E8M0 handling fixes). +2. **Model hangs during startup/load** + - Keep `--enforce-eager` enabled. + - Use `--moe-backend triton_unfused` on AMD. +3. **Tokenizer / reasoning mismatch** + - Verify `--tokenizer-mode deepseek_v4` and `--reasoning-parser deepseek_v4` are both set. 
+ diff --git a/models/deepseek-ai/DeepSeek-V4-Pro.yaml b/models/deepseek-ai/DeepSeek-V4-Pro.yaml index 8312008d..c2436e04 100644 --- a/models/deepseek-ai/DeepSeek-V4-Pro.yaml +++ b/models/deepseek-ai/DeepSeek-V4-Pro.yaml @@ -3,7 +3,7 @@ meta: slug: "deepseek-v4-pro" provider: "DeepSeek" description: "DeepSeek V4 flagship MoE (1.6T total / 49B active) with hybrid CSA+HCA attention, manifold-constrained hyper-connections, Muon-trained on 32T+ tokens, and three-tier reasoning." - date_updated: 2026-04-24 + date_updated: 2026-05-01 difficulty: hard tasks: - text @@ -17,7 +17,7 @@ meta: gb300: verified mi300x: unsupported mi325x: unsupported - mi355x: unsupported + mi355x: verified model: model_id: "deepseek-ai/DeepSeek-V4-Pro" @@ -109,6 +109,23 @@ hardware_overrides: - "--attention_config.use_fp4_indexer_cache=True" - "--moe-backend" - "deep_gemm_mega_moe" + amd: + extra_args: + - "--distributed-executor-backend" + - "mp" + - "--gpu-memory-utilization" + - "0.6" + - "--max-num-seqs" + - "128" + - "--max-num-batched-tokens" + - "8192" + - "--moe-backend" + - "triton_unfused" + - "--async-scheduling" + - "--enforce-eager" + extra_env: + VLLM_ROCM_USE_AITER: "1" + VLLM_ROCM_USE_AITER_LINEAR: "1" strategy_overrides: single_node_tp: @@ -252,6 +269,10 @@ guide: | - **H200 (8× GPU)**: DP + EP with `--data-parallel-size 8`. Context is capped at 800K tokens (`--max-model-len 800000`) to leave KV headroom with dense params replicated across ranks — applies to both single-node and multi-node H200. + - **MI355X (8× GPU)**: validated with ROCm + AITER (`VLLM_ROCM_USE_AITER=1`, + `VLLM_ROCM_USE_AITER_LINEAR=1`), `--moe-backend triton_unfused`, + `--gpu-memory-utilization 0.6`, `--max-num-seqs 128`, + `--max-num-batched-tokens 8192`, and `--distributed-executor-backend mp`. - **GB200 NVL4 (4× GPU per tray)**: the ~960 GB mixed-precision checkpoint does not fit on one tray; run multi-node DP + EP across **2 trays** (8 GPUs total) with `--data-parallel-size 8`. 
Pick the "Multi-Node" tab and set nodes to 2. From fcd1e7e18d0c4086f95ee33ce64dfcbabcbb6206 Mon Sep 17 00:00:00 2001 From: wuhuikx Date: Fri, 1 May 2026 05:24:01 -0500 Subject: [PATCH 02/10] Update the Deepseek-V4-Flash support Signed-off-by: wuhuikx --- models/deepseek-ai/DeepSeek-V4-Flash.yaml | 25 +++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/models/deepseek-ai/DeepSeek-V4-Flash.yaml b/models/deepseek-ai/DeepSeek-V4-Flash.yaml index 6bf204ed..b1064048 100644 --- a/models/deepseek-ai/DeepSeek-V4-Flash.yaml +++ b/models/deepseek-ai/DeepSeek-V4-Flash.yaml @@ -3,7 +3,7 @@ meta: slug: "deepseek-v4-flash" provider: "DeepSeek" description: "DeepSeek V4 MoE model with hybrid CSA+HCA attention, manifold-constrained hyper-connections, and three-tier reasoning (Non-think / Think High / Think Max)." - date_updated: 2026-04-24 + date_updated: 2026-05-01 difficulty: hard tasks: - text @@ -17,7 +17,7 @@ meta: gb300: verified mi300x: unsupported mi325x: unsupported - mi355x: unsupported + mi355x: verified model: model_id: "deepseek-ai/DeepSeek-V4-Flash" @@ -91,6 +91,22 @@ hardware_overrides: - "--attention_config.use_fp4_indexer_cache=True" - "--moe-backend" - "deep_gemm_mega_moe" + amd: + extra_args: + - "--distributed-executor-backend" + - "mp" + - "--gpu-memory-utilization" + - "0.35" + - "--max-num-seqs" + - "16" + - "--max-num-batched-tokens" + - "1024" + - "--moe-backend" + - "triton_unfused" + - "--async-scheduling" + - "--enforce-eager" + extra_env: + VLLM_ROCM_USE_AITER: "1" strategy_overrides: single_node_tp: @@ -228,6 +244,11 @@ guide: | replica on H200/B200/B300 (leaving headroom for throughput-vs-latency tuning). For disaggregated prefill/decode on GB200, use the PD Cluster tab. 
+ On **MI355X (8×288GB)**, validation used ROCm + AITER (`VLLM_ROCM_USE_AITER=1`), + `--distributed-executor-backend mp`, `--gpu-memory-utilization 0.35`, + `--max-num-seqs 16`, `--max-num-batched-tokens 1024`, + `--moe-backend triton_unfused`, `--async-scheduling`, and `--enforce-eager`. + ### H200 Single-Node PD (Mooncake) Single-host disaggregated serving: 4 prefill GPUs + 4 decode GPUs on one 8-GPU H200 node, From 2c8b04b9f7fc4c22ea0e0a87fb4f749172b4f479 Mon Sep 17 00:00:00 2001 From: wuhuikx Date: Fri, 1 May 2026 05:33:43 -0500 Subject: [PATCH 03/10] Update the feature matrix Signed-off-by: wuhuikx --- DeepSeek/DeepSeek-V4-AMD.md | 299 +++++++++++++----------------------- 1 file changed, 108 insertions(+), 191 deletions(-) diff --git a/DeepSeek/DeepSeek-V4-AMD.md b/DeepSeek/DeepSeek-V4-AMD.md index 2d4073fd..529538cf 100644 --- a/DeepSeek/DeepSeek-V4-AMD.md +++ b/DeepSeek/DeepSeek-V4-AMD.md @@ -1,150 +1,131 @@ # DeepSeek-V4 on AMD (ROCm) Usage Guide -This recipe mirrors the official DeepSeek-V4 recipe structure and is adapted for AMD ROCm based on [vllm-project/vllm#40871](https://github.com/vllm-project/vllm/pull/40871). +This page is aligned with the DeepSeek-V4-Pro recipe layout on recipes.vllm.ai and +captures the AMD MI355X validated settings from [vllm-project/vllm#40871](https://github.com/vllm-project/vllm/pull/40871). -## Scope +## Overview -This guide covers: +DeepSeek-V4-Pro is the flagship of the V4 preview family: a 1.6T-total / 49B-active +Mixture-of-Experts model. It pairs a **hybrid attention stack** — Compressed Sparse +Attention (CSA) + Heavily Compressed Attention (HCA) — with **Manifold-Constrained +Hyper-Connections (mHC)** to reach 27% of V3.2's per-token inference FLOPs and 10% of +V3.2's KV cache at 1M context. Pre-trained on 32T+ tokens with the **Muon optimizer** +for faster convergence; post-training is a two-stage pipeline (domain-specific expert +cultivation + unified consolidation via on-policy distillation). 
-- DeepSeek-V4-Flash on MI355X (online serving) -- DeepSeek-V4-Pro on MI355X (offline + online serving) -- Reasoning mode usage -- Tool calling flags -- MTP speculative decoding (experimental recommendation) +Checkpoint is **FP4+FP8 mixed**: MoE expert weights are stored in FP4 while the +remaining (attention / norm / router) params stay in FP8. -## Environment and Version +## Reasoning modes -At the time of writing, AMD DeepSeek-V4 support is under review upstream, so use the PR branch build: +The chat template exposes three reasoning-effort modes: -```bash -# inside ROCm container -pip uninstall -y vllm -git clone https://github.com/vllm-project/vllm.git -cd vllm -git fetch origin pull/40871/head:pr_dsv4 -git checkout pr_dsv4 -python3 setup.py develop -``` +- **Non-think** — fast, intuitive responses. +- **Think High** — explicit chain-of-thought for complex problem-solving and planning. +- **Think Max** — maximum reasoning effort; requires `--max-model-len >= 393216` + (384K tokens) to avoid truncation. -Reference runtime used in PR validation: +Recommended sampling: `temperature = 1.0`, `top_p = 1.0`. -- Docker image: `rocm/vllm-dev:nightly_main_20260423` -- Hardware: `MI355X` +### OpenAI Client Example -## DeepSeek-V4-Flash (MI355X) +For DeepSeek-V4, keep reasoning controls in `chat_template_kwargs`, as it exposes a +custom **Think Max** mode via `"reasoning_effort": "max"`. -### Launch +```python +from openai import OpenAI -```bash -max_num_seqs=16 -max_num_batched_tokens=1024 -tensor_parallel_size=4 +client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY") +model = "deepseek-ai/DeepSeek-V4-Pro" +messages = [{"role": "user", "content": "What is 17*19? 
Return only the final integer."}] -export HF_HOME=/data/huggingface-cache -export VLLM_ROCM_USE_AITER=1 -export VLLM_TORCH_PROFILER_DIR=/app/vllm_profile +# Non-think +resp = client.chat.completions.create( + model=model, + messages=messages, +) -MODEL=/home/models/DeepSeek-V4-Flash -vllm serve ${MODEL} \ - --host localhost \ - --port 8001 \ - --dtype auto \ - --tensor-parallel-size ${tensor_parallel_size} \ - --max-num-seqs ${max_num_seqs} \ - --max-num-batched-tokens ${max_num_batched_tokens} \ - --distributed-executor-backend mp \ - --trust-remote-code \ - --profiler-config '{"profiler":"torch","torch_profiler_dir":"./vllm_profile"}' \ - --gpu-memory-utilization 0.35 \ - --moe-backend triton_unfused \ - --tokenizer-mode deepseek_v4 \ - --async-scheduling \ - --enforce-eager +# Think High +resp = client.chat.completions.create( + model=model, + messages=messages, + extra_body={ + "chat_template_kwargs": { + "thinking": True, + "reasoning_effort": "high", + }, + }, +) + +# Think Max +resp = client.chat.completions.create( + model=model, + messages=messages, + extra_body={ + "chat_template_kwargs": { + "thinking": True, + "reasoning_effort": "max", + }, + }, +) ``` -### Smoke test +## Recommended deployments -```bash -curl -s http://localhost:8001/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "prompt": "Write me a poem about AMD and DeepSeek.", - "model": "/home/models/DeepSeek-V4-Flash", - "max_tokens": 100, - "temperature": 0.0 - }' -``` +- **B300 (8× GPU)**: single-node DP + EP with `--data-parallel-size 8`. +- **H200 (8× GPU)**: DP + EP with `--data-parallel-size 8`. Context is capped at + 800K tokens (`--max-model-len 800000`) to leave KV headroom with dense params + replicated across ranks — applies to both single-node and multi-node H200. 
+- **MI355X (8× GPU)**: validated with ROCm + AITER + (`VLLM_ROCM_USE_AITER=1`, `VLLM_ROCM_USE_AITER_LINEAR=1`), `--moe-backend triton_unfused`, + `--gpu-memory-utilization 0.6`, `--max-num-seqs 128`, + `--max-num-batched-tokens 8192`, and `--distributed-executor-backend mp`. +- **GB200 NVL4 (4× GPU per tray)**: the ~960 GB mixed-precision checkpoint does not + fit on one tray; run multi-node DP + EP across **2 trays** (8 GPUs total) with + `--data-parallel-size 8`. Pick the "Multi-Node" tab and set nodes to 2. -### Accuracy check (GSM8K, from PR) +## Feature matrix -```bash -MODEL=/home/models/DeepSeek-V4-Flash -lm_eval --model local-completions \ - --model_args model=$MODEL,base_url=http://0.0.0.0:8001/v1/completions,num_concurrent=4,max_retries=10,max_gen_toks=2048,timeout=60000 \ - --batch_size auto \ - --tasks gsm8k \ - --num_fewshot 8 \ - --output_path . -``` +The table below is a static equivalent of the interactive matrix shown on +recipes.vllm.ai (hardware / variant / strategy / features). 
-Reported result: +| Model | Hardware | Variant | Recommended strategies | Tool calling | Reasoning | Spec decoding | +| --- | --- | --- | --- | --- | --- | --- | +| DeepSeek-V4-Pro | MI355X (8x288GB) | FP8 (~960GB) | Tensor+Expert Parallel, Data+Expert Parallel | Yes (`deepseek_v4`) | Yes (`deepseek_v4`) | Yes (`mtp`) | +| DeepSeek-V4-Flash | MI355X (8x288GB) | FP8 (~170GB) | Tensor+Expert Parallel, Data+Expert Parallel | Yes (`deepseek_v4`) | Yes (`deepseek_v4`) | Yes (`mtp`) | -- `flexible-extract exact_match`: `0.9439` -- `strict-match exact_match`: `0.9431` +### MI355X recommended presets -## DeepSeek-V4-Pro (MI355X) +| Model | TP | Max num seqs | Max batched tokens | GPU memory utilization | Key ROCm env | +| --- | --- | ---: | ---: | ---: | --- | +| DeepSeek-V4-Pro | 8 | 128 | 8192 | 0.6 | `VLLM_ROCM_USE_AITER=1`, `VLLM_ROCM_USE_AITER_LINEAR=1` | +| DeepSeek-V4-Flash | 4 | 16 | 1024 | 0.35 | `VLLM_ROCM_USE_AITER=1` | -### Offline validation +### Feature toggles -```python -import os -from vllm import LLM, SamplingParams - -os.environ["VLLM_ROCM_USE_AITER"] = "1" -os.environ["VLLM_ROCM_USE_AITER_LINEAR"] = "1" - -prompts = ["What is 2+2? 
Answer:", "The capital of France is "] -sampling_params = SamplingParams(temperature=0, top_p=1, max_tokens=20) - -llm = LLM( - model="/home/models/DeepSeek-V4-Pro", - tensor_parallel_size=8, - kv_cache_dtype="fp8", - gpu_memory_utilization=0.6, - async_scheduling=True, - enforce_eager=True, - disable_log_stats=False, - tokenizer_mode="deepseek_v4", - moe_backend="triton_unfused", - reasoning_parser="deepseek_v4", -) +| Feature | Server args | +| --- | --- | +| Tool Calling | `--tokenizer-mode deepseek_v4 --tool-call-parser deepseek_v4 --enable-auto-tool-choice` | +| Reasoning | `--reasoning-parser deepseek_v4` | +| Spec Decoding | `--speculative-config '{"method":"mtp","num_speculative_tokens":1}'` (start) / `2` (tune) | -outputs = llm.generate(prompts, sampling_params) -for output in outputs: - print(output.prompt, output.outputs[0].text) -``` +## AMD validation command snippets -### Online serving +### DeepSeek-V4-Pro (MI355X, TP=8) ```bash -max_num_seqs=128 -max_num_batched_tokens=8192 -tensor_parallel_size=8 - export HF_HOME=/data/huggingface-cache export VLLM_ROCM_USE_AITER=1 export VLLM_ROCM_USE_AITER_LINEAR=1 -rm -rf /root/.cache/vllm/torch_compile_cache -MODEL=/home/models/DeepSeek-V4-Pro -vllm serve ${MODEL} \ +vllm serve /home/models/DeepSeek-V4-Pro \ --host localhost \ --port 8001 \ --dtype auto \ --kv-cache-dtype fp8 \ - --tensor-parallel-size ${tensor_parallel_size} \ - --max-num-seqs ${max_num_seqs} \ - --max-num-batched-tokens ${max_num_batched_tokens} \ + --tensor-parallel-size 8 \ + --max-num-seqs 128 \ + --max-num-batched-tokens 8192 \ --distributed-executor-backend mp \ --trust-remote-code \ --gpu-memory-utilization 0.6 \ @@ -155,89 +136,25 @@ vllm serve ${MODEL} \ --enforce-eager ``` -### Accuracy check (GSM8K, from PR) - -```bash -MODEL=/home/models/DeepSeek-V4-Pro -lm_eval --model local-completions \ - --model_args model=$MODEL,base_url=http://0.0.0.0:8001/v1/completions,num_concurrent=2,max_retries=10,max_gen_toks=2048,timeout=60000 \ - 
--batch_size auto \ - --tasks gsm8k \ - --num_fewshot 8 \ - --output_path . -``` - -Reported result: - -- `flexible-extract exact_match`: `0.9538` -- `strict-match exact_match`: `0.9545` - -## Reasoning modes - -DeepSeek-V4 exposes non-think / think-high / think-max via `chat_template_kwargs`. - -```python -from openai import OpenAI - -client = OpenAI(base_url="http://localhost:8001/v1", api_key="EMPTY") -model = "deepseek-ai/DeepSeek-V4-Pro" -messages = [{"role": "user", "content": "What is 17*19? Return only the final integer."}] - -# Non-think -client.chat.completions.create(model=model, messages=messages) - -# Think high -client.chat.completions.create( - model=model, - messages=messages, - extra_body={"chat_template_kwargs": {"thinking": True, "reasoning_effort": "high"}}, -) - -# Think max (ensure sufficient max-model-len) -client.chat.completions.create( - model=model, - messages=messages, - extra_body={"chat_template_kwargs": {"thinking": True, "reasoning_effort": "max"}}, -) -``` - -## Tool calling - -Add these arguments to your serve command: +### DeepSeek-V4-Flash (MI355X, TP=4) ```bash ---tokenizer-mode deepseek_v4 \ ---tool-call-parser deepseek_v4 \ ---enable-auto-tool-choice -``` - -## Speculative decoding (MTP) - -DeepSeek-V4 has native MTP support. 
On AMD, start conservatively and tune: - -```bash ---speculative-config '{"method":"mtp","num_speculative_tokens":1}' -``` - -If memory/throughput allows, test: +export HF_HOME=/data/huggingface-cache +export VLLM_ROCM_USE_AITER=1 -```bash ---speculative-config '{"method":"mtp","num_speculative_tokens":2}' +vllm serve /home/models/DeepSeek-V4-Flash \ + --host localhost \ + --port 8001 \ + --dtype auto \ + --tensor-parallel-size 4 \ + --max-num-seqs 16 \ + --max-num-batched-tokens 1024 \ + --distributed-executor-backend mp \ + --trust-remote-code \ + --gpu-memory-utilization 0.35 \ + --moe-backend triton_unfused \ + --tokenizer-mode deepseek_v4 \ + --async-scheduling \ + --enforce-eager ``` -## ROCm-specific notes from PR #40871 - -- ROCm path includes DeepSeek-V4 FP8 compatibility updates and E8M0 scale handling. -- ROCm execution disables some multi-stream paths to avoid known hang scenarios. -- For DeepSeek-V4 routing mode, `triton_unfused` is preferred for accuracy, with AITER as fallback. - -## Troubleshooting - -1. **`NotImplementedError: "mul_cuda" not implemented for 'Float8_e8m0fnu'`** - - Ensure you are using the PR build above (or a newer commit that includes ROCm E8M0 handling fixes). -2. **Model hangs during startup/load** - - Keep `--enforce-eager` enabled. - - Use `--moe-backend triton_unfused` on AMD. -3. **Tokenizer / reasoning mismatch** - - Verify `--tokenizer-mode deepseek_v4` and `--reasoning-parser deepseek_v4` are both set. 
- From 038809e332e646453d777dcd325eb2b294de094d Mon Sep 17 00:00:00 2001 From: wuhuikx Date: Fri, 1 May 2026 05:39:41 -0500 Subject: [PATCH 04/10] Update the recipe Signed-off-by: wuhuikx --- DeepSeek/DeepSeek-V4-AMD.md | 160 +++++++++++++++++++++--------------- 1 file changed, 94 insertions(+), 66 deletions(-) diff --git a/DeepSeek/DeepSeek-V4-AMD.md b/DeepSeek/DeepSeek-V4-AMD.md index 529538cf..a3d514a8 100644 --- a/DeepSeek/DeepSeek-V4-AMD.md +++ b/DeepSeek/DeepSeek-V4-AMD.md @@ -16,73 +16,12 @@ cultivation + unified consolidation via on-policy distillation). Checkpoint is **FP4+FP8 mixed**: MoE expert weights are stored in FP4 while the remaining (attention / norm / router) params stay in FP8. -## Reasoning modes - -The chat template exposes three reasoning-effort modes: - -- **Non-think** — fast, intuitive responses. -- **Think High** — explicit chain-of-thought for complex problem-solving and planning. -- **Think Max** — maximum reasoning effort; requires `--max-model-len >= 393216` - (384K tokens) to avoid truncation. - -Recommended sampling: `temperature = 1.0`, `top_p = 1.0`. - -### OpenAI Client Example - -For DeepSeek-V4, keep reasoning controls in `chat_template_kwargs`, as it exposes a -custom **Think Max** mode via `"reasoning_effort": "max"`. - -```python -from openai import OpenAI - -client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY") -model = "deepseek-ai/DeepSeek-V4-Pro" -messages = [{"role": "user", "content": "What is 17*19? 
Return only the final integer."}] - -# Non-think -resp = client.chat.completions.create( - model=model, - messages=messages, -) - -# Think High -resp = client.chat.completions.create( - model=model, - messages=messages, - extra_body={ - "chat_template_kwargs": { - "thinking": True, - "reasoning_effort": "high", - }, - }, -) - -# Think Max -resp = client.chat.completions.create( - model=model, - messages=messages, - extra_body={ - "chat_template_kwargs": { - "thinking": True, - "reasoning_effort": "max", - }, - }, -) -``` - ## Recommended deployments -- **B300 (8× GPU)**: single-node DP + EP with `--data-parallel-size 8`. -- **H200 (8× GPU)**: DP + EP with `--data-parallel-size 8`. Context is capped at - 800K tokens (`--max-model-len 800000`) to leave KV headroom with dense params - replicated across ranks — applies to both single-node and multi-node H200. - **MI355X (8× GPU)**: validated with ROCm + AITER (`VLLM_ROCM_USE_AITER=1`, `VLLM_ROCM_USE_AITER_LINEAR=1`), `--moe-backend triton_unfused`, `--gpu-memory-utilization 0.6`, `--max-num-seqs 128`, `--max-num-batched-tokens 8192`, and `--distributed-executor-backend mp`. -- **GB200 NVL4 (4× GPU per tray)**: the ~960 GB mixed-precision checkpoint does not - fit on one tray; run multi-node DP + EP across **2 trays** (8 GPUs total) with - `--data-parallel-size 8`. Pick the "Multi-Node" tab and set nodes to 2. ## Feature matrix @@ -91,15 +30,15 @@ recipes.vllm.ai (hardware / variant / strategy / features). 
| Model | Hardware | Variant | Recommended strategies | Tool calling | Reasoning | Spec decoding | | --- | --- | --- | --- | --- | --- | --- | -| DeepSeek-V4-Pro | MI355X (8x288GB) | FP8 (~960GB) | Tensor+Expert Parallel, Data+Expert Parallel | Yes (`deepseek_v4`) | Yes (`deepseek_v4`) | Yes (`mtp`) | -| DeepSeek-V4-Flash | MI355X (8x288GB) | FP8 (~170GB) | Tensor+Expert Parallel, Data+Expert Parallel | Yes (`deepseek_v4`) | Yes (`deepseek_v4`) | Yes (`mtp`) | +| [DeepSeek-V4-Pro](https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro) | MI355X (8x288GB) | FP8 (~960GB) | Tensor Parallel (TP) | Yes (`deepseek_v4`) | Yes (`deepseek_v4`) | No (`false`) | +| [DeepSeek-V4-Flash](https://huggingface.co/deepseek-ai/DeepSeek-V4-Flash) | MI355X (8x288GB) | FP8 (~170GB) | Tensor Parallel (TP) | Yes (`deepseek_v4`) | Yes (`deepseek_v4`) | No (`false`) | ### MI355X recommended presets | Model | TP | Max num seqs | Max batched tokens | GPU memory utilization | Key ROCm env | | --- | --- | ---: | ---: | ---: | --- | -| DeepSeek-V4-Pro | 8 | 128 | 8192 | 0.6 | `VLLM_ROCM_USE_AITER=1`, `VLLM_ROCM_USE_AITER_LINEAR=1` | -| DeepSeek-V4-Flash | 4 | 16 | 1024 | 0.35 | `VLLM_ROCM_USE_AITER=1` | +| [DeepSeek-V4-Pro](https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro) | 8 | 128 | 8192 | 0.6 | `VLLM_ROCM_USE_AITER=1`, `VLLM_ROCM_USE_AITER_LINEAR=1` | +| [DeepSeek-V4-Flash](https://huggingface.co/deepseek-ai/DeepSeek-V4-Flash) | 4 | 16 | 1024 | 0.35 | `VLLM_ROCM_USE_AITER=1` | ### Feature toggles @@ -107,7 +46,7 @@ recipes.vllm.ai (hardware / variant / strategy / features). 
| --- | --- | | Tool Calling | `--tokenizer-mode deepseek_v4 --tool-call-parser deepseek_v4 --enable-auto-tool-choice` | | Reasoning | `--reasoning-parser deepseek_v4` | -| Spec Decoding | `--speculative-config '{"method":"mtp","num_speculative_tokens":1}'` (start) / `2` (tune) | +| Spec Decoding | Disabled (`false`) | ## AMD validation command snippets @@ -158,3 +97,92 @@ vllm serve /home/models/DeepSeek-V4-Flash \ --enforce-eager ``` +## Smoke test (single request) + +### DeepSeek-V4-Flash + +```bash +curl -s http://localhost:8001/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "prompt": "Write me a poem about AMD and DeepSeek", + "model": "/home/models/DeepSeek-V4-Flash", + "max_tokens": 100, + "temperature": 0.0 + }' +``` + +Sample result from PR validation (truncated): + +```json +{ + "object": "text_completion", + "model": "/home/models/DeepSeek-V4-Flash", + "choices": [ + { + "finish_reason": "length", + "text": "\"... Here's a poem about AMD and DeepSeek: ...\"" + } + ], + "usage": { + "prompt_tokens": 9, + "completion_tokens": 100, + "total_tokens": 109 + } +} +``` + +### DeepSeek-V4-Pro + +```bash +curl -s http://localhost:8001/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "prompt": "What is 2+2? Return only the final integer.", + "model": "/home/models/DeepSeek-V4-Pro", + "max_tokens": 16, + "temperature": 0.0 + }' +``` + +Smoke-test success criteria: + +- HTTP status is `200` +- `choices[0].text` is non-empty + +## GSM8K validation (command + result) + +### DeepSeek-V4-Flash + +```bash +MODEL=/home/models/DeepSeek-V4-Flash +lm_eval --model local-completions \ + --model_args model=$MODEL,base_url=http://0.0.0.0:8001/v1/completions,num_concurrent=4,max_retries=10,max_gen_toks=2048,timeout=60000 \ + --batch_size auto \ + --tasks gsm8k \ + --num_fewshot 8 \ + --output_path . 
+``` + +Reported result from PR #40871: + +- `flexible-extract exact_match`: `0.9439` +- `strict-match exact_match`: `0.9431` + +### DeepSeek-V4-Pro + +```bash +MODEL=/home/models/DeepSeek-V4-Pro +lm_eval --model local-completions \ + --model_args model=$MODEL,base_url=http://0.0.0.0:8001/v1/completions,num_concurrent=2,max_retries=10,max_gen_toks=2048,timeout=60000 \ + --batch_size auto \ + --tasks gsm8k \ + --num_fewshot 8 \ + --output_path . +``` + +Reported result from PR #40871: + +- `flexible-extract exact_match`: `0.9538` +- `strict-match exact_match`: `0.9545` + From 3d41f2ee5b1a7ecb9e5bbd7d6bc2e395557b6af3 Mon Sep 17 00:00:00 2001 From: wuhuikx Date: Fri, 1 May 2026 07:41:30 -0500 Subject: [PATCH 05/10] Update the smoke test result Signed-off-by: wuhuikx --- DeepSeek/DeepSeek-V4-AMD.md | 108 +++++++++------------- models/deepseek-ai/DeepSeek-V4-Flash.yaml | 46 +++++++++ models/deepseek-ai/DeepSeek-V4-Pro.yaml | 49 ++++++++++ 3 files changed, 141 insertions(+), 62 deletions(-) diff --git a/DeepSeek/DeepSeek-V4-AMD.md b/DeepSeek/DeepSeek-V4-AMD.md index a3d514a8..9ce74980 100644 --- a/DeepSeek/DeepSeek-V4-AMD.md +++ b/DeepSeek/DeepSeek-V4-AMD.md @@ -48,9 +48,9 @@ recipes.vllm.ai (hardware / variant / strategy / features). | Reasoning | `--reasoning-parser deepseek_v4` | | Spec Decoding | Disabled (`false`) | -## AMD validation command snippets +## DeepSeek-V4-Pro validation (MI355X, TP=8) -### DeepSeek-V4-Pro (MI355X, TP=8) +### 1) Serve command ```bash export HF_HOME=/data/huggingface-cache @@ -75,7 +75,44 @@ vllm serve /home/models/DeepSeek-V4-Pro \ --enforce-eager ``` -### DeepSeek-V4-Flash (MI355X, TP=4) +### 2) Smoke test (single request) + +```bash +curl -s http://localhost:8001/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "prompt": "What is 2+2? 
Return only the final integer.", + "model": "/home/models/DeepSeek-V4-Pro", + "max_tokens": 16, + "temperature": 0.0 + }' +``` + +Smoke-test success criteria: + +- HTTP status is `200` +- `choices[0].text` is non-empty + +### 3) GSM8K validation + +```bash +MODEL=/home/models/DeepSeek-V4-Pro +lm_eval --model local-completions \ + --model_args model=$MODEL,base_url=http://0.0.0.0:8001/v1/completions,num_concurrent=2,max_retries=10,max_gen_toks=2048,timeout=60000 \ + --batch_size auto \ + --tasks gsm8k \ + --num_fewshot 8 \ + --output_path . +``` + +Reported result from PR #40871: + +- `flexible-extract exact_match`: `0.9538` +- `strict-match exact_match`: `0.9545` + +## DeepSeek-V4-Flash validation (MI355X, TP=4) + +### 1) Serve command ```bash export HF_HOME=/data/huggingface-cache @@ -97,62 +134,26 @@ vllm serve /home/models/DeepSeek-V4-Flash \ --enforce-eager ``` -## Smoke test (single request) - -### DeepSeek-V4-Flash +### 2) Smoke test (single request) ```bash curl -s http://localhost:8001/v1/completions \ -H "Content-Type: application/json" \ -d '{ - "prompt": "Write me a poem about AMD and DeepSeek", - "model": "/home/models/DeepSeek-V4-Flash", + "prompt": "Introduce the capital of US", + "model": "/models/DeepSeek-V4-Flash", "max_tokens": 100, "temperature": 0.0 }' ``` -Sample result from PR validation (truncated): +Sample result: ```json -{ - "object": "text_completion", - "model": "/home/models/DeepSeek-V4-Flash", - "choices": [ - { - "finish_reason": "length", - "text": "\"... 
Here's a poem about AMD and DeepSeek: ...\"" - } - ], - "usage": { - "prompt_tokens": 9, - "completion_tokens": 100, - "total_tokens": 109 - } -} +{"id":"cmpl-86e0959d4415d914","object":"text_completion","created":1777638722,"model":"/models/DeepSeek-V4-Flash","choices":[{"index":0,"text":"\",\"answer\":\"Washington, D.C.\",\"type\":\"text\"},{\"question\":\"Introduce the capital of Canada.\",\"answer\":\"Ottawa\",\"type\":\"text\"},{\"question\":\"Introduce the capital of Mexico.\",\"answer\":\"Mexico City\",\"type\":\"text\"},{\"question\":\"Introduce the capital of Brazil.\",\"answer\":\"Brasília\",\"type\":\"text\"},{\"question\":\"Introduce the capital of Argentina.\",\"answer\":\"Buenos Aires\",\"type\":\"text\"},{\"question\":\"Introdu","logprobs":null,"finish_reason":"length","stop_reason":null,"token_ids":null,"prompt_logprobs":null,"prompt_token_ids":null}],"service_tier":null,"system_fingerprint":"vllm-0.20.1rc1.dev135+ge786a2dfc-tp4-015676fd","usage":{"prompt_tokens":7,"total_tokens":107,"completion_tokens":100,"prompt_tokens_details":null},"kv_transfer_params":null} ``` -### DeepSeek-V4-Pro - -```bash -curl -s http://localhost:8001/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "prompt": "What is 2+2? 
Return only the final integer.", - "model": "/home/models/DeepSeek-V4-Pro", - "max_tokens": 16, - "temperature": 0.0 - }' -``` - -Smoke-test success criteria: - -- HTTP status is `200` -- `choices[0].text` is non-empty - -## GSM8K validation (command + result) - -### DeepSeek-V4-Flash +### 3) GSM8K validation ```bash MODEL=/home/models/DeepSeek-V4-Flash @@ -169,20 +170,3 @@ Reported result from PR #40871: - `flexible-extract exact_match`: `0.9439` - `strict-match exact_match`: `0.9431` -### DeepSeek-V4-Pro - -```bash -MODEL=/home/models/DeepSeek-V4-Pro -lm_eval --model local-completions \ - --model_args model=$MODEL,base_url=http://0.0.0.0:8001/v1/completions,num_concurrent=2,max_retries=10,max_gen_toks=2048,timeout=60000 \ - --batch_size auto \ - --tasks gsm8k \ - --num_fewshot 8 \ - --output_path . -``` - -Reported result from PR #40871: - -- `flexible-extract exact_match`: `0.9538` -- `strict-match exact_match`: `0.9545` - diff --git a/models/deepseek-ai/DeepSeek-V4-Flash.yaml b/models/deepseek-ai/DeepSeek-V4-Flash.yaml index b1064048..235c1435 100644 --- a/models/deepseek-ai/DeepSeek-V4-Flash.yaml +++ b/models/deepseek-ai/DeepSeek-V4-Flash.yaml @@ -249,6 +249,52 @@ guide: | `--max-num-seqs 16`, `--max-num-batched-tokens 1024`, `--moe-backend triton_unfused`, `--async-scheduling`, and `--enforce-eager`. 
+ ## GSM8K validation (MI355X) + + Launch command (TP=4): + + ```bash + export HF_HOME=/data/huggingface-cache + export VLLM_ROCM_USE_AITER=1 + + vllm serve /home/models/DeepSeek-V4-Flash \ + --host localhost \ + --port 8001 \ + --dtype auto \ + --tensor-parallel-size 4 \ + --max-num-seqs 16 \ + --max-num-batched-tokens 1024 \ + --distributed-executor-backend mp \ + --trust-remote-code \ + --gpu-memory-utilization 0.35 \ + --moe-backend triton_unfused \ + --tokenizer-mode deepseek_v4 \ + --async-scheduling \ + --enforce-eager + ``` + + GSM8K command: + + ```bash + MODEL=/home/models/DeepSeek-V4-Flash + lm_eval --model local-completions \ + --model_args model=$MODEL,base_url=http://0.0.0.0:8001/v1/completions,num_concurrent=4,max_retries=10,max_gen_toks=2048,timeout=60000 \ + --batch_size auto \ + --tasks gsm8k \ + --num_fewshot 8 \ + --output_path . 2>&1 | tee -a eval.log + ``` + + Reported result from PR #40871: + + ```text + local-completions ({'model': '/home/models/DeepSeek-V4-Flash', 'base_url': 'http://0.0.0.0:8001/v1/completions', 'num_concurrent': 4, 'max_retries': 10, 'max_gen_toks': 2048, 'timeout': 60000}), gen_kwargs: ({}), limit: None, num_fewshot: 8, batch_size: auto + |Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr| + |-----|------:|----------------|-----:|-----------|---|-----:|---|-----:| + |gsm8k| 3|flexible-extract| 8|exact_match|↑ |0.9439|± |0.0063| + | | |strict-match | 8|exact_match|↑ |0.9431|± |0.0064| + ``` + ### H200 Single-Node PD (Mooncake) Single-host disaggregated serving: 4 prefill GPUs + 4 decode GPUs on one 8-GPU H200 node, diff --git a/models/deepseek-ai/DeepSeek-V4-Pro.yaml b/models/deepseek-ai/DeepSeek-V4-Pro.yaml index c2436e04..a68354ea 100644 --- a/models/deepseek-ai/DeepSeek-V4-Pro.yaml +++ b/models/deepseek-ai/DeepSeek-V4-Pro.yaml @@ -276,3 +276,52 @@ guide: | - **GB200 NVL4 (4× GPU per tray)**: the ~960 GB mixed-precision checkpoint does not fit on one tray; run multi-node DP + EP across **2 trays** (8 GPUs total) 
with `--data-parallel-size 8`. Pick the "Multi-Node" tab and set nodes to 2. + + ## GSM8K validation (MI355X) + + Launch command (TP=8): + + ```bash + export HF_HOME=/data/huggingface-cache + export VLLM_ROCM_USE_AITER=1 + export VLLM_ROCM_USE_AITER_LINEAR=1 + + vllm serve /home/models/DeepSeek-V4-Pro \ + --host localhost \ + --port 8001 \ + --dtype auto \ + --kv-cache-dtype fp8 \ + --tensor-parallel-size 8 \ + --max-num-seqs 128 \ + --max-num-batched-tokens 8192 \ + --distributed-executor-backend mp \ + --trust-remote-code \ + --gpu-memory-utilization 0.6 \ + --moe-backend triton_unfused \ + --tokenizer-mode deepseek_v4 \ + --reasoning-parser deepseek_v4 \ + --async-scheduling \ + --enforce-eager + ``` + + GSM8K command: + + ```bash + MODEL=/home/models/DeepSeek-V4-Pro + lm_eval --model local-completions \ + --model_args model=$MODEL,base_url=http://0.0.0.0:8001/v1/completions,num_concurrent=2,max_retries=10,max_gen_toks=2048,timeout=60000 \ + --batch_size auto \ + --tasks gsm8k \ + --num_fewshot 8 \ + --output_path . 
2>&1 | tee -a eval.log + ``` + + Reported result from PR #40871: + + ```text + local-completions ({'model': '/home/models/DeepSeek-V4-Pro', 'base_url': 'http://0.0.0.0:8001/v1/completions', 'num_concurrent': 2, 'max_retries': 10, 'max_gen_toks': 2048, 'timeout': 60000}), gen_kwargs: ({}), limit: None, num_fewshot: 8, batch_size: auto + |Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr| + |-----|------:|----------------|-----:|-----------|---|-----:|---|-----:| + |gsm8k| 3|flexible-extract| 8|exact_match|↑ |0.9538|± |0.0058| + | | |strict-match | 8|exact_match|↑ |0.9545|± |0.0057| + ``` From 9271152da524b69bee81e95c2c93e4374ffd2ee1 Mon Sep 17 00:00:00 2001 From: wuhuikx Date: Fri, 1 May 2026 08:12:25 -0500 Subject: [PATCH 06/10] Update the GSM8K result Signed-off-by: wuhuikx --- DeepSeek/DeepSeek-V4-AMD.md | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/DeepSeek/DeepSeek-V4-AMD.md b/DeepSeek/DeepSeek-V4-AMD.md index 9ce74980..60925974 100644 --- a/DeepSeek/DeepSeek-V4-AMD.md +++ b/DeepSeek/DeepSeek-V4-AMD.md @@ -88,6 +88,12 @@ curl -s http://localhost:8001/v1/completions \ }' ``` +Sample result: + +```json +{"id":"cmpl-973e09361657d259","object":"text_completion","created":1777640598,"model":"/models/DeepSeek-V4-Pro","choices":[{"index":0,"text":" Do not include any other text or explanation. 
The answer is 4.\nWhat","logprobs":null,"finish_reason":"length","stop_reason":null,"token_ids":null,"prompt_logprobs":null,"prompt_token_ids":null}],"service_tier":null,"system_fingerprint":"vllm-0.20.1rc1.dev135+ge786a2dfc-tp8-868a6cb7","usage":{"prompt_tokens":13,"total_tokens":29,"completion_tokens":16,"prompt_tokens_details":null},"kv_transfer_params":null} +``` + Smoke-test success criteria: - HTTP status is `200` @@ -107,8 +113,12 @@ lm_eval --model local-completions \ Reported result from PR #40871: -- `flexible-extract exact_match`: `0.9538` -- `strict-match exact_match`: `0.9545` +```text +|Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr| +|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:| +|gsm8k| 3|flexible-extract| 8|exact_match|↑ |0.9538|± |0.0058| +| | |strict-match | 8|exact_match|↑ |0.9545|± |0.0057| +``` ## DeepSeek-V4-Flash validation (MI355X, TP=4) @@ -167,6 +177,10 @@ lm_eval --model local-completions \ Reported result from PR #40871: -- `flexible-extract exact_match`: `0.9439` -- `strict-match exact_match`: `0.9431` +```text +|Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr| +|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:| +|gsm8k| 3|flexible-extract| 8|exact_match|↑ |0.9439|± |0.0063| +| | |strict-match | 8|exact_match|↑ |0.9431|± |0.0064| +``` From 0e0ada6c90d1b1515ecb3027cf76bbaba2509560 Mon Sep 17 00:00:00 2001 From: wuhuikx Date: Fri, 1 May 2026 08:19:11 -0500 Subject: [PATCH 07/10] Remove the smoke result Signed-off-by: wuhuikx --- DeepSeek/DeepSeek-V4-AMD.md | 47 ++----------------------------------- 1 file changed, 2 insertions(+), 45 deletions(-) diff --git a/DeepSeek/DeepSeek-V4-AMD.md b/DeepSeek/DeepSeek-V4-AMD.md index 60925974..cd133bfa 100644 --- a/DeepSeek/DeepSeek-V4-AMD.md +++ b/DeepSeek/DeepSeek-V4-AMD.md @@ -75,31 +75,7 @@ vllm serve /home/models/DeepSeek-V4-Pro \ --enforce-eager ``` -### 2) Smoke test (single request) - -```bash -curl -s 
http://localhost:8001/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "prompt": "What is 2+2? Return only the final integer.", - "model": "/home/models/DeepSeek-V4-Pro", - "max_tokens": 16, - "temperature": 0.0 - }' -``` - -Sample result: - -```json -{"id":"cmpl-973e09361657d259","object":"text_completion","created":1777640598,"model":"/models/DeepSeek-V4-Pro","choices":[{"index":0,"text":" Do not include any other text or explanation. The answer is 4.\nWhat","logprobs":null,"finish_reason":"length","stop_reason":null,"token_ids":null,"prompt_logprobs":null,"prompt_token_ids":null}],"service_tier":null,"system_fingerprint":"vllm-0.20.1rc1.dev135+ge786a2dfc-tp8-868a6cb7","usage":{"prompt_tokens":13,"total_tokens":29,"completion_tokens":16,"prompt_tokens_details":null},"kv_transfer_params":null} -``` - -Smoke-test success criteria: - -- HTTP status is `200` -- `choices[0].text` is non-empty - -### 3) GSM8K validation +### 2) GSM8K validation ```bash MODEL=/home/models/DeepSeek-V4-Pro @@ -144,26 +120,7 @@ vllm serve /home/models/DeepSeek-V4-Flash \ --enforce-eager ``` -### 2) Smoke test (single request) - -```bash -curl -s http://localhost:8001/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "prompt": "Introduce the capital of US", - "model": "/models/DeepSeek-V4-Flash", - "max_tokens": 100, - "temperature": 0.0 - }' -``` - -Sample result: - -```json -{"id":"cmpl-86e0959d4415d914","object":"text_completion","created":1777638722,"model":"/models/DeepSeek-V4-Flash","choices":[{"index":0,"text":"\",\"answer\":\"Washington, D.C.\",\"type\":\"text\"},{\"question\":\"Introduce the capital of Canada.\",\"answer\":\"Ottawa\",\"type\":\"text\"},{\"question\":\"Introduce the capital of Mexico.\",\"answer\":\"Mexico City\",\"type\":\"text\"},{\"question\":\"Introduce the capital of Brazil.\",\"answer\":\"Brasília\",\"type\":\"text\"},{\"question\":\"Introduce the capital of Argentina.\",\"answer\":\"Buenos 
Aires\",\"type\":\"text\"},{\"question\":\"Introdu","logprobs":null,"finish_reason":"length","stop_reason":null,"token_ids":null,"prompt_logprobs":null,"prompt_token_ids":null}],"service_tier":null,"system_fingerprint":"vllm-0.20.1rc1.dev135+ge786a2dfc-tp4-015676fd","usage":{"prompt_tokens":7,"total_tokens":107,"completion_tokens":100,"prompt_tokens_details":null},"kv_transfer_params":null} -``` - -### 3) GSM8K validation +### 2) GSM8K validation ```bash MODEL=/home/models/DeepSeek-V4-Flash From a6f74b805767ff6442823183e34ca39d40ee0cc1 Mon Sep 17 00:00:00 2001 From: wuhuikx Date: Fri, 1 May 2026 11:36:00 -0500 Subject: [PATCH 08/10] Add docker info Signed-off-by: wuhuikx --- DeepSeek/DeepSeek-V4-AMD.md | 16 ++++++++++++++++ models/deepseek-ai/DeepSeek-V4-Flash.yaml | 2 ++ models/deepseek-ai/DeepSeek-V4-Pro.yaml | 2 ++ 3 files changed, 20 insertions(+) diff --git a/DeepSeek/DeepSeek-V4-AMD.md b/DeepSeek/DeepSeek-V4-AMD.md index cd133bfa..25455237 100644 --- a/DeepSeek/DeepSeek-V4-AMD.md +++ b/DeepSeek/DeepSeek-V4-AMD.md @@ -16,6 +16,12 @@ cultivation + unified consolidation via on-policy distillation). Checkpoint is **FP4+FP8 mixed**: MoE expert weights are stored in FP4 while the remaining (attention / norm / router) params stay in FP8. 
+## Docker image (AMD ROCm) + +```bash +docker pull rocm/vllm-dev:deepseek-v4-latest +``` + ## Recommended deployments - **MI355X (8× GPU)**: validated with ROCm + AITER @@ -141,3 +147,13 @@ Reported result from PR #40871: | | |strict-match | 8|exact_match|↑ |0.9431|± |0.0064| ``` +## Related PR links + +- [Functionality] Base PR is functionality/accuracy ready on MI35x for both + DeepSeek-V4-Pro and DeepSeek-V4-Flash; lm_eval passed on full GSM8K: + [Ready to merge, #40871](https://github.com/vllm-project/vllm/pull/40871) +- [Functionality] MI300 support PR: + [#41451](https://github.com/vllm-project/vllm/pull/41451) +- [Performance] MLA Indexer optimization for DeepSeek-V4 and DeepSeek-V3.2 (ROCm): + [#41217](https://github.com/vllm-project/vllm/pull/41217) + diff --git a/models/deepseek-ai/DeepSeek-V4-Flash.yaml b/models/deepseek-ai/DeepSeek-V4-Flash.yaml index 235c1435..ac8bb855 100644 --- a/models/deepseek-ai/DeepSeek-V4-Flash.yaml +++ b/models/deepseek-ai/DeepSeek-V4-Flash.yaml @@ -22,6 +22,8 @@ meta: model: model_id: "deepseek-ai/DeepSeek-V4-Flash" min_vllm_version: "0.20.0" + docker_image: + amd: "rocm/vllm-dev:deepseek-v4-latest" architecture: moe parameter_count: "284B" active_parameters: "13B" diff --git a/models/deepseek-ai/DeepSeek-V4-Pro.yaml b/models/deepseek-ai/DeepSeek-V4-Pro.yaml index a68354ea..adfdcc76 100644 --- a/models/deepseek-ai/DeepSeek-V4-Pro.yaml +++ b/models/deepseek-ai/DeepSeek-V4-Pro.yaml @@ -22,6 +22,8 @@ meta: model: model_id: "deepseek-ai/DeepSeek-V4-Pro" min_vllm_version: "0.20.0" + docker_image: + amd: "rocm/vllm-dev:deepseek-v4-latest" architecture: moe parameter_count: "1600B" active_parameters: "49B" From 13db1970e2b11e9426937e4b80344b665531ad2f Mon Sep 17 00:00:00 2001 From: wuhuikx Date: Wed, 6 May 2026 10:10:00 -0500 Subject: [PATCH 09/10] Update the docker to vllm/vllm-open-rocm:nightly Signed-off-by: wuhuikx --- DeepSeek/DeepSeek-V4-AMD.md | 2 +- models/deepseek-ai/DeepSeek-V4-Flash.yaml | 2 +- 
models/deepseek-ai/DeepSeek-V4-Pro.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/DeepSeek/DeepSeek-V4-AMD.md b/DeepSeek/DeepSeek-V4-AMD.md index 25455237..5c2933ca 100644 --- a/DeepSeek/DeepSeek-V4-AMD.md +++ b/DeepSeek/DeepSeek-V4-AMD.md @@ -19,7 +19,7 @@ remaining (attention / norm / router) params stay in FP8. ## Docker image (AMD ROCm) ```bash -docker pull rocm/vllm-dev:deepseek-v4-latest +docker pull vllm/vllm-open-rocm:nightly ``` ## Recommended deployments diff --git a/models/deepseek-ai/DeepSeek-V4-Flash.yaml b/models/deepseek-ai/DeepSeek-V4-Flash.yaml index ac8bb855..ac4cf33f 100644 --- a/models/deepseek-ai/DeepSeek-V4-Flash.yaml +++ b/models/deepseek-ai/DeepSeek-V4-Flash.yaml @@ -23,7 +23,7 @@ model: model_id: "deepseek-ai/DeepSeek-V4-Flash" min_vllm_version: "0.20.0" docker_image: - amd: "rocm/vllm-dev:deepseek-v4-latest" + amd: "vllm/vllm-open-rocm:nightly" architecture: moe parameter_count: "284B" active_parameters: "13B" diff --git a/models/deepseek-ai/DeepSeek-V4-Pro.yaml b/models/deepseek-ai/DeepSeek-V4-Pro.yaml index adfdcc76..d65e8940 100644 --- a/models/deepseek-ai/DeepSeek-V4-Pro.yaml +++ b/models/deepseek-ai/DeepSeek-V4-Pro.yaml @@ -23,7 +23,7 @@ model: model_id: "deepseek-ai/DeepSeek-V4-Pro" min_vllm_version: "0.20.0" docker_image: - amd: "rocm/vllm-dev:deepseek-v4-latest" + amd: "vllm/vllm-open-rocm:nightly" architecture: moe parameter_count: "1600B" active_parameters: "49B" From d6dc5cc57037974047caa029f4f0d3e08402005b Mon Sep 17 00:00:00 2001 From: wuhuikx Date: Wed, 6 May 2026 21:44:22 -0500 Subject: [PATCH 10/10] Update the docker to vllm/vllm-openai-rocm:nightly Signed-off-by: wuhuikx --- DeepSeek/DeepSeek-V4-AMD.md | 2 +- models/deepseek-ai/DeepSeek-V4-Flash.yaml | 2 +- models/deepseek-ai/DeepSeek-V4-Pro.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/DeepSeek/DeepSeek-V4-AMD.md b/DeepSeek/DeepSeek-V4-AMD.md index 5c2933ca..2830f09f 100644 --- a/DeepSeek/DeepSeek-V4-AMD.md +++ 
b/DeepSeek/DeepSeek-V4-AMD.md @@ -19,7 +19,7 @@ remaining (attention / norm / router) params stay in FP8. ## Docker image (AMD ROCm) ```bash -docker pull vllm/vllm-open-rocm:nightly +docker pull vllm/vllm-openai-rocm:nightly ``` ## Recommended deployments diff --git a/models/deepseek-ai/DeepSeek-V4-Flash.yaml b/models/deepseek-ai/DeepSeek-V4-Flash.yaml index ac4cf33f..067fb1fe 100644 --- a/models/deepseek-ai/DeepSeek-V4-Flash.yaml +++ b/models/deepseek-ai/DeepSeek-V4-Flash.yaml @@ -23,7 +23,7 @@ model: model_id: "deepseek-ai/DeepSeek-V4-Flash" min_vllm_version: "0.20.0" docker_image: - amd: "vllm/vllm-open-rocm:nightly" + amd: "vllm/vllm-openai-rocm:nightly" architecture: moe parameter_count: "284B" active_parameters: "13B" diff --git a/models/deepseek-ai/DeepSeek-V4-Pro.yaml b/models/deepseek-ai/DeepSeek-V4-Pro.yaml index d65e8940..1c6e6adc 100644 --- a/models/deepseek-ai/DeepSeek-V4-Pro.yaml +++ b/models/deepseek-ai/DeepSeek-V4-Pro.yaml @@ -23,7 +23,7 @@ model: model_id: "deepseek-ai/DeepSeek-V4-Pro" min_vllm_version: "0.20.0" docker_image: - amd: "vllm/vllm-open-rocm:nightly" + amd: "vllm/vllm-openai-rocm:nightly" architecture: moe parameter_count: "1600B" active_parameters: "49B"