From dacf068c6b7a2311dae0bff299dd17e41071817f Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Wed, 13 May 2026 13:04:24 -0700 Subject: [PATCH 1/2] Improve dsv4-fp8-mi355x-vllm with vllm-project/recipes#433 MI355X recipe Adopt the validated DeepSeek-V4-Pro MI355X (TP=8) settings from vllm-project/recipes#433 for the existing AITER MLA decode benchmark: * Add VLLM_ROCM_USE_AITER_LINEAR=1 env var * Add --distributed-executor-backend mp, --max-num-batched-tokens 8192, --async-scheduling server flags * Tune --gpu-memory-utilization 0.90 -> 0.6 and --max-num-seqs 32 -> 128 * Drop --tool-call-parser / --enable-auto-tool-choice (not in recipe, not exercised by these throughput benchmarks) * Expand sweep from conc=1 to conc 4-64 to match dsv4-fp8-mi355x-sglang for vLLM<->SGLang comparability now that max-num-seqs=128 allows it --- .github/configs/amd-master.yaml | 9 +++++++-- benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh | 17 +++++++++++++---- perf-changelog.yaml | 11 +++++++++++ 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 692725bc1..275d4cf1c 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1617,6 +1617,11 @@ dsv4-fp4-mi355x-sglang: # at runtime by benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh at a # pinned SHA. Once both PRs merge into a release, switch to a vLLM ROCm # MI355X image and remove the build step. +# +# Serving flags follow vllm-project/recipes#433: AITER+AITER_LINEAR, +# mp executor, triton_unfused MoE, async scheduling, max-num-seqs=128, +# max-num-batched-tokens=8192, gpu-mem-util=0.6. Sweep matches the +# sister sglang config (conc 4-64) so vLLM↔SGLang are comparable. dsv4-fp8-mi355x-vllm: image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post model: deepseek-ai/DeepSeek-V4-Pro @@ -1630,11 +1635,11 @@ dsv4-fp8-mi355x-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 1, conc-end: 1 } + - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, conc-start: 1, conc-end: 1 } + - { tp: 8, conc-start: 4, conc-end: 64 } # Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650). # PR1 of the ATOM DSv4 series still uses torch sparse-attention fallbacks diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh index 642700a52..edb0aac31 100755 --- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh +++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh @@ -5,6 +5,13 @@ set -eo pipefail # Based on vllm-project/vllm#40889 (AITER-accelerated sparse MLA decode, # stacked on #40871 which adds base DSv4 ROCm support). # +# Serving flags follow the validated MI355X recipe from +# vllm-project/recipes#433 (DeepSeek-V4-Pro, TP=8): AITER + AITER_LINEAR, +# triton_unfused MoE, mp executor, async scheduling, max-num-seqs=128, +# max-num-batched-tokens=8192, gpu-mem-util=0.6. Tool-call flags from the +# previous revision are dropped — the recipe omits them and throughput +# benchmarks here do not exercise tool calling. +# # Uses the ATOM MI355X image as the base (ROCm 7.2.2, PyTorch 2.10, # aiter with MLA decode, MI355X GPU detection). vLLM is rebuilt from # the PR branch on top. Once both PRs merge into a release, switch to @@ -33,6 +40,7 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then fi export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_USE_AITER_LINEAR=1 export VLLM_TARGET_DEVICE=rocm export VLLM_ENGINE_READY_TIMEOUT_S=3600 export VLLM_PLUGINS="" @@ -487,17 +495,18 @@ start_gpu_monitor set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size $TP \ - --gpu-memory-utilization 0.90 \ + --distributed-executor-backend mp \ + --gpu-memory-utilization 0.6 \ --max-model-len $MAX_MODEL_LEN \ + --max-num-seqs 128 \ + --max-num-batched-tokens 8192 \ --kv-cache-dtype fp8 \ --trust-remote-code \ --enforce-eager \ + --async-scheduling \ --moe-backend "triton_unfused" \ --no-enable-prefix-caching \ - --max-num-seqs 32 \ --tokenizer-mode deepseek_v4 \ - --tool-call-parser deepseek_v4 \ - --enable-auto-tool-choice \ --reasoning-parser deepseek_v4 > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 7feb906c5..4eb27e704 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2442,3 +2442,14 @@ description: - "Update SGLang image from v0.5.10.post1-cu130 to v0.5.11-cu130" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1329 + +- config-keys: + - dsv4-fp8-mi355x-vllm + description: + - "Adopt validated MI355X serving recipe from vllm-project/recipes#433 (DeepSeek-V4-Pro, TP=8)" + - "Add env: VLLM_ROCM_USE_AITER_LINEAR=1 (alongside existing VLLM_ROCM_USE_AITER=1)" + - "Add server flags: --distributed-executor-backend mp, --max-num-batched-tokens 8192, --async-scheduling" + - "Tune: --gpu-memory-utilization 0.90 -> 0.6, --max-num-seqs 32 -> 128" + - "Drop --tool-call-parser deepseek_v4 / --enable-auto-tool-choice (not in recipe; benchmark doesn't exercise tool calling)" + - "Expand search space from conc=1 to conc 4-64 to match dsv4-fp8-mi355x-sglang for vLLM<->SGLang comparability now that max-num-seqs=128 supports it" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX From 5d62ead0f8b3fbb33170b7ba9e67f7fb041d4c3c Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Wed, 13 May 2026 13:04:46 -0700 Subject: [PATCH 2/2] Backfill PR #1373 link in perf-changelog --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 4eb27e704..87993f5bc 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2452,4 +2452,4 @@ - "Tune: --gpu-memory-utilization 0.90 -> 0.6, --max-num-seqs 32 -> 128" - "Drop --tool-call-parser deepseek_v4 / --enable-auto-tool-choice (not in recipe; benchmark doesn't exercise tool calling)" - "Expand search space from conc=1 to conc 4-64 to match dsv4-fp8-mi355x-sglang for vLLM<->SGLang comparability now that max-num-seqs=128 supports it" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1373