SemiAnalysisAI · Oseltamivir · May 13, 2026 · May 13, 2026 · May 13, 2026 · claude
@@ -1617,6 +1617,11 @@ dsv4-fp4-mi355x-sglang:
 # at runtime by benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh at a
 # pinned SHA. Once both PRs merge into a release, switch to a vLLM ROCm
 # MI355X image and remove the build step.
+#
+# Serving flags follow vllm-project/recipes#433: AITER + AITER_LINEAR,
+# mp executor, triton_unfused MoE, async scheduling, max-num-seqs=128,
+# max-num-batched-tokens=8192, gpu-memory-utilization=0.6. The sweep matches
+# the sister SGLang config (conc 4-64) so vLLM and SGLang are comparable.
 dsv4-fp8-mi355x-vllm:
   image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -1630,11 +1635,11 @@ dsv4-fp8-mi355x-vllm:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 1, conc-end: 1 }
+      - { tp: 8, conc-start: 4, conc-end: 64 }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 1, conc-end: 1 }
+      - { tp: 8, conc-start: 4, conc-end: 64 }
 
 # Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650).
 # PR1 of the ATOM DSv4 series still uses torch sparse-attention fallbacks

diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
@@ -5,6 +5,13 @@ set -eo pipefail
 # Based on vllm-project/vllm#40889 (AITER-accelerated sparse MLA decode,
 # stacked on #40871 which adds base DSv4 ROCm support).
 #
+# Serving flags follow the validated MI355X recipe from
+# vllm-project/recipes#433 (DeepSeek-V4-Pro, TP=8): AITER + AITER_LINEAR,
+# triton_unfused MoE, mp executor, async scheduling, max-num-seqs=128,
+# max-num-batched-tokens=8192, gpu-memory-utilization=0.6. Tool-call flags
+# from the previous revision are dropped — the recipe omits them and the
+# throughput benchmarks here do not exercise tool calling.
+#
 # Uses the ATOM MI355X image as the base (ROCm 7.2.2, PyTorch 2.10,
 # aiter with MLA decode, MI355X GPU detection). vLLM is rebuilt from
 # the PR branch on top. Once both PRs merge into a release, switch to
@@ -33,6 +40,7 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
 fi
 
 export VLLM_ROCM_USE_AITER=1
+export VLLM_ROCM_USE_AITER_LINEAR=1
 export VLLM_TARGET_DEVICE=rocm
 export VLLM_ENGINE_READY_TIMEOUT_S=3600
 export VLLM_PLUGINS=""
@@ -487,17 +495,18 @@ start_gpu_monitor
 set -x
 vllm serve $MODEL --port $PORT \
     --tensor-parallel-size $TP \
-    --gpu-memory-utilization 0.90 \
+    --distributed-executor-backend mp \
+    --gpu-memory-utilization 0.6 \
     --max-model-len $MAX_MODEL_LEN \
+    --max-num-seqs 128 \
+    --max-num-batched-tokens 8192 \
     --kv-cache-dtype fp8 \
     --trust-remote-code \
     --enforce-eager \
+    --async-scheduling \
     --moe-backend "triton_unfused" \
     --no-enable-prefix-caching \
-    --max-num-seqs 32 \
     --tokenizer-mode deepseek_v4 \
-    --tool-call-parser deepseek_v4 \
-    --enable-auto-tool-choice \
     --reasoning-parser deepseek_v4 > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -2454,3 +2454,14 @@
   description:
     - "Update SGLang image from v0.5.10.post1-cu130 to v0.5.11-cu130"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1329
+
+- config-keys:
+    - dsv4-fp8-mi355x-vllm
+  description:
+    - "Adopt validated MI355X serving recipe from vllm-project/recipes#433 (DeepSeek-V4-Pro, TP=8)"
+    - "Add env: VLLM_ROCM_USE_AITER_LINEAR=1 (alongside existing VLLM_ROCM_USE_AITER=1)"
+    - "Add server flags: --distributed-executor-backend mp, --max-num-batched-tokens 8192, --async-scheduling"
+    - "Tune: --gpu-memory-utilization 0.90 -> 0.6, --max-num-seqs 32 -> 128"
+    - "Drop --tool-call-parser deepseek_v4 / --enable-auto-tool-choice (not in recipe; benchmark doesn't exercise tool calling)"
+    - "Expand search space from conc=1 to conc 4-64 to match dsv4-fp8-mi355x-sglang for vLLM<->SGLang comparability now that max-num-seqs=128 supports it"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1373