From dacf068c6b7a2311dae0bff299dd17e41071817f Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Wed, 13 May 2026 13:04:24 -0700
Subject: [PATCH 1/2] Improve dsv4-fp8-mi355x-vllm with
 vllm-project/recipes#433 MI355X recipe

Adopt the validated DeepSeek-V4-Pro MI355X (TP=8) settings from
vllm-project/recipes#433 for the existing AITER MLA decode benchmark:

* Add VLLM_ROCM_USE_AITER_LINEAR=1 env var
* Add --distributed-executor-backend mp, --max-num-batched-tokens 8192,
  --async-scheduling server flags
* Tune --gpu-memory-utilization 0.90 -> 0.6 and --max-num-seqs 32 -> 128
* Drop --tool-call-parser / --enable-auto-tool-choice (not in recipe,
  not exercised by these throughput benchmarks)
* Expand sweep from conc=1 to conc 4-64 to match dsv4-fp8-mi355x-sglang
  for vLLM<->SGLang comparability now that max-num-seqs=128 allows it
---
 .github/configs/amd-master.yaml                |  9 +++++++--
 benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh | 17 +++++++++++++----
 perf-changelog.yaml                            | 11 +++++++++++
 3 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 692725bc1..275d4cf1c 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1617,6 +1617,11 @@ dsv4-fp4-mi355x-sglang:
 # at runtime by benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh at a
 # pinned SHA. Once both PRs merge into a release, switch to a vLLM ROCm
 # MI355X image and remove the build step.
+#
+# Serving flags follow vllm-project/recipes#433: AITER+AITER_LINEAR,
+# mp executor, triton_unfused MoE, async scheduling, max-num-seqs=128,
+# max-num-batched-tokens=8192, gpu-mem-util=0.6. Sweep matches the
+# sister sglang config (conc 4-64) so vLLM and SGLang results are comparable.
 dsv4-fp8-mi355x-vllm:
   image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -1630,11 +1635,11 @@ dsv4-fp8-mi355x-vllm:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 1, conc-end: 1 }
+      - { tp: 8, conc-start: 4, conc-end: 64 }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 1, conc-end: 1 }
+      - { tp: 8, conc-start: 4, conc-end: 64 }
 
 # Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650).
 # PR1 of the ATOM DSv4 series still uses torch sparse-attention fallbacks
diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
index 642700a52..edb0aac31 100755
--- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
+++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
@@ -5,6 +5,13 @@ set -eo pipefail
 # Based on vllm-project/vllm#40889 (AITER-accelerated sparse MLA decode,
 # stacked on #40871 which adds base DSv4 ROCm support).
 #
+# Serving flags follow the validated MI355X recipe from
+# vllm-project/recipes#433 (DeepSeek-V4-Pro, TP=8): AITER + AITER_LINEAR,
+# triton_unfused MoE, mp executor, async scheduling, max-num-seqs=128,
+# max-num-batched-tokens=8192, gpu-mem-util=0.6. Tool-call flags from the
+# previous revision are dropped — the recipe omits them and throughput
+# benchmarks here do not exercise tool calling.
+#
 # Uses the ATOM MI355X image as the base (ROCm 7.2.2, PyTorch 2.10,
 # aiter with MLA decode, MI355X GPU detection). vLLM is rebuilt from
 # the PR branch on top. Once both PRs merge into a release, switch to
@@ -33,6 +40,7 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
 fi
 
 export VLLM_ROCM_USE_AITER=1
+export VLLM_ROCM_USE_AITER_LINEAR=1
 export VLLM_TARGET_DEVICE=rocm
 export VLLM_ENGINE_READY_TIMEOUT_S=3600
 export VLLM_PLUGINS=""
@@ -487,17 +495,18 @@ start_gpu_monitor
 set -x
 vllm serve $MODEL --port $PORT \
     --tensor-parallel-size $TP \
-    --gpu-memory-utilization 0.90 \
+    --distributed-executor-backend mp \
+    --gpu-memory-utilization 0.6 \
     --max-model-len $MAX_MODEL_LEN \
+    --max-num-seqs 128 \
+    --max-num-batched-tokens 8192 \
     --kv-cache-dtype fp8 \
     --trust-remote-code \
     --enforce-eager \
+    --async-scheduling \
     --moe-backend "triton_unfused" \
     --no-enable-prefix-caching \
-    --max-num-seqs 32 \
     --tokenizer-mode deepseek_v4 \
-    --tool-call-parser deepseek_v4 \
-    --enable-auto-tool-choice \
     --reasoning-parser deepseek_v4 > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 7feb906c5..4eb27e704 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2442,3 +2442,14 @@
   description:
     - "Update SGLang image from v0.5.10.post1-cu130 to v0.5.11-cu130"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1329
+
+- config-keys:
+    - dsv4-fp8-mi355x-vllm
+  description:
+    - "Adopt validated MI355X serving recipe from vllm-project/recipes#433 (DeepSeek-V4-Pro, TP=8)"
+    - "Add env: VLLM_ROCM_USE_AITER_LINEAR=1 (alongside existing VLLM_ROCM_USE_AITER=1)"
+    - "Add server flags: --distributed-executor-backend mp, --max-num-batched-tokens 8192, --async-scheduling"
+    - "Tune: --gpu-memory-utilization 0.90 -> 0.6, --max-num-seqs 32 -> 128"
+    - "Drop --tool-call-parser deepseek_v4 / --enable-auto-tool-choice (not in recipe; benchmark doesn't exercise tool calling)"
+    - "Expand search space from conc=1 to conc 4-64 to match dsv4-fp8-mi355x-sglang for vLLM<->SGLang comparability now that max-num-seqs=128 supports it"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX

From 5d62ead0f8b3fbb33170b7ba9e67f7fb041d4c3c Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Wed, 13 May 2026 13:04:46 -0700
Subject: [PATCH 2/2] Backfill PR #1373 link in perf-changelog

---
 perf-changelog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 4eb27e704..87993f5bc 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2452,4 +2452,4 @@
     - "Tune: --gpu-memory-utilization 0.90 -> 0.6, --max-num-seqs 32 -> 128"
     - "Drop --tool-call-parser deepseek_v4 / --enable-auto-tool-choice (not in recipe; benchmark doesn't exercise tool calling)"
     - "Expand search space from conc=1 to conc 4-64 to match dsv4-fp8-mi355x-sglang for vLLM<->SGLang comparability now that max-num-seqs=128 supports it"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1373