SemiAnalysisAI · xiaohuguo2023 · May 6, 2026 · May 20, 2026 · May 21, 2026 · May 24, 2026
@@ -960,7 +960,7 @@ gptoss-fp4-mi325x-vllm:
       - { tp: 8, conc-start: 4, conc-end: 16 }
 
 gptoss-fp4-mi355x-vllm:
-  image: vllm/vllm-openai-rocm:v0.21.0
+  image: vllm/vllm-openai-rocm:v0.19.0
   model: amd/gpt-oss-120b-w-mxfp4-a-fp8
   model-prefix: gptoss
   runner: mi355x
@@ -973,13 +973,15 @@ gptoss-fp4-mi355x-vllm:
       osl: 1024
       search-space:
       - { tp: 1, conc-start: 4, conc-end: 128 }
-      - { tp: 4, conc-start: 4, conc-end: 8 }
+      - { tp: 2, conc-start: 4, conc-end: 128 }
+      - { tp: 4, conc-start: 4, conc-end: 64 }
       - { tp: 8, conc-start: 4, conc-end: 16 }
     - isl: 8192
       osl: 1024
       search-space:
       - { tp: 1, conc-start: 4, conc-end: 128 }
-      - { tp: 4, conc-start: 4, conc-end: 4 }
+      - { tp: 2, conc-start: 4, conc-end: 128 }
+      - { tp: 4, conc-start: 4, conc-end: 64 }
       - { tp: 8, conc-start: 4, conc-end: 8 }
 
 gptoss-fp4-mi355x-atom:

diff --git a/benchmarks/single_node/gptoss_fp4_mi355x.sh b/benchmarks/single_node/gptoss_fp4_mi355x.sh
@@ -18,27 +18,19 @@ fi
 
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
-# If the machine runs a MEC FW older than 177, RCCL
-# cannot reclaim some memory.
-# Disable that features to avoid crashes.
-# This is related to the changes in the driver at:
-# https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates
-version=`rocm-smi --showfw | grep MEC | head -n 1 |  awk '{print $NF}'`
-if [[ "$version" == "" || $version -lt 177 ]]; then
-  export HSA_NO_SCRATCH_RECLAIM=1
-fi
-
 # Set HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES for Ray compatibility in vLLM 0.14+
 if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
     export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
 fi
 
-export AMDGCN_USE_BUFFER_OPS=0
 export VLLM_ROCM_USE_AITER=1
-export VLLM_ROCM_USE_AITER_TRITON_ROPE=1
+export VLLM_ROCM_USE_AITER_MOE=1
+export VLLM_ROCM_USE_AITER_RMSNORM=1
+export VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION=1
+export VLLM_ROCM_USE_AITER_MHA=0
+export VLLM_ROCM_USE_AITER_FUSED_MOE_A16W4=1
 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
-ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN"
-FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True"
+export HSA_NO_SCRATCH_RECLAIM=1
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
@@ -52,12 +44,13 @@ start_gpu_monitor
 
 set -x
 vllm serve $MODEL --port $PORT \
-  $ATTN_BACKEND $FUSE_ROPE_KVCACHE \
   --tensor-parallel-size=$TP \
+  --max-num-seqs 256 \
   --gpu-memory-utilization 0.95 \
   --max-model-len $MAX_MODEL_LEN \
   --block-size=64 \
-  --no-enable-prefix-caching > $SERVER_LOG 2>&1 &
+  --no-enable-prefix-caching \
+  --async-scheduling > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3136,3 +3136,11 @@
   description:
     - "Add --use-chat-template to run_benchmark_serving so prompts are formatted with the Qwen chat template (matching the other Qwen MTP recipes)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1555
+
+- config-keys:
+    - gptoss-fp4-mi355x-vllm
+  description:
+    - "Pin image back to vllm/vllm-openai-rocm:v0.19.0 while we work through a v0.21 perf regression on MI355x AITER paths (image was bumped to v0.21.0 in #1406)"
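+    - "Switch to AITER-env-based recipe: enable AITER MOE/RMSNORM/UNIFIED_ATTENTION/FUSED_MOE_A16W4 and explicitly disable AITER MHA (VLLM_ROCM_USE_AITER_MHA=0); export HSA_NO_SCRATCH_RECLAIM=1 unconditionally (previously only when the MEC FW version was missing or older than 177); drop legacy TRITON_ROPE/BUFFER_OPS/--attention-backend/fuse_rope_kvcache; add --max-num-seqs 256 + --async-scheduling"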
+    - "Add a 5-round pre-flight warmup pass at the benchmark shape (CONC*5 prompts at CONC concurrency, result discarded) before the measured benchmark. AITER's per-shape Triton JIT autotune doesn't finish within vllm bench's built-in 16-prompt warmup, biasing single-shot CI measurements low by ~10%"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1531