From b278bc2d4b3aa93271b33f456730e842c9ad50cf Mon Sep 17 00:00:00 2001
From: Xiaohu Guo <Xiaohu.Guo@amd.com>
Date: Wed, 6 May 2026 09:02:13 -0500
Subject: [PATCH 1/6] gpt-oss-fp4-mi355x: pin to v0.19 + switch to
 AITER-env-based recipe

Pins the image back to vllm/vllm-openai-rocm:v0.19.0 (was bumped to
v0.21.0 in #1406). v0.21 introduces a ROCm/AITER perf regression on
MI355x for gpt-oss that we're still tracking down; staying on v0.19
in the meantime.

Also rewrites the launcher to select the AITER kernel paths via env
vars: enables AITER MOE/RMSNorm/UnifiedAttn/A16W4, sets
VLLM_ROCM_USE_AITER_MHA=0, and exports HSA_NO_SCRATCH_RECLAIM=1
unconditionally (previously it was exported only when the MEC firmware
version was empty or lower than 177). Drops the now-obsolete
TRITON_ROPE/BUFFER_OPS/--attention-backend/fuse_rope_kvcache/
use_inductor_graph_partition bits, and adds --max-num-seqs 256 and
--async-scheduling.
---
 .github/configs/amd-master.yaml             |  2 +-
 benchmarks/single_node/gptoss_fp4_mi355x.sh | 25 ++++++++-------------
 perf-changelog.yaml                         |  7 ++++++
 3 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 418ad5ab9..e6c77606b 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -960,7 +960,7 @@ gptoss-fp4-mi325x-vllm:
       - { tp: 8, conc-start: 4, conc-end: 16 }
 
 gptoss-fp4-mi355x-vllm:
-  image: vllm/vllm-openai-rocm:v0.21.0
+  image: vllm/vllm-openai-rocm:v0.19.0
   model: amd/gpt-oss-120b-w-mxfp4-a-fp8
   model-prefix: gptoss
   runner: mi355x
diff --git a/benchmarks/single_node/gptoss_fp4_mi355x.sh b/benchmarks/single_node/gptoss_fp4_mi355x.sh
index 3db687e22..05c5a2157 100644
--- a/benchmarks/single_node/gptoss_fp4_mi355x.sh
+++ b/benchmarks/single_node/gptoss_fp4_mi355x.sh
@@ -18,27 +18,19 @@ fi
 
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
-# If the machine runs a MEC FW older than 177, RCCL
-# cannot reclaim some memory.
-# Disable that features to avoid crashes.
-# This is related to the changes in the driver at:
-# https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates
-version=`rocm-smi --showfw | grep MEC | head -n 1 |  awk '{print $NF}'`
-if [[ "$version" == "" || $version -lt 177 ]]; then
-  export HSA_NO_SCRATCH_RECLAIM=1
-fi
-
 # Set HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES for Ray compatibility in vLLM 0.14+
 if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
     export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
 fi
 
-export AMDGCN_USE_BUFFER_OPS=0
 export VLLM_ROCM_USE_AITER=1
-export VLLM_ROCM_USE_AITER_TRITON_ROPE=1
+export VLLM_ROCM_USE_AITER_MOE=1
+export VLLM_ROCM_USE_AITER_RMSNORM=1
+export VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION=1
+export VLLM_ROCM_USE_AITER_MHA=0
+export VLLM_ROCM_USE_AITER_FUSED_MOE_A16W4=1
 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
-ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN"
-FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True"
+export HSA_NO_SCRATCH_RECLAIM=1
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
@@ -52,12 +44,13 @@ start_gpu_monitor
 
 set -x
 vllm serve $MODEL --port $PORT \
-  $ATTN_BACKEND $FUSE_ROPE_KVCACHE \
   --tensor-parallel-size=$TP \
+  --max-num-seqs 256 \
   --gpu-memory-utilization 0.95 \
   --max-model-len $MAX_MODEL_LEN \
   --block-size=64 \
-  --no-enable-prefix-caching > $SERVER_LOG 2>&1 &
+  --no-enable-prefix-caching \
+  --async-scheduling > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 8acd720cd..c1451478a 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3083,6 +3083,13 @@
     - "Bump vLLM ROCm image from nightly-b50646e5effd7cb5884cd96fdff4c53c18521198 to nightly-4f940896a32c9e2a0eba7f50d521bf5f6b4de458"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1546
 
+- config-keys:
+    - gptoss-fp4-mi355x-vllm
+  description:
+    - "Pin image back to vllm/vllm-openai-rocm:v0.19.0 while we work through a v0.21 perf regression on MI355x AITER paths (image was bumped to v0.21.0 in #1406)"
+    - "Switch to AITER-env-based recipe: enable AITER MOE/RMSNORM/UnifiedAttn/A16W4 + HSA_NO_SCRATCH_RECLAIM, drop legacy TRITON_ROPE/BUFFER_OPS/--attention-backend/fuse_rope_kvcache, add --max-num-seqs 256 + --async-scheduling"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX
+
 - config-keys:
     - dsv4-fp4-mi355x-sglang
   description:

From e1d96b891e756c731d09f4a1a9e919e465076b0d Mon Sep 17 00:00:00 2001
From: Xiaohu Guo <Xiaohu.Guo@amd.com>
Date: Wed, 20 May 2026 10:01:55 -0500
Subject: [PATCH 2/6] perf-changelog: replace pr-link placeholder with PR 1531

---
 perf-changelog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index c1451478a..08b6fe060 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3088,7 +3088,7 @@
   description:
     - "Pin image back to vllm/vllm-openai-rocm:v0.19.0 while we work through a v0.21 perf regression on MI355x AITER paths (image was bumped to v0.21.0 in #1406)"
     - "Switch to AITER-env-based recipe: enable AITER MOE/RMSNORM/UnifiedAttn/A16W4 + HSA_NO_SCRATCH_RECLAIM, drop legacy TRITON_ROPE/BUFFER_OPS/--attention-backend/fuse_rope_kvcache, add --max-num-seqs 256 + --async-scheduling"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1531
 
 - config-keys:
     - dsv4-fp4-mi355x-sglang

From addd9d857ef27407bbc0a7cfc292764ac9c81a34 Mon Sep 17 00:00:00 2001
From: Xiaohu Guo <Xiaohu.Guo@amd.com>
Date: Thu, 21 May 2026 07:43:51 -0500
Subject: [PATCH 3/6] adds a pre-flight warmup pass before the measured
 benchmark

---
 benchmarks/single_node/gptoss_fp4_mi355x.sh | 18 ++++++++++++++++++
 perf-changelog.yaml                         |  1 +
 2 files changed, 19 insertions(+)

diff --git a/benchmarks/single_node/gptoss_fp4_mi355x.sh b/benchmarks/single_node/gptoss_fp4_mi355x.sh
index 05c5a2157..c6d25ea6e 100644
--- a/benchmarks/single_node/gptoss_fp4_mi355x.sh
+++ b/benchmarks/single_node/gptoss_fp4_mi355x.sh
@@ -57,6 +57,24 @@ SERVER_PID=$!
 # Wait for server to be ready
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
+# Pre-flight warmup at the benchmark shape (5 rounds of CONC parallel prompts)
+# so the actual measurement below starts at steady-state. Without this, AITER's
+# per-shape Triton JIT autotune and torch.compile cache misses bias the first
+# ~20-40s of the real benchmark window low (CI cold-cache measurements showed
+# a ~10% under-report). vllm bench's built-in --num-warmups defaults to 16
+# serial prompts, which doesn't exercise the full-concurrency kernel variants.
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend vllm \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts "$((CONC * 5))" \
+    --max-concurrency "$CONC" \
+    --result-filename "warmup_$RESULT_FILENAME" \
+    --result-dir /tmp/ > /dev/null 2>&1 || true
+
 run_benchmark_serving \
     --model "$MODEL" \
     --port "$PORT" \
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 08b6fe060..3fd3a1c51 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3088,6 +3088,7 @@
   description:
     - "Pin image back to vllm/vllm-openai-rocm:v0.19.0 while we work through a v0.21 perf regression on MI355x AITER paths (image was bumped to v0.21.0 in #1406)"
     - "Switch to AITER-env-based recipe: enable AITER MOE/RMSNORM/UnifiedAttn/A16W4 + HSA_NO_SCRATCH_RECLAIM, drop legacy TRITON_ROPE/BUFFER_OPS/--attention-backend/fuse_rope_kvcache, add --max-num-seqs 256 + --async-scheduling"
+    - "Add a 5-round pre-flight warmup pass at the benchmark shape (CONC*5 prompts at CONC concurrency, result discarded) before the measured benchmark. AITER's per-shape Triton JIT autotune doesn't finish within vllm bench's built-in 16-prompt warmup, biasing single-shot CI measurements low by ~10%"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1531
 
 - config-keys:

From efd9616cef4ec2d653c80a45288362a48bf62b2c Mon Sep 17 00:00:00 2001
From: Xiaohu Guo <Xiaohu.Guo@amd.com>
Date: Sun, 24 May 2026 06:03:21 -0500
Subject: [PATCH 4/6] extend sweep search-space to match B200 coverage

---
 .github/configs/amd-master.yaml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index e6c77606b..dec56ac51 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -973,13 +973,15 @@ gptoss-fp4-mi355x-vllm:
       osl: 1024
       search-space:
       - { tp: 1, conc-start: 4, conc-end: 128 }
-      - { tp: 4, conc-start: 4, conc-end: 8 }
+      - { tp: 2, conc-start: 4, conc-end: 128 }
+      - { tp: 4, conc-start: 4, conc-end: 64 }
       - { tp: 8, conc-start: 4, conc-end: 16 }
     - isl: 8192
       osl: 1024
       search-space:
       - { tp: 1, conc-start: 4, conc-end: 128 }
-      - { tp: 4, conc-start: 4, conc-end: 4 }
+      - { tp: 2, conc-start: 4, conc-end: 128 }
+      - { tp: 4, conc-start: 4, conc-end: 64 }
       - { tp: 8, conc-start: 4, conc-end: 8 }
 
 gptoss-fp4-mi355x-atom:

From e658541befe4901d87f224ed02ee7dc4b528f2d3 Mon Sep 17 00:00:00 2001
From: Xiaohu Guo <Xiaohu.Guo@amd.com>
Date: Mon, 25 May 2026 05:41:27 -0500
Subject: [PATCH 5/6] move PR 1531 entry to bottom (addresses bot review)

---
 perf-changelog.yaml | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 3fd3a1c51..07fe677bb 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3083,14 +3083,6 @@
     - "Bump vLLM ROCm image from nightly-b50646e5effd7cb5884cd96fdff4c53c18521198 to nightly-4f940896a32c9e2a0eba7f50d521bf5f6b4de458"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1546
 
-- config-keys:
-    - gptoss-fp4-mi355x-vllm
-  description:
-    - "Pin image back to vllm/vllm-openai-rocm:v0.19.0 while we work through a v0.21 perf regression on MI355x AITER paths (image was bumped to v0.21.0 in #1406)"
-    - "Switch to AITER-env-based recipe: enable AITER MOE/RMSNORM/UnifiedAttn/A16W4 + HSA_NO_SCRATCH_RECLAIM, drop legacy TRITON_ROPE/BUFFER_OPS/--attention-backend/fuse_rope_kvcache, add --max-num-seqs 256 + --async-scheduling"
-    - "Add a 5-round pre-flight warmup pass at the benchmark shape (CONC*5 prompts at CONC concurrency, result discarded) before the measured benchmark. AITER's per-shape Triton JIT autotune doesn't finish within vllm bench's built-in 16-prompt warmup, biasing single-shot CI measurements low by ~10%"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1531
-
 - config-keys:
     - dsv4-fp4-mi355x-sglang
   description:
@@ -3144,3 +3136,11 @@
   description:
     - "Add --use-chat-template to run_benchmark_serving so prompts are formatted with the Qwen chat template (matching the other Qwen MTP recipes)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1555
+
+- config-keys:
+    - gptoss-fp4-mi355x-vllm
+  description:
+    - "Pin image back to vllm/vllm-openai-rocm:v0.19.0 while we work through a v0.21 perf regression on MI355x AITER paths (image was bumped to v0.21.0 in #1406)"
+    - "Switch to AITER-env-based recipe: enable AITER MOE/RMSNORM/UnifiedAttn/A16W4 + HSA_NO_SCRATCH_RECLAIM, drop legacy TRITON_ROPE/BUFFER_OPS/--attention-backend/fuse_rope_kvcache, add --max-num-seqs 256 + --async-scheduling"
+    - "Add a 5-round pre-flight warmup pass at the benchmark shape (CONC*5 prompts at CONC concurrency, result discarded) before the measured benchmark. AITER's per-shape Triton JIT autotune doesn't finish within vllm bench's built-in 16-prompt warmup, biasing single-shot CI measurements low by ~10%"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1531

From 8b8ff70570a1eb0035d5a5731ad88b22b968a745 Mon Sep 17 00:00:00 2001
From: Xiaohu Guo <Xiaohu.Guo@amd.com>
Date: Mon, 25 May 2026 11:36:28 -0500
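Subject: [PATCH 6/6] revert pre-flight warmup pass

Removes the warmup run_benchmark_serving invocation (and its comment)
that was added to gptoss_fp4_mi355x.sh earlier in this series.

Note: only the launcher script is touched here. The perf-changelog.yaml
entry for PR 1531 still lists the 5-round pre-flight warmup pass and
needs a follow-up edit to match the reverted script.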

---
 benchmarks/single_node/gptoss_fp4_mi355x.sh | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/benchmarks/single_node/gptoss_fp4_mi355x.sh b/benchmarks/single_node/gptoss_fp4_mi355x.sh
index c6d25ea6e..05c5a2157 100644
--- a/benchmarks/single_node/gptoss_fp4_mi355x.sh
+++ b/benchmarks/single_node/gptoss_fp4_mi355x.sh
@@ -57,24 +57,6 @@ SERVER_PID=$!
 # Wait for server to be ready
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# Pre-flight warmup at the benchmark shape (5 rounds of CONC parallel prompts)
-# so the actual measurement below starts at steady-state. Without this, AITER's
-# per-shape Triton JIT autotune and torch.compile cache misses bias the first
-# ~20-40s of the real benchmark window low (CI cold-cache measurements showed
-# a ~10% under-report). vllm bench's built-in --num-warmups defaults to 16
-# serial prompts, which doesn't exercise the full-concurrency kernel variants.
-run_benchmark_serving \
-    --model "$MODEL" \
-    --port "$PORT" \
-    --backend vllm \
-    --input-len "$ISL" \
-    --output-len "$OSL" \
-    --random-range-ratio "$RANDOM_RANGE_RATIO" \
-    --num-prompts "$((CONC * 5))" \
-    --max-concurrency "$CONC" \
-    --result-filename "warmup_$RESULT_FILENAME" \
-    --result-dir /tmp/ > /dev/null 2>&1 || true
-
 run_benchmark_serving \
     --model "$MODEL" \
     --port "$PORT" \