From b278bc2d4b3aa93271b33f456730e842c9ad50cf Mon Sep 17 00:00:00 2001 From: Xiaohu Guo Date: Wed, 6 May 2026 09:02:13 -0500 Subject: [PATCH 1/6] gpt-oss-fp4-mi355x: pin to v0.19 + switch to AITER-env-based recipe Pins the image back to vllm/vllm-openai-rocm:v0.19.0 (was bumped to v0.21.0 in #1406). v0.21 introduces a ROCm/AITER perf regression on MI355x for gpt-oss that we're still tracking down; staying on v0.19 in the meantime. Also rewrites the launcher to enable the AITER kernel paths via env vars (AITER MOE/RMSNorm/UnifiedAttn/A16W4 + HSA_NO_SCRATCH_RECLAIM=1) and drops the now-obsolete TRITON_ROPE/BUFFER_OPS/--attention-backend/ fuse_rope_kvcache/use_inductor_graph_partition bits. Also adds --max-num-seqs 256 and --async-scheduling. --- .github/configs/amd-master.yaml | 2 +- benchmarks/single_node/gptoss_fp4_mi355x.sh | 25 ++++++++------------- perf-changelog.yaml | 7 ++++++ 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 418ad5ab9..e6c77606b 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -960,7 +960,7 @@ gptoss-fp4-mi325x-vllm: - { tp: 8, conc-start: 4, conc-end: 16 } gptoss-fp4-mi355x-vllm: - image: vllm/vllm-openai-rocm:v0.21.0 + image: vllm/vllm-openai-rocm:v0.19.0 model: amd/gpt-oss-120b-w-mxfp4-a-fp8 model-prefix: gptoss runner: mi355x diff --git a/benchmarks/single_node/gptoss_fp4_mi355x.sh b/benchmarks/single_node/gptoss_fp4_mi355x.sh index 3db687e22..05c5a2157 100644 --- a/benchmarks/single_node/gptoss_fp4_mi355x.sh +++ b/benchmarks/single_node/gptoss_fp4_mi355x.sh @@ -18,27 +18,19 @@ fi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi -# If the machine runs a MEC FW older than 177, RCCL -# cannot reclaim some memory. -# Disable that features to avoid crashes. -# This is related to the changes in the driver at: -# https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates -version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'` -if [[ "$version" == "" || $version -lt 177 ]]; then - export HSA_NO_SCRATCH_RECLAIM=1 -fi - # Set HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES for Ray compatibility in vLLM 0.14+ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" fi -export AMDGCN_USE_BUFFER_OPS=0 export VLLM_ROCM_USE_AITER=1 -export VLLM_ROCM_USE_AITER_TRITON_ROPE=1 +export VLLM_ROCM_USE_AITER_MOE=1 +export VLLM_ROCM_USE_AITER_RMSNORM=1 +export VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION=1 +export VLLM_ROCM_USE_AITER_MHA=0 +export VLLM_ROCM_USE_AITER_FUSED_MOE_A16W4=1 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 -ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN" -FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True" +export HSA_NO_SCRATCH_RECLAIM=1 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} @@ -52,12 +44,13 @@ start_gpu_monitor set -x vllm serve $MODEL --port $PORT \ - $ATTN_BACKEND $FUSE_ROPE_KVCACHE \ --tensor-parallel-size=$TP \ + --max-num-seqs 256 \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ --block-size=64 \ - --no-enable-prefix-caching > $SERVER_LOG 2>&1 & + --no-enable-prefix-caching \ + --async-scheduling > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 8acd720cd..c1451478a 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3083,6 +3083,13 @@ - "Bump vLLM ROCm image from nightly-b50646e5effd7cb5884cd96fdff4c53c18521198 to nightly-4f940896a32c9e2a0eba7f50d521bf5f6b4de458" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1546 +- config-keys: + - gptoss-fp4-mi355x-vllm + description: + - "Pin image back to vllm/vllm-openai-rocm:v0.19.0 while we work through a v0.21 perf regression on MI355x AITER paths (image was bumped to v0.21.0 in #1406)" + - "Switch to AITER-env-based recipe: enable AITER MOE/RMSNORM/UnifiedAttn/A16W4 + HSA_NO_SCRATCH_RECLAIM, drop legacy TRITON_ROPE/BUFFER_OPS/--attention-backend/fuse_rope_kvcache, add --max-num-seqs 256 + --async-scheduling" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX + - config-keys: - dsv4-fp4-mi355x-sglang description: From e1d96b891e756c731d09f4a1a9e919e465076b0d Mon Sep 17 00:00:00 2001 From: Xiaohu Guo Date: Wed, 20 May 2026 10:01:55 -0500 Subject: [PATCH 2/6] update PR number --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index c1451478a..08b6fe060 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3088,7 +3088,7 @@ description: - "Pin image back to vllm/vllm-openai-rocm:v0.19.0 while we work through a v0.21 perf regression on MI355x AITER paths (image was bumped to v0.21.0 in #1406)" - "Switch to AITER-env-based recipe: enable AITER MOE/RMSNORM/UnifiedAttn/A16W4 + HSA_NO_SCRATCH_RECLAIM, drop legacy TRITON_ROPE/BUFFER_OPS/--attention-backend/fuse_rope_kvcache, add --max-num-seqs 256 + --async-scheduling" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1531 - config-keys: - dsv4-fp4-mi355x-sglang From addd9d857ef27407bbc0a7cfc292764ac9c81a34 Mon Sep 17 00:00:00 2001 From: Xiaohu Guo Date: Thu, 21 May 2026 07:43:51 -0500 Subject: [PATCH 3/6] adds a pre-flight warmup pass before the measured benchmark --- benchmarks/single_node/gptoss_fp4_mi355x.sh | 18 ++++++++++++++++++ perf-changelog.yaml | 1 + 2 files changed, 19 insertions(+) diff --git a/benchmarks/single_node/gptoss_fp4_mi355x.sh b/benchmarks/single_node/gptoss_fp4_mi355x.sh index 05c5a2157..c6d25ea6e 100644 --- a/benchmarks/single_node/gptoss_fp4_mi355x.sh +++ b/benchmarks/single_node/gptoss_fp4_mi355x.sh @@ -57,6 +57,24 @@ SERVER_PID=$! # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" +# Pre-flight warmup at the benchmark shape (5 rounds of CONC parallel prompts) +# so the actual measurement below starts at steady-state. Without this, AITER's +# per-shape Triton JIT autotune and torch.compile cache misses bias the first +# ~20-40s of the real benchmark window low (CI cold-cache measurements showed +# a ~10% under-report). vllm bench's built-in --num-warmups defaults to 16 +# serial prompts, which doesn't exercise the full-concurrency kernel variants. +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 5))" \ + --max-concurrency "$CONC" \ + --result-filename "warmup_$RESULT_FILENAME" \ + --result-dir /tmp/ > /dev/null 2>&1 || true + run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 08b6fe060..3fd3a1c51 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3088,6 +3088,7 @@ description: - "Pin image back to vllm/vllm-openai-rocm:v0.19.0 while we work through a v0.21 perf regression on MI355x AITER paths (image was bumped to v0.21.0 in #1406)" - "Switch to AITER-env-based recipe: enable AITER MOE/RMSNORM/UnifiedAttn/A16W4 + HSA_NO_SCRATCH_RECLAIM, drop legacy TRITON_ROPE/BUFFER_OPS/--attention-backend/fuse_rope_kvcache, add --max-num-seqs 256 + --async-scheduling" + - "Add a 5-round pre-flight warmup pass at the benchmark shape (CONC*5 prompts at CONC concurrency, result discarded) before the measured benchmark. AITER's per-shape Triton JIT autotune doesn't finish within vllm bench's built-in 16-prompt warmup, biasing single-shot CI measurements low by ~10%" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1531 - config-keys: From efd9616cef4ec2d653c80a45288362a48bf62b2c Mon Sep 17 00:00:00 2001 From: Xiaohu Guo Date: Sun, 24 May 2026 06:03:21 -0500 Subject: [PATCH 4/6] extend sweep search-space to match B200 coverage --- .github/configs/amd-master.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index e6c77606b..dec56ac51 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -973,13 +973,15 @@ gptoss-fp4-mi355x-vllm: osl: 1024 search-space: - { tp: 1, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 8 } + - { tp: 2, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 8192 osl: 1024 search-space: - { tp: 1, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 4 } + - { tp: 2, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 8 } gptoss-fp4-mi355x-atom: From e658541befe4901d87f224ed02ee7dc4b528f2d3 Mon Sep 17 00:00:00 2001 From: Xiaohu Guo Date: Mon, 25 May 2026 05:41:27 -0500 Subject: [PATCH 5/6] move PR 1531 entry to bottom (addresses bot review) --- perf-changelog.yaml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 3fd3a1c51..07fe677bb 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3083,14 +3083,6 @@ - "Bump vLLM ROCm image from nightly-b50646e5effd7cb5884cd96fdff4c53c18521198 to nightly-4f940896a32c9e2a0eba7f50d521bf5f6b4de458" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1546 -- config-keys: - - gptoss-fp4-mi355x-vllm - description: - - "Pin image back to vllm/vllm-openai-rocm:v0.19.0 while we work through a v0.21 perf regression on MI355x AITER paths (image was bumped to v0.21.0 in #1406)" - - "Switch to AITER-env-based recipe: enable AITER MOE/RMSNORM/UnifiedAttn/A16W4 + HSA_NO_SCRATCH_RECLAIM, drop legacy TRITON_ROPE/BUFFER_OPS/--attention-backend/fuse_rope_kvcache, add --max-num-seqs 256 + --async-scheduling" - - "Add a 5-round pre-flight warmup pass at the benchmark shape (CONC*5 prompts at CONC concurrency, result discarded) before the measured benchmark. AITER's per-shape Triton JIT autotune doesn't finish within vllm bench's built-in 16-prompt warmup, biasing single-shot CI measurements low by ~10%" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1531 - - config-keys: - dsv4-fp4-mi355x-sglang description: @@ -3144,3 +3136,11 @@ description: - "Add --use-chat-template to run_benchmark_serving so prompts are formatted with the Qwen chat template (matching the other Qwen MTP recipes)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1555 + +- config-keys: + - gptoss-fp4-mi355x-vllm + description: + - "Pin image back to vllm/vllm-openai-rocm:v0.19.0 while we work through a v0.21 perf regression on MI355x AITER paths (image was bumped to v0.21.0 in #1406)" + - "Switch to AITER-env-based recipe: enable AITER MOE/RMSNORM/UnifiedAttn/A16W4 + HSA_NO_SCRATCH_RECLAIM, drop legacy TRITON_ROPE/BUFFER_OPS/--attention-backend/fuse_rope_kvcache, add --max-num-seqs 256 + --async-scheduling" + - "Add a 5-round pre-flight warmup pass at the benchmark shape (CONC*5 prompts at CONC concurrency, result discarded) before the measured benchmark. AITER's per-shape Triton JIT autotune doesn't finish within vllm bench's built-in 16-prompt warmup, biasing single-shot CI measurements low by ~10%" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1531 From 8b8ff70570a1eb0035d5a5731ad88b22b968a745 Mon Sep 17 00:00:00 2001 From: Xiaohu Guo Date: Mon, 25 May 2026 11:36:28 -0500 Subject: [PATCH 6/6] revert pre-flight warmup pass --- benchmarks/single_node/gptoss_fp4_mi355x.sh | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/benchmarks/single_node/gptoss_fp4_mi355x.sh b/benchmarks/single_node/gptoss_fp4_mi355x.sh index c6d25ea6e..05c5a2157 100644 --- a/benchmarks/single_node/gptoss_fp4_mi355x.sh +++ b/benchmarks/single_node/gptoss_fp4_mi355x.sh @@ -57,24 +57,6 @@ SERVER_PID=$! # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -# Pre-flight warmup at the benchmark shape (5 rounds of CONC parallel prompts) -# so the actual measurement below starts at steady-state. Without this, AITER's -# per-shape Triton JIT autotune and torch.compile cache misses bias the first -# ~20-40s of the real benchmark window low (CI cold-cache measurements showed -# a ~10% under-report). vllm bench's built-in --num-warmups defaults to 16 -# serial prompts, which doesn't exercise the full-concurrency kernel variants. -run_benchmark_serving \ - --model "$MODEL" \ - --port "$PORT" \ - --backend vllm \ - --input-len "$ISL" \ - --output-len "$OSL" \ - --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts "$((CONC * 5))" \ - --max-concurrency "$CONC" \ - --result-filename "warmup_$RESULT_FILENAME" \ - --result-dir /tmp/ > /dev/null 2>&1 || true - run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \