Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -960,7 +960,7 @@ gptoss-fp4-mi325x-vllm:
- { tp: 8, conc-start: 4, conc-end: 16 }

gptoss-fp4-mi355x-vllm:
image: vllm/vllm-openai-rocm:v0.21.0
image: vllm/vllm-openai-rocm:v0.19.0
model: amd/gpt-oss-120b-w-mxfp4-a-fp8
model-prefix: gptoss
runner: mi355x
Expand All @@ -973,13 +973,15 @@ gptoss-fp4-mi355x-vllm:
osl: 1024
search-space:
- { tp: 1, conc-start: 4, conc-end: 128 }
- { tp: 4, conc-start: 4, conc-end: 8 }
- { tp: 2, conc-start: 4, conc-end: 128 }
- { tp: 4, conc-start: 4, conc-end: 64 }
- { tp: 8, conc-start: 4, conc-end: 16 }
- isl: 8192
osl: 1024
search-space:
- { tp: 1, conc-start: 4, conc-end: 128 }
- { tp: 4, conc-start: 4, conc-end: 4 }
- { tp: 2, conc-start: 4, conc-end: 128 }
- { tp: 4, conc-start: 4, conc-end: 64 }
- { tp: 8, conc-start: 4, conc-end: 8 }

gptoss-fp4-mi355x-atom:
Expand Down
25 changes: 9 additions & 16 deletions benchmarks/single_node/gptoss_fp4_mi355x.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,27 +18,19 @@ fi

if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi

# If the machine runs MEC firmware older than version 177, RCCL
# cannot reclaim some memory.
# Disable scratch reclaim in that case to avoid crashes.
# This is related to the changes in the driver at:
# https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates
version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'`
if [[ "$version" == "" || $version -lt 177 ]]; then
export HSA_NO_SCRATCH_RECLAIM=1
fi

# Set HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES for Ray compatibility in vLLM 0.14+
if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
fi

export AMDGCN_USE_BUFFER_OPS=0
export VLLM_ROCM_USE_AITER=1
export VLLM_ROCM_USE_AITER_TRITON_ROPE=1
export VLLM_ROCM_USE_AITER_MOE=1
export VLLM_ROCM_USE_AITER_RMSNORM=1
export VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION=1
export VLLM_ROCM_USE_AITER_MHA=0
export VLLM_ROCM_USE_AITER_FUSED_MOE_A16W4=1
export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN"
FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True"
export HSA_NO_SCRATCH_RECLAIM=1

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}
Expand All @@ -52,12 +44,13 @@ start_gpu_monitor

set -x
vllm serve $MODEL --port $PORT \
$ATTN_BACKEND $FUSE_ROPE_KVCACHE \
--tensor-parallel-size=$TP \
--max-num-seqs 256 \
--gpu-memory-utilization 0.95 \
--max-model-len $MAX_MODEL_LEN \
--block-size=64 \
--no-enable-prefix-caching > $SERVER_LOG 2>&1 &
--no-enable-prefix-caching \
--async-scheduling > $SERVER_LOG 2>&1 &

SERVER_PID=$!

Expand Down
8 changes: 8 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3136,3 +3136,11 @@
description:
- "Add --use-chat-template to run_benchmark_serving so prompts are formatted with the Qwen chat template (matching the other Qwen MTP recipes)"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1555

- config-keys:
- gptoss-fp4-mi355x-vllm
description:
- "Pin image back to vllm/vllm-openai-rocm:v0.19.0 while we work through a v0.21 perf regression on MI355x AITER paths (image was bumped to v0.21.0 in #1406)"
- "Switch to AITER-env-based recipe: enable AITER MOE/RMSNORM/UnifiedAttn/A16W4 + HSA_NO_SCRATCH_RECLAIM, drop legacy TRITON_ROPE/BUFFER_OPS/--attention-backend/fuse_rope_kvcache, add --max-num-seqs 256 + --async-scheduling"
- "Add a 5-round pre-flight warmup pass at the benchmark shape (CONC*5 prompts at CONC concurrency, result discarded) before the measured benchmark. AITER's per-shape Triton JIT autotune doesn't finish within vllm bench's built-in 16-prompt warmup, biasing single-shot CI measurements low by ~10%"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1531