diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 418ad5ab9..dec56ac51 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -960,7 +960,7 @@ gptoss-fp4-mi325x-vllm: - { tp: 8, conc-start: 4, conc-end: 16 } gptoss-fp4-mi355x-vllm: - image: vllm/vllm-openai-rocm:v0.21.0 + image: vllm/vllm-openai-rocm:v0.19.0 model: amd/gpt-oss-120b-w-mxfp4-a-fp8 model-prefix: gptoss runner: mi355x @@ -973,13 +973,15 @@ gptoss-fp4-mi355x-vllm: osl: 1024 search-space: - { tp: 1, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 8 } + - { tp: 2, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 8192 osl: 1024 search-space: - { tp: 1, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 4 } + - { tp: 2, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 8 } gptoss-fp4-mi355x-atom: diff --git a/benchmarks/single_node/gptoss_fp4_mi355x.sh b/benchmarks/single_node/gptoss_fp4_mi355x.sh index 3db687e22..05c5a2157 100644 --- a/benchmarks/single_node/gptoss_fp4_mi355x.sh +++ b/benchmarks/single_node/gptoss_fp4_mi355x.sh @@ -18,27 +18,19 @@ fi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi -# If the machine runs a MEC FW older than 177, RCCL -# cannot reclaim some memory. -# Disable that features to avoid crashes. -# This is related to the changes in the driver at: -# https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates -version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'` -if [[ "$version" == "" || $version -lt 177 ]]; then - export HSA_NO_SCRATCH_RECLAIM=1 -fi - # Set HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES for Ray compatibility in vLLM 0.14+ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" fi -export AMDGCN_USE_BUFFER_OPS=0 export VLLM_ROCM_USE_AITER=1 -export VLLM_ROCM_USE_AITER_TRITON_ROPE=1 +export VLLM_ROCM_USE_AITER_MOE=1 +export VLLM_ROCM_USE_AITER_RMSNORM=1 +export VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION=1 +export VLLM_ROCM_USE_AITER_MHA=0 +export VLLM_ROCM_USE_AITER_FUSED_MOE_A16W4=1 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 -ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN" -FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True" +export HSA_NO_SCRATCH_RECLAIM=1 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} @@ -52,12 +44,13 @@ start_gpu_monitor set -x vllm serve $MODEL --port $PORT \ - $ATTN_BACKEND $FUSE_ROPE_KVCACHE \ --tensor-parallel-size=$TP \ + --max-num-seqs 256 \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ --block-size=64 \ - --no-enable-prefix-caching > $SERVER_LOG 2>&1 & + --no-enable-prefix-caching \ + --async-scheduling > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 8acd720cd..07fe677bb 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3136,3 +3136,11 @@ description: - "Add --use-chat-template to run_benchmark_serving so prompts are formatted with the Qwen chat template (matching the other Qwen MTP recipes)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1555 + +- config-keys: + - gptoss-fp4-mi355x-vllm + description: + - "Pin image back to vllm/vllm-openai-rocm:v0.19.0 while we work through a v0.21 perf regression on MI355x AITER paths (image was bumped to v0.21.0 in #1406)" + - "Switch to AITER-env-based recipe: enable AITER MOE/RMSNORM/UnifiedAttn/A16W4 + HSA_NO_SCRATCH_RECLAIM, drop legacy TRITON_ROPE/BUFFER_OPS/--attention-backend/fuse_rope_kvcache, add --max-num-seqs 256 + --async-scheduling" + - "Add a 5-round pre-flight warmup pass at the benchmark shape (CONC*5 prompts at CONC concurrency, result discarded) before the measured benchmark. AITER's per-shape Triton JIT autotune doesn't finish within vllm bench's built-in 16-prompt warmup, biasing single-shot CI measurements low by ~10%" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1531