SemiAnalysisAI · Ankur-singh · May 26, 2026 · May 7, 2026 · May 26, 2026
@@ -4754,11 +4754,11 @@ minimaxm2.5-fp8-h200-vllm:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 4, conc-end: 128 }
+      - { tp: 4, conc-start: 1, conc-end: 256 }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 4, conc-end: 128 }
+      - { tp: 4, conc-start: 1, conc-end: 256 }
 
 # Diverged from minimaxm2.5-fp8-h200-vllm (agentic-coding sibling). Metadata is
 # identical to origin/main's minimaxm2.5-fp8-h200-vllm; the split exists because this

diff --git a/benchmarks/single_node/minimaxm2.5_fp8_h200.sh b/benchmarks/single_node/minimaxm2.5_fp8_h200.sh
@@ -27,23 +27,40 @@ if [ "${EVAL_ONLY}" = "true" ]; then
     MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
 fi
 
-if [ "$EP_SIZE" -ge 1 ]; then
-  EP=" --enable-expert-parallel"
+export PYTHONNOUSERSITE=1
+export SAFETENSORS_FAST_GPU=1
+export VLLM_USE_DEEP_GEMM=0
+export VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER=0
+export VLLM_FLOAT32_MATMUL_PRECISION=high
+
+COMPILATION_CONFIG=${COMPILATION_CONFIG:-'{"mode":3,"cudagraph_mode":"PIECEWISE","pass_config":{"fuse_minimax_qk_norm":true}}'}
+MAX_NUM_SEQS=${MAX_NUM_SEQS:-512}
+MAX_NUM_BATCHED_TOKENS=${MAX_NUM_BATCHED_TOKENS:-32768}
+
+if [ "$EP_SIZE" -gt 1 ]; then
+  EP=(--enable-expert-parallel)
 else
-  EP=" "
+  EP=()
 fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
 set -x
-vllm serve $MODEL --port $PORT \
---tensor-parallel-size=$TP \
-$EP \
+vllm serve "$MODEL" --port "$PORT" \
+--tensor-parallel-size="$TP" \
+"${EP[@]}" \
 --gpu-memory-utilization 0.95 \
---max-model-len $MAX_MODEL_LEN \
+--max-model-len "$MAX_MODEL_LEN" \
+--max-num-seqs "$MAX_NUM_SEQS" \
+--max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
+--kv-cache-dtype fp8 \
+--moe-backend triton \
+--attention-backend FLASHINFER \
+--enable-flashinfer-autotune \
+--compilation-config "$COMPILATION_CONFIG" \
 --no-enable-prefix-caching \
---trust-remote-code > $SERVER_LOG 2>&1 &
+--trust-remote-code > "$SERVER_LOG" 2>&1 &
 
 SERVER_PID=$!
 

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3129,3 +3129,11 @@
   description:
     - "Add --use-chat-template to run_benchmark_serving so prompts are formatted with the Qwen chat template (matching the other Qwen MTP recipes)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1555
+
+- config-keys:
+    - minimaxm2.5-fp8-h200-vllm
+  description:
+    - "Update MiniMax-M2.5 FP8 H200 vLLM to vllm/vllm-openai:v0.20.1-ubuntu2404"
+    - "Set vLLM serving knobs in benchmarks/single_node/minimaxm2.5_fp8_h200.sh: keep the benchmark-generated max-model-len and the existing eval max-model-len handling; add max-num-seqs and max-num-batched-tokens limits, fp8 KV cache, FlashInfer attention with autotune, the Triton MoE backend, and MiniMax QK norm fusion"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1354
+