diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 3d1a70d42..f8cc486b2 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -4754,11 +4754,11 @@ minimaxm2.5-fp8-h200-vllm:
   - isl: 1024
     osl: 1024
     search-space:
-    - { tp: 8, conc-start: 4, conc-end: 128 }
+    - { tp: 4, conc-start: 1, conc-end: 256 }
   - isl: 8192
     osl: 1024
     search-space:
-    - { tp: 8, conc-start: 4, conc-end: 128 }
+    - { tp: 4, conc-start: 1, conc-end: 256 }
 
 # Diverged from minimaxm2.5-fp8-h200-vllm (agentic-coding sibling). Metadata is
 # identical to origin/main's minimaxm2.5-fp8-h200-vllm; the split exists because this
diff --git a/benchmarks/single_node/minimaxm2.5_fp8_h200.sh b/benchmarks/single_node/minimaxm2.5_fp8_h200.sh
index 447a4510e..2e87cd828 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_h200.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp8_h200.sh
@@ -27,23 +27,46 @@ if [ "${EVAL_ONLY}" = "true" ]; then
     MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
 fi
 
-if [ "$EP_SIZE" -ge 1 ]; then
-  EP=" --enable-expert-parallel"
+# Environment exported to child processes, including the vLLM server below.
+export PYTHONNOUSERSITE=1
+export SAFETENSORS_FAST_GPU=1
+export VLLM_USE_DEEP_GEMM=0
+export VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER=0
+export VLLM_FLOAT32_MATMUL_PRECISION=high
+
+# Serving knobs: a non-empty value already set by the caller wins; otherwise
+# the defaults below apply.
+COMPILATION_CONFIG=${COMPILATION_CONFIG:-'{"mode":3,"cudagraph_mode":"PIECEWISE","pass_config":{"fuse_minimax_qk_norm":true}}'}
+MAX_NUM_SEQS=${MAX_NUM_SEQS:-512}
+MAX_NUM_BATCHED_TOKENS=${MAX_NUM_BATCHED_TOKENS:-32768}
+
+# EP is a bash array so that the disabled case expands to zero arguments in
+# the vllm serve command. The flag is passed only when EP_SIZE > 1; an unset
+# or empty EP_SIZE is treated as 1.
+if [ "${EP_SIZE:-1}" -gt 1 ]; then
+  EP=(--enable-expert-parallel)
 else
-  EP=" "
+  EP=()
 fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
 set -x
-vllm serve $MODEL --port $PORT \
---tensor-parallel-size=$TP \
-$EP \
+vllm serve "$MODEL" --port "$PORT" \
+--tensor-parallel-size="$TP" \
+"${EP[@]}" \
 --gpu-memory-utilization 0.95 \
---max-model-len $MAX_MODEL_LEN \
+--max-model-len "$MAX_MODEL_LEN" \
+--max-num-seqs "$MAX_NUM_SEQS" \
+--max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
+--kv-cache-dtype fp8 \
+--moe-backend triton \
+--attention-backend FLASHINFER \
+--enable-flashinfer-autotune \
+--compilation-config "$COMPILATION_CONFIG" \
 --no-enable-prefix-caching \
---trust-remote-code > $SERVER_LOG 2>&1 &
+--trust-remote-code > "$SERVER_LOG" 2>&1 &
 
 SERVER_PID=$!
 
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 208a2da6f..614b6104e 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3129,3 +3129,12 @@
   description:
     - "Add --use-chat-template to run_benchmark_serving so prompts are formatted with the Qwen chat template (matching the other Qwen MTP recipes)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1555
+
+- config-keys:
+    - minimaxm2.5-fp8-h200-vllm
+  description:
+    - "Update MiniMax-M2.5 FP8 H200 vLLM to vllm/vllm-openai:v0.20.1-ubuntu2404"
+    - "Set vLLM serving knobs in benchmarks/single_node/minimaxm2.5_fp8_h200.sh: generated benchmark max-model-len, previous eval max-model-len handling, fp8 KV cache, FlashInfer attention/autotune, Triton MoE, and MiniMax QK norm fusion"
+    - "Retune the search space for isl 1024 and isl 8192 (osl 1024) in .github/configs/nvidia-master.yaml: tp 8 -> 4, concurrency 4-128 -> 1-256"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1354
+