diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index cf6709258..3d1a70d42 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -4515,7 +4515,7 @@ gptoss-fp4-h100-vllm: - { tp: 8, conc-start: 4, conc-end: 16 } minimaxm2.5-fp8-h100-vllm: - image: vllm/vllm-openai:v0.21.0 + image: vllm/vllm-openai:v0.19.1-cu130 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: h100 @@ -4527,13 +4527,11 @@ minimaxm2.5-fp8-h100-vllm: - isl: 1024 osl: 1024 search-space: - # - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } - isl: 8192 osl: 1024 search-space: - # - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } # Diverged from minimaxm2.5-fp8-h100-vllm (agentic-coding sibling). Metadata is # identical to origin/main's minimaxm2.5-fp8-h100-vllm; the split exists because this diff --git a/benchmarks/single_node/minimaxm2.5_fp8_h100.sh b/benchmarks/single_node/minimaxm2.5_fp8_h100.sh index 5fd0482cf..258ec7dc1 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_h100.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_h100.sh @@ -9,7 +9,6 @@ check_env_vars \ CONC \ ISL \ OSL \ - MAX_MODEL_LEN \ RANDOM_RANGE_RATIO \ RESULT_FILENAME @@ -28,7 +27,6 @@ PORT=${PORT:-8888} if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context - MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" fi if [ "$EP_SIZE" -gt 1 ]; then @@ -44,12 +42,13 @@ set -x vllm serve $MODEL --host 0.0.0.0 --port $PORT \ --tensor-parallel-size=$TP \ $EP \ ---gpu-memory-utilization 0.90 \ ---max-model-len $MAX_MODEL_LEN \ ---max-num-seqs 256 \ ---no-enable-prefix-caching \ --trust-remote-code \ ---compilation-config '{"cudagraph_mode":"PIECEWISE"}' > $SERVER_LOG 2>&1 & +--enable-auto-tool-choice \ +--tool-call-parser minimax_m2 \ +--reasoning-parser minimax_m2_append_think \ +--compilation-config '{"mode":3,"pass_config":{"fuse_minimax_qk_norm":true}}' \ +--gpu-memory-utilization 0.9 \ +> $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 0fa3b2cd8..58a7368db 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3113,3 +3113,13 @@ - "1k1k and 8k1k STP low-latency and max-throughput srt-slurm recipes under benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/ (ported from upstream srt-slurm PR #152)" - "Wire glm5/fp4 model + dynamo-sglang framework branches into runners/launch_gb300-nv.sh" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1514 + +- config-keys: + - minimaxm2.5-fp8-h100-vllm + description: + - "Update minimaxm2.5-fp8-h100-vllm recipe (v0.19.1)" + - "Image: vllm/vllm-openai:v0.21.0 -> v0.19.1-cu130" + - "Replace recipe flags: drop PIECEWISE/0.90 mem util/256 max-num-seqs/no-prefix-caching/explicit max-model-len; add --enable-auto-tool-choice, --tool-call-parser minimax_m2, --reasoning-parser minimax_m2_append_think, --compilation-config mode:3+fuse_minimax_qk_norm" + - "Search-space: tp:8 ep:8 (TEP=8), conc-end 128 chosen at saturation per local sweep" + - "Local bench: TEP=8 peaks at C=128 with 26923 tot tps (+178% vs TEP=4 peak at C=32 in May 6 j11600242 sweep)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1516