diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index f1cec3c52..372e2beec 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -4331,7 +4331,7 @@ gptoss-fp4-h200-vllm: - { tp: 8, conc-start: 4, conc-end: 32 } minimaxm2.5-fp8-h200-vllm: - image: vllm/vllm-openai:v0.18.0 + image: vllm/vllm-openai:v0.20.1-ubuntu2404 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: h200 @@ -4343,11 +4343,11 @@ minimaxm2.5-fp8-h200-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 1, conc-end: 256 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 1, conc-end: 256 } dsr1-fp4-gb200-dynamo-trt: image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index f901b1ff7..4849f5c3d 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -181,7 +181,7 @@ jobs: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: - token: ${{ secrets.REPO_PAT }} + token: ${{ secrets.REPO_PAT || github.token }} fetch-depth: 0 ref: ${{ inputs.ref || github.sha }} clean: true diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 4b7966df0..8551197c3 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -158,7 +158,7 @@ jobs: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: - token: ${{ secrets.REPO_PAT }} + token: ${{ secrets.REPO_PAT || github.token }} fetch-depth: 0 ref: ${{ inputs.ref || github.sha }} clean: true diff --git a/.github/workflows/collect-evals.yml b/.github/workflows/collect-evals.yml index 3b3f08bf1..5e241f175 100644 --- a/.github/workflows/collect-evals.yml +++ b/.github/workflows/collect-evals.yml @@ -18,7 +18,7 @@ jobs: - name: Checkout code uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: - token: ${{ secrets.REPO_PAT }} + token: ${{ secrets.REPO_PAT || github.token }} fetch-depth: 0 - name: Download eval artifacts diff --git a/.github/workflows/collect-results.yml b/.github/workflows/collect-results.yml index 353918609..a9f9817ad 100644 --- a/.github/workflows/collect-results.yml +++ b/.github/workflows/collect-results.yml @@ -19,7 +19,7 @@ jobs: - name: Checkout code uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: - token: ${{ secrets.REPO_PAT }} + token: ${{ secrets.REPO_PAT || github.token }} fetch-depth: 0 - name: Download JSON artifacts diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 9c05340cf..e4114cfd0 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -352,12 +352,12 @@ jobs: env: RESULTS_DIR: "results/" STATS_FILENAME: "run_stats" - GITHUB_TOKEN: ${{ secrets.REPO_PAT }} + GITHUB_TOKEN: ${{ secrets.REPO_PAT || github.token }} steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: - token: ${{ secrets.REPO_PAT }} + token: ${{ secrets.REPO_PAT || github.token }} fetch-depth: 0 - name: Download results artifacts diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml index 4e799bf7f..e5c8f5817 100644 --- a/.github/workflows/run-sweep.yml +++ b/.github/workflows/run-sweep.yml @@ -392,12 +392,12 @@ jobs: env: RESULTS_DIR: "results/" STATS_FILENAME: "run_stats" - GITHUB_TOKEN: ${{ secrets.REPO_PAT }} + GITHUB_TOKEN: ${{ secrets.REPO_PAT || github.token }} steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: - token: ${{ secrets.REPO_PAT }} + token: ${{ secrets.REPO_PAT || github.token }} fetch-depth: 0 - name: Download results artifacts @@ -488,6 +488,7 @@ jobs: always() && github.event_name == 'pull_request' && !github.event.pull_request.draft && + github.event.pull_request.head.repo.full_name == github.repository && ( contains(github.event.pull_request.labels.*.name, 'sweep-enabled') || contains(github.event.pull_request.labels.*.name, 'full-sweep-enabled') diff --git a/benchmarks/single_node/minimaxm2.5_fp8_h200.sh b/benchmarks/single_node/minimaxm2.5_fp8_h200.sh index 84e73b65c..0ed0c8fa3 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_h200.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_h200.sh @@ -27,23 +27,40 @@ if [ "${EVAL_ONLY}" = "true" ]; then MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" fi -if [ "$EP_SIZE" -ge 1 ]; then - EP=" --enable-expert-parallel" +export PYTHONNOUSERSITE=1 +export SAFETENSORS_FAST_GPU=1 +export VLLM_USE_DEEP_GEMM=0 +export VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER=0 +export VLLM_FLOAT32_MATMUL_PRECISION=high + +COMPILATION_CONFIG=${COMPILATION_CONFIG:-'{"mode":3,"cudagraph_mode":"PIECEWISE","pass_config":{"fuse_minimax_qk_norm":true}}'} +MAX_NUM_SEQS=${MAX_NUM_SEQS:-512} +MAX_NUM_BATCHED_TOKENS=${MAX_NUM_BATCHED_TOKENS:-32768} + +if [ "$EP_SIZE" -gt 1 ]; then + EP=(--enable-expert-parallel) else - EP=" " + EP=() fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor set -x -vllm serve $MODEL --port $PORT \ ---tensor-parallel-size=$TP \ -$EP \ +vllm serve "$MODEL" --port "$PORT" \ +--tensor-parallel-size="$TP" \ +"${EP[@]}" \ --gpu-memory-utilization 0.95 \ ---max-model-len $MAX_MODEL_LEN \ +--max-model-len "$MAX_MODEL_LEN" \ +--max-num-seqs "$MAX_NUM_SEQS" \ +--max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \ +--kv-cache-dtype fp8 \ +--moe-backend triton \ +--attention-backend FLASHINFER \ +--enable-flashinfer-autotune \ +--compilation-config "$COMPILATION_CONFIG" \ --no-enable-prefix-caching \ ---trust-remote-code > $SERVER_LOG 2>&1 & +--trust-remote-code > "$SERVER_LOG" 2>&1 & SERVER_PID=$! diff --git a/perf-changelog.yaml b/perf-changelog.yaml index ac8269ef2..e95c5ec2c 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2343,3 +2343,10 @@ description: - "Add Qwen3.5-397B-A17B FP8 MI355X ATOM benchmark configs with and without MTP" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1310 + +- config-keys: + - minimaxm2.5-fp8-h200-vllm + description: + - "Update MiniMax-M2.5 FP8 H200 vLLM to vllm/vllm-openai:v0.20.1-ubuntu2404" + - "Set vLLM serving knobs in benchmarks/single_node/minimaxm2.5_fp8_h200.sh: generated benchmark max-model-len, previous eval max-model-len handling, fp8 KV cache, FlashInfer attention/autotune, Triton MoE, and MiniMax QK norm fusion" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1298