SemiAnalysisAI · anish-shanbhag · May 7, 2026 · May 12, 2026 · functionstackx · May 12, 2026
@@ -4331,7 +4331,7 @@ gptoss-fp4-h200-vllm:
       - { tp: 8, conc-start: 4, conc-end: 32 }
 
 minimaxm2.5-fp8-h200-vllm:
-  image: vllm/vllm-openai:v0.18.0
+  image: vllm/vllm-openai:v0.20.1-ubuntu2404
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
   runner: h200
@@ -4343,11 +4343,11 @@ minimaxm2.5-fp8-h200-vllm:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 4, conc-end: 128 }
+      - { tp: 4, conc-start: 1, conc-end: 256 }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 4, conc-end: 128 }
+      - { tp: 4, conc-start: 1, conc-end: 256 }
 
 dsr1-fp4-gb200-dynamo-trt:
   image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2

diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml
@@ -181,7 +181,7 @@ jobs:
 
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
         with:
-          token: ${{ secrets.REPO_PAT }}
+          token: ${{ secrets.REPO_PAT || github.token }}
           fetch-depth: 0
           ref: ${{ inputs.ref || github.sha }}
           clean: true

diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
@@ -158,7 +158,7 @@ jobs:
 
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
         with:
-          token: ${{ secrets.REPO_PAT }}
+          token: ${{ secrets.REPO_PAT || github.token }}
           fetch-depth: 0
           ref: ${{ inputs.ref || github.sha }}
           clean: true

diff --git a/.github/workflows/collect-evals.yml b/.github/workflows/collect-evals.yml
@@ -18,7 +18,7 @@ jobs:
       - name: Checkout code
         uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
         with:
-          token: ${{ secrets.REPO_PAT }}
+          token: ${{ secrets.REPO_PAT || github.token }}
           fetch-depth: 0
 
       - name: Download eval artifacts

diff --git a/.github/workflows/collect-results.yml b/.github/workflows/collect-results.yml
@@ -19,7 +19,7 @@ jobs:
       - name: Checkout code
         uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
         with:
-          token: ${{ secrets.REPO_PAT }}
+          token: ${{ secrets.REPO_PAT || github.token }}
           fetch-depth: 0
 
       - name: Download JSON artifacts

diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml
@@ -352,12 +352,12 @@ jobs:
         env:
             RESULTS_DIR: "results/"
             STATS_FILENAME: "run_stats"
-            GITHUB_TOKEN: ${{ secrets.REPO_PAT }}
+            GITHUB_TOKEN: ${{ secrets.REPO_PAT || github.token }}
 
         steps:
             - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
               with:
-                  token: ${{ secrets.REPO_PAT }}
+                  token: ${{ secrets.REPO_PAT || github.token }}
                   fetch-depth: 0
 
             - name: Download results artifacts

diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml
@@ -392,12 +392,12 @@ jobs:
         env:
             RESULTS_DIR: "results/"
             STATS_FILENAME: "run_stats"
-            GITHUB_TOKEN: ${{ secrets.REPO_PAT }}
+            GITHUB_TOKEN: ${{ secrets.REPO_PAT || github.token }}
 
         steps:
             - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
               with:
-                  token: ${{ secrets.REPO_PAT }}
+                  token: ${{ secrets.REPO_PAT || github.token }}
                   fetch-depth: 0
 
             - name: Download results artifacts
@@ -488,6 +488,7 @@ jobs:
             always() &&
             github.event_name == 'pull_request' &&
             !github.event.pull_request.draft &&
+            github.event.pull_request.head.repo.full_name == github.repository &&
             (
               contains(github.event.pull_request.labels.*.name, 'sweep-enabled') ||
               contains(github.event.pull_request.labels.*.name, 'full-sweep-enabled')

diff --git a/benchmarks/single_node/minimaxm2.5_fp8_h200.sh b/benchmarks/single_node/minimaxm2.5_fp8_h200.sh
@@ -27,23 +27,40 @@ if [ "${EVAL_ONLY}" = "true" ]; then
     MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
 fi
 
-if [ "$EP_SIZE" -ge 1 ]; then
-  EP=" --enable-expert-parallel"
+export PYTHONNOUSERSITE=1
+export SAFETENSORS_FAST_GPU=1
+export VLLM_USE_DEEP_GEMM=0
+export VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER=0
+export VLLM_FLOAT32_MATMUL_PRECISION=high
+
+COMPILATION_CONFIG=${COMPILATION_CONFIG:-'{"mode":3,"cudagraph_mode":"PIECEWISE","pass_config":{"fuse_minimax_qk_norm":true}}'}
+MAX_NUM_SEQS=${MAX_NUM_SEQS:-512}
+MAX_NUM_BATCHED_TOKENS=${MAX_NUM_BATCHED_TOKENS:-32768}
+
+if [ "$EP_SIZE" -gt 1 ]; then
+  EP=(--enable-expert-parallel)
 else
-  EP=" "
+  EP=()
 fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
 set -x
-vllm serve $MODEL --port $PORT \
---tensor-parallel-size=$TP \
-$EP \
+vllm serve "$MODEL" --port "$PORT" \
+--tensor-parallel-size="$TP" \
+"${EP[@]}" \
 --gpu-memory-utilization 0.95 \
---max-model-len $MAX_MODEL_LEN \
+--max-model-len "$MAX_MODEL_LEN" \
+--max-num-seqs "$MAX_NUM_SEQS" \
+--max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
+--kv-cache-dtype fp8 \
+--moe-backend triton \
+--attention-backend FLASHINFER \
+--enable-flashinfer-autotune \
+--compilation-config "$COMPILATION_CONFIG" \
 --no-enable-prefix-caching \
---trust-remote-code > $SERVER_LOG 2>&1 &
+--trust-remote-code > "$SERVER_LOG" 2>&1 &
 
 SERVER_PID=$!
 

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -2343,3 +2343,10 @@
   description:
     - "Add Qwen3.5-397B-A17B FP8 MI355X ATOM benchmark configs with and without MTP"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1310
+
+- config-keys:
+    - minimaxm2.5-fp8-h200-vllm
+  description:
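+    - "Update MiniMax-M2.5 FP8 H200 vLLM image from vllm/vllm-openai:v0.18.0 to vllm/vllm-openai:v0.20.1-ubuntu2404 and change the search space from TP8 (concurrency 4-128) to TP4 (concurrency 1-256)"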
+    - "Set vLLM serving knobs in benchmarks/single_node/minimaxm2.5_fp8_h200.sh: use the generated benchmark max-model-len, keep the previous eval max-model-len handling, and enable fp8 KV cache, FlashInfer attention with autotune, the Triton MoE backend, and MiniMax QK norm fusion"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1298