vllm-project · faradawn · May 21, 2026 · May 21, 2026 · gemini-code-assist · May 21, 2026
diff --git a/models/MiniMaxAI/MiniMax-M2.5.yaml b/models/MiniMaxAI/MiniMax-M2.5.yaml
@@ -3,7 +3,7 @@ meta:
   slug: "minimax-m2.5"
   provider: "MiniMax"
   description: "MiniMax M2.5 MoE language model (230B total / 10B active) for coding, agent toolchains, and long-context reasoning — native FP8 checkpoint"
-  date_updated: 2026-05-18
+  date_updated: 2026-05-21
   difficulty: intermediate
   tasks:
     - text
@@ -27,7 +27,7 @@ model:
   base_args:
     - "--trust-remote-code"
     - "--compilation-config"
-    - '{"mode":3,"cudagraph_mode":"PIECEWISE","pass_config":{"fuse_minimax_qk_norm":true}}'
+    - '{"mode":3,"pass_config":{"fuse_minimax_qk_norm":true}}'
   base_env: {}
 
 features:
@@ -41,7 +41,7 @@ features:
     description: "MiniMax M2 reasoning parser for chain-of-thought extraction"
     args:
       - "--reasoning-parser"
-      - "minimax_m2"
+      - "minimax_m2_append_think"
 
 opt_in_features: []
 
@@ -150,9 +150,9 @@ guide: |
         --attention-backend FLASHINFER \
         --enable-flashinfer-autotune \
         --tool-call-parser minimax_m2 \
-        --reasoning-parser minimax_m2 \
+        --reasoning-parser minimax_m2_append_think \
         --enable-auto-tool-choice \
-        --compilation-config '{"mode":3,"cudagraph_mode":"PIECEWISE","pass_config":{"fuse_minimax_qk_norm":true}}' \
+        --compilation-config '{"mode":3,"pass_config":{"fuse_minimax_qk_norm":true}}' \
         --trust-remote-code
   ```
 
@@ -183,18 +183,29 @@ guide: |
 
   Pure TP8 is not supported. For >4 GPUs use DP+EP or TP+EP.
 
-  ### TP4+EP (recommended for H100)
+  ### TP8+EP (recommended for H100)
 
   ```bash
   vllm serve MiniMaxAI/MiniMax-M2.5 \
-    --tensor-parallel-size 4 \
+    --tensor-parallel-size 8 \
     --enable-expert-parallel \
     --tool-call-parser minimax_m2 \
-    --reasoning-parser minimax_m2 \
+    --reasoning-parser minimax_m2_append_think \
     --compilation-config '{"mode":3,"pass_config":{"fuse_minimax_qk_norm":true}}' \
     --enable-auto-tool-choice
   ```
 
+  ### H100 NVFP4 checkpoint (TP=8, without EP)
+
+  ```bash
+  vllm serve nvidia/MiniMax-M2.5-NVFP4 \
+    --tensor-parallel-size 8 \
+    --tool-call-parser minimax_m2 \
+    --reasoning-parser minimax_m2_append_think \
+    --enable-auto-tool-choice \
+    --trust-remote-code
+  ```
+
   ### AMD ROCm
 
   ```bash