Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4331,7 +4331,7 @@ gptoss-fp4-h200-vllm:
- { tp: 8, conc-start: 4, conc-end: 32 }

minimaxm2.5-fp8-h200-vllm:
image: vllm/vllm-openai:v0.18.0
image: vllm/vllm-openai:v0.20.1-ubuntu2404
model: MiniMaxAI/MiniMax-M2.5
model-prefix: minimaxm2.5
runner: h200
Expand All @@ -4343,11 +4343,11 @@ minimaxm2.5-fp8-h200-vllm:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, conc-start: 4, conc-end: 128 }
- { tp: 4, conc-start: 1, conc-end: 256 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, conc-start: 4, conc-end: 128 }
- { tp: 4, conc-start: 1, conc-end: 256 }

dsr1-fp4-gb200-dynamo-trt:
image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/benchmark-multinode-tmpl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ jobs:

- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
token: ${{ secrets.REPO_PAT }}
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@anish-shanbhag, can you please revert this change?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry @functionstackx, CI was failing here since the source branch is from a fork; I've opened #1354 to supersede this PR

token: ${{ secrets.REPO_PAT || github.token }}
fetch-depth: 0
ref: ${{ inputs.ref || github.sha }}
clean: true
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/benchmark-tmpl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ jobs:

- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
token: ${{ secrets.REPO_PAT }}
token: ${{ secrets.REPO_PAT || github.token }}
fetch-depth: 0
ref: ${{ inputs.ref || github.sha }}
clean: true
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/collect-evals.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
- name: Checkout code
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
token: ${{ secrets.REPO_PAT }}
token: ${{ secrets.REPO_PAT || github.token }}
fetch-depth: 0

- name: Download eval artifacts
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/collect-results.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ jobs:
- name: Checkout code
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
token: ${{ secrets.REPO_PAT }}
token: ${{ secrets.REPO_PAT || github.token }}
fetch-depth: 0

- name: Download JSON artifacts
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/e2e-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -352,12 +352,12 @@ jobs:
env:
RESULTS_DIR: "results/"
STATS_FILENAME: "run_stats"
GITHUB_TOKEN: ${{ secrets.REPO_PAT }}
GITHUB_TOKEN: ${{ secrets.REPO_PAT || github.token }}

steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
token: ${{ secrets.REPO_PAT }}
token: ${{ secrets.REPO_PAT || github.token }}
fetch-depth: 0

- name: Download results artifacts
Expand Down
5 changes: 3 additions & 2 deletions .github/workflows/run-sweep.yml
Original file line number Diff line number Diff line change
Expand Up @@ -392,12 +392,12 @@ jobs:
env:
RESULTS_DIR: "results/"
STATS_FILENAME: "run_stats"
GITHUB_TOKEN: ${{ secrets.REPO_PAT }}
GITHUB_TOKEN: ${{ secrets.REPO_PAT || github.token }}

steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
token: ${{ secrets.REPO_PAT }}
token: ${{ secrets.REPO_PAT || github.token }}
fetch-depth: 0

- name: Download results artifacts
Expand Down Expand Up @@ -488,6 +488,7 @@ jobs:
always() &&
github.event_name == 'pull_request' &&
!github.event.pull_request.draft &&
github.event.pull_request.head.repo.full_name == github.repository &&
(
contains(github.event.pull_request.labels.*.name, 'sweep-enabled') ||
contains(github.event.pull_request.labels.*.name, 'full-sweep-enabled')
Expand Down
33 changes: 25 additions & 8 deletions benchmarks/single_node/minimaxm2.5_fp8_h200.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,23 +27,40 @@ if [ "${EVAL_ONLY}" = "true" ]; then
MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
fi

if [ "$EP_SIZE" -ge 1 ]; then
EP=" --enable-expert-parallel"
export PYTHONNOUSERSITE=1
export SAFETENSORS_FAST_GPU=1
export VLLM_USE_DEEP_GEMM=0
export VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER=0
export VLLM_FLOAT32_MATMUL_PRECISION=high

COMPILATION_CONFIG=${COMPILATION_CONFIG:-'{"mode":3,"cudagraph_mode":"PIECEWISE","pass_config":{"fuse_minimax_qk_norm":true}}'}
MAX_NUM_SEQS=${MAX_NUM_SEQS:-512}
MAX_NUM_BATCHED_TOKENS=${MAX_NUM_BATCHED_TOKENS:-32768}

if [ "$EP_SIZE" -gt 1 ]; then
EP=(--enable-expert-parallel)
else
EP=" "
EP=()
fi

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

set -x
vllm serve $MODEL --port $PORT \
--tensor-parallel-size=$TP \
$EP \
vllm serve "$MODEL" --port "$PORT" \
--tensor-parallel-size="$TP" \
"${EP[@]}" \
--gpu-memory-utilization 0.95 \
--max-model-len $MAX_MODEL_LEN \
--max-model-len "$MAX_MODEL_LEN" \
--max-num-seqs "$MAX_NUM_SEQS" \
--max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
--kv-cache-dtype fp8 \
--moe-backend triton \
--attention-backend FLASHINFER \
--enable-flashinfer-autotune \
--compilation-config "$COMPILATION_CONFIG" \
--no-enable-prefix-caching \
--trust-remote-code > $SERVER_LOG 2>&1 &
--trust-remote-code > "$SERVER_LOG" 2>&1 &

SERVER_PID=$!

Expand Down
7 changes: 7 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2343,3 +2343,10 @@
description:
- "Add Qwen3.5-397B-A17B FP8 MI355X ATOM benchmark configs with and without MTP"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1310

- config-keys:
- minimaxm2.5-fp8-h200-vllm
description:
- "Update MiniMax-M2.5 FP8 H200 vLLM to vllm/vllm-openai:v0.20.1-ubuntu2404"
- "Set vLLM serving knobs in benchmarks/single_node/minimaxm2.5_fp8_h200.sh: generated benchmark max-model-len, previous eval max-model-len handling, fp8 KV cache, FlashInfer attention/autotune, Triton MoE, and MiniMax QK norm fusion"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1298