Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1524,7 +1524,9 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
- "DECODE_MTP_SIZE=2"

kimik2.5-fp4-mi355x-vllm-disagg:
Comment thread
simondanielsson marked this conversation as resolved.
image: vllm/vllm-openai-rocm:nightly-bf610c2f56764e1b30bc6065f4ceace3d6e59036
# TODO(simondanielsson): change to pinned version once https://github.com/vllm-project/vllm/pull/40344
# is part of official release, likely 0.22.0.
image: vllm/vllm-openai-rocm:nightly-3f0a91bb96f8d72e0498b95c166e817deae14d62
model: amd/Kimi-K2.5-MXFP4
model-prefix: kimik2.5
runner: mi355x-disagg
Comment thread
simondanielsson marked this conversation as resolved.
Expand All @@ -1547,7 +1549,6 @@ kimik2.5-fp4-mi355x-vllm-disagg:
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
- "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
decode:
num-worker: 2
tp: 8
Expand All @@ -1568,7 +1569,6 @@ kimik2.5-fp4-mi355x-vllm-disagg:
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
- "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
decode:
num-worker: 2
tp: 8
Expand All @@ -1578,7 +1578,9 @@ kimik2.5-fp4-mi355x-vllm-disagg:
- "DECODE_NODES=2"

minimaxm2.5-fp8-mi355x-vllm-disagg:
image: vllm/vllm-openai-rocm:nightly-a6682d1d259cca69a9ae737ea5608fbbe7520031
# TODO(simondanielsson): change to pinned version once https://github.com/vllm-project/vllm/pull/40344

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you pin a specific nightly hash here instead of just the generic "nightly" tag?

# is part of official release, likely 0.22.0.
image: vllm/vllm-openai-rocm:nightly-3f0a91bb96f8d72e0498b95c166e817deae14d62
model: MiniMaxAI/MiniMax-M2.5
model-prefix: minimaxm2.5
runner: mi355x-disagg
Expand All @@ -1603,7 +1605,6 @@ minimaxm2.5-fp8-mi355x-vllm-disagg:
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
- "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
decode:
num-worker: 2
tp: 8
Expand All @@ -1624,7 +1625,6 @@ minimaxm2.5-fp8-mi355x-vllm-disagg:
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
- "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
decode:
num-worker: 2
tp: 8
Expand Down
3 changes: 1 addition & 2 deletions benchmarks/multi_node/amd_utils/job.slurm
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,7 @@ SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_')
export DOCKER_CONT_NAME="container_${ENGINE}_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}"

# vLLM external router container
VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-vllm/vllm-router:nightly-20260511-e667ebb}"
VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-vllm/vllm-router:nightly-20260603-e667ebb}"
ROUTER_CONT_NAME="router_vllm_${SANITIZED_USER}_${SLURM_JOB_ID}"
export RUN_FILE_FULL="$WS_PATH/${RUN_FILE}"

Expand Down Expand Up @@ -399,7 +399,6 @@ if [[ "$ENGINE" == "vllm-disagg" ]]; then
-e UCX_LOG_LEVEL=warn
-e HSA_ENABLE_SDMA=1
-e PROXY_STREAM_IDLE_TIMEOUT=\${PROXY_STREAM_IDLE_TIMEOUT:-300}
-e VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-1}
-e PYTHONPYCACHEPREFIX=/tmp/pycache
)
else
Expand Down
6 changes: 3 additions & 3 deletions benchmarks/multi_node/amd_utils/models_vllm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,15 @@ amd-Llama-3.3-70B-Instruct-FP8-KV:

Kimi-K2.5-MXFP4:
prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data"
decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data"
decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori_low_latency --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data"
env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=3600"
hf_dir: "models--amd--Kimi-K2.5-MXFP4"

MiniMax-M2.5:
# The AITER fused-MoE kernel fmoe_bf16_blockscaleFp8_g1u1_vs_silu_32x384 for gfx950 writes out of bounds (OOB) when run with MiniMax's shapes at M=8K (= num batched tokens), crashing vllm during AITER warmup.
# Set token budget to 4k to avoid using that shape, instead of disabling AITER_MOE.
prefill_flags: "--max-num-batched-tokens 4K --tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32"
decode_flags: "--max-num-batched-tokens 4K --tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32"
prefill_flags: "--max-num-batched-tokens 4K --tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori_low_latency --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32"
decode_flags: "--max-num-batched-tokens 4K --tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori_low_latency --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32"
env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 VLLM_ENGINE_READY_TIMEOUT_S=3600 VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1"
hf_dir: "models--MiniMaxAI--MiniMax-M2.5"

Expand Down
6 changes: 3 additions & 3 deletions benchmarks/multi_node/amd_utils/server_vllm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ if [ "$NODE_RANK" -eq 0 ]; then
--served-model-name ${SERVED_MODEL} \
--port $SERVER_PORT \
--trust-remote-code \
--kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \
--kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\", \"read_mode\": true}}' \
${PREFILL_SERVER_CONFIG}"

if [[ "$DRY_RUN" -eq 1 ]]; then
Expand Down Expand Up @@ -422,7 +422,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then
--served-model-name ${SERVED_MODEL} \
--port $SERVER_PORT \
--trust-remote-code \
--kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \
--kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\", \"read_mode\": true}}' \
${PREFILL_SERVER_CONFIG}"

if [[ "$DRY_RUN" -eq 1 ]]; then
Expand Down Expand Up @@ -478,7 +478,7 @@ else
--served-model-name ${SERVED_MODEL} \
--port $SERVER_PORT \
--trust-remote-code \
--kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_consumer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \
--kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_consumer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\", \"read_mode\": true}}' \
${DECODE_SERVER_CONFIG}"

if [[ "$DRY_RUN" -eq 1 ]]; then
Expand Down
Loading