Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 24 additions & 11 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1756,30 +1756,43 @@ dsv4-fp4-mi355x-sglang:
- { tp: 8, dp-attn: true, conc-start: 16, conc-end: 256 }
- { tp: 8, dp-attn: false, conc-start: 1, conc-end: 16 }

# vLLM with AITER MLA decode for DSv4 on MI355X (vllm-project/vllm#40889,
# stacked on #40871). Uses the ATOM MI355X image (ROCm 7.2.2, aiter with
# MLA decode, MI355X GPU detection); vLLM is rebuilt from the PR branch
# at runtime by benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh at a
# pinned SHA. Once both PRs merge into a release, switch to a vLLM ROCm
# MI355X image and remove the build step.
dsv4-fp8-mi355x-vllm:
image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
# DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm
# nightly image. DSv4 base ROCm support (vllm-project/vllm#40871) merged
# on 2026-05-05, so any nightly built after that includes the
# DeepseekV4ForCausalLM model class.
#
# IMPORTANT: pin to a SHA-suffixed nightly tag (`nightly-<sha>`) rather
# than the floating `:nightly`. launch_mi355x-amds.sh caches enroot
# squashfs files keyed on the image string and short-circuits re-import
# if the file already exists, so the floating tag silently keeps a stale
# build even after Docker Hub updates `:nightly`.
#
# DeepSeek-V4-Pro is FP4+FP8 mixed (FP4 MoE expert weights, FP8 for the
# rest); InferenceX classifies this as fp4 — same as the sister sglang
# (above) and atom (below) DSv4 mi355x entries. Image and serving flags
# follow the validated recipe from vllm-project/recipes#433:
# AITER+AITER_LINEAR, mp executor, triton_unfused MoE (required for the
# FP4 expert format), async scheduling, max-num-seqs=128,
# max-num-batched-tokens=8192, gpu-mem-util=0.6. TP8 sweeps conc 4-128.
# NOTE(review): a DEP8 conc=64 probe (to validate the ROCm DP+EP path)
# is not present in the search-space below — add the dp-attn row or drop
# this note.
dsv4-fp4-mi355x-vllm:
  image: vllm/vllm-openai-rocm:nightly-b50646e5effd7cb5884cd96fdff4c53c18521198
  model: deepseek-ai/DeepSeek-V4-Pro
  model-prefix: dsv4
  runner: mi355x
  # Exactly one `precision` key: a mapping must not repeat a key (most
  # parsers silently keep the last one). fp4 matches the entry name.
  precision: fp4
  framework: vllm
  multinode: false
  scenarios:
    fixed-seq-len:
      # Each item is one (isl, osl) shape with its own concurrency sweep.
      - isl: 1024
        osl: 1024
        search-space:
          - { tp: 8, conc-start: 1, conc-end: 1 }
          - { tp: 8, conc-start: 4, conc-end: 128 }
      - isl: 8192
        osl: 1024
        search-space:
          - { tp: 8, conc-start: 1, conc-end: 1 }
          - { tp: 8, conc-start: 4, conc-end: 128 }

# Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650).
# PR1 of the ATOM DSv4 series still uses torch sparse-attention fallbacks
Expand Down
118 changes: 118 additions & 0 deletions benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
#!/usr/bin/env bash
set -eo pipefail

# DeepSeek-V4-Pro on MI355X via vLLM.
# The DeepSeek-V4-Pro checkpoint is mixed-precision FP4+FP8 (FP4 MoE
# expert weights dominate the ~960 GB footprint, FP8 on attention/norm/
# router, FP8 KV cache at runtime). InferenceX classifies this as the
# fp4 variant.
#
# Serving flags follow the validated MI355X recipe from
# vllm-project/recipes#433 (DeepSeek-V4-Pro, TP=8). DEP probes reuse the
# same ROCm recipe while switching parallelism to vLLM's DP+EP form.
# Image-pin details live in amd-master.yaml.
#
# --moe-backend triton_unfused is required for the FP4 MoE expert
# weight format used by deepseek-ai/DeepSeek-V4-Pro. Letting --moe-backend
# default to auto picks a backend that doesn't register the FP4 scale
# parameters (w13_weight_scale / w2_weight_scale), so safetensors
# loading raises KeyError.
#
# --quantization deepseek_v4_fp8 forces the FP4-aware
# DeepseekV4FP8Config instead of relying on model_type auto-detection.
# That keeps the mixed-precision checkpoint on the intended MoE path
# and avoids falling back to plain Fp8Config, which rejects
# triton_unfused.
#
# Required environment (enforced by check_env_vars below): MODEL, TP,
# DP_ATTENTION, CONC, ISL, OSL, MAX_MODEL_LEN, RANDOM_RANGE_RATIO,
# RESULT_FILENAME.
# Optional environment read by this script: PORT (default 8888),
# EP_SIZE (default 1), EVAL_ONLY, RUN_EVAL, ROCR_VISIBLE_DEVICES.

# Shared helpers (check_env_vars, wait_for_server_ready, ...) come from
# the benchmark library one directory up from this script.
source "$(dirname "$0")/../benchmark_lib.sh"

check_env_vars \
    MODEL \
    TP \
    DP_ATTENTION \
    CONC \
    ISL \
    OSL \
    MAX_MODEL_LEN \
    RANDOM_RANGE_RATIO \
    RESULT_FILENAME

if [[ -n "$SLURM_JOB_ID" ]]; then
    echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

# Download the model unless MODEL is an absolute path.
if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi

# Mirror an existing ROCR device mask into HIP_VISIBLE_DEVICES.
if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
    export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
fi

export VLLM_ROCM_USE_AITER=1
export VLLM_ROCM_USE_AITER_LINEAR=1
# Loading the ~960 GB checkpoint weights can exceed the default
# engine-ready timeout on first run from cold HF cache.
export VLLM_ENGINE_READY_TIMEOUT_S=3600

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}

# Eval-only runs replace the benchmark context length with
# EVAL_MAX_MODEL_LEN (presumably set by setup_eval_context — confirm in
# benchmark_lib.sh).
if [ "${EVAL_ONLY}" = "true" ]; then
    setup_eval_context
    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
fi

start_gpu_monitor

# Default: tensor parallelism across TP ranks, one data-parallel replica.
# With DP_ATTENTION=true the TP value is reused as the data-parallel
# size and tensor parallelism drops to 1.
PARALLEL_ARGS=(--tensor-parallel-size "$TP" --data-parallel-size 1)
if [ "${DP_ATTENTION}" = "true" ]; then
    PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP")
fi

# Expert parallelism is opt-in: only enabled when EP_SIZE is above 1.
EP_ARGS=()
if [ "${EP_SIZE:-1}" -gt 1 ]; then
    EP_ARGS=(--enable-expert-parallel)
fi

set -x
# Every expansion is quoted so a MODEL path (or any other value) with
# whitespace or glob characters reaches vllm as a single argument,
# matching how the same variables are passed to the helpers below.
vllm serve "$MODEL" --port "$PORT" \
    "${PARALLEL_ARGS[@]}" \
    "${EP_ARGS[@]}" \
    --distributed-executor-backend mp \
    --gpu-memory-utilization 0.6 \
    --max-model-len "$MAX_MODEL_LEN" \
    --max-num-seqs 128 \
    --max-num-batched-tokens 8192 \
    --kv-cache-dtype fp8 \
    --trust-remote-code \
    --enforce-eager \
    --async-scheduling \
    --quantization deepseek_v4_fp8 \
    --moe-backend triton_unfused \
    --no-enable-prefix-caching \
    --tokenizer-mode deepseek_v4 \
    --reasoning-parser deepseek_v4 > "$SERVER_LOG" 2>&1 &

# PID of the backgrounded server, handed to the readiness helper.
SERVER_PID=$!

wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# Ten prompts per concurrency slot.
run_benchmark_serving \
    --model "$MODEL" \
    --port "$PORT" \
    --backend vllm \
    --input-len "$ISL" \
    --output-len "$OSL" \
    --random-range-ratio "$RANDOM_RANGE_RATIO" \
    --num-prompts "$((CONC * 10))" \
    --max-concurrency "$CONC" \
    --result-filename "$RESULT_FILENAME" \
    --result-dir /workspace/ \
    --trust-remote-code

if [ "${RUN_EVAL}" = "true" ]; then
    run_eval --framework lm-eval --port "$PORT"
    append_lm_eval_summary
fi

stop_gpu_monitor
set +x
Loading
Loading