SemiAnalysisAI · Oseltamivir · May 19, 2026 · May 13, 2026 · May 13, 2026 · May 14, 2026
@@ -1756,30 +1756,43 @@ dsv4-fp4-mi355x-sglang:
       - { tp: 8, dp-attn: true, conc-start: 16, conc-end: 256 }
       - { tp: 8, dp-attn: false, conc-start: 1, conc-end: 16 }
 
-# vLLM with AITER MLA decode for DSv4 on MI355X (vllm-project/vllm#40889,
-# stacked on #40871). Uses the ATOM MI355X image (ROCm 7.2.2, aiter with
-# MLA decode, MI355X GPU detection); vLLM is rebuilt from the PR branch
-# at runtime by benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh at a
-# pinned SHA. Once both PRs merge into a release, switch to a vLLM ROCm
-# MI355X image and remove the build step.
-dsv4-fp8-mi355x-vllm:
-  image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
+# DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm
+# nightly image. DSv4 base ROCm support (vllm-project/vllm#40871) merged
+# on 2026-05-05, so any nightly built after that includes the
+# DeepseekV4ForCausalLM model class.
+#
+# IMPORTANT: pin to a commit-SHA-suffixed nightly tag rather than the
+# floating `:nightly`. launch_mi355x-amds.sh caches enroot squashfs
+# files keyed on the image string and short-circuits re-import if the
+# file already exists, so the floating tag silently keeps a stale build
+# even after Docker Hub updates `:nightly`.
+#
+# DeepSeek-V4-Pro is FP4+FP8 mixed (FP4 MoE expert weights, FP8 for the
+# rest); InferenceX classifies this as fp4 — same as the sister DSv4
+# mi355x entries (sglang above, atom below). Image and serving flags follow the
+# validated recipe from vllm-project/recipes#433: AITER+AITER_LINEAR, mp
+# executor, triton_unfused MoE (required for the FP4 expert format),
+# async scheduling, max-num-seqs=128, max-num-batched-tokens=8192,
+# gpu-mem-util=0.6. TP8 sweeps conc 4-128 for both sequence lengths; no
+# DEP8 (DP+EP) probe is listed in this entry's search-space yet.
+dsv4-fp4-mi355x-vllm:
+  image: vllm/vllm-openai-rocm:nightly-b50646e5effd7cb5884cd96fdff4c53c18521198
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: mi355x
-  precision: fp8
+  precision: fp4
   framework: vllm
   multinode: false
   scenarios:
     fixed-seq-len:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 1, conc-end: 1 }
+      - { tp: 8, conc-start: 4, conc-end: 128 }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 1, conc-end: 1 }
+      - { tp: 8, conc-start: 4, conc-end: 128 }
 
 # Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650).
 # PR1 of the ATOM DSv4 series still uses torch sparse-attention fallbacks

diff --git a/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh
@@ -0,0 +1,118 @@
+#!/usr/bin/env bash
+set -eo pipefail
+
+# DeepSeek-V4-Pro on MI355X via vLLM.
+# The DeepSeek-V4-Pro checkpoint is mixed-precision FP4+FP8 (FP4 MoE
+# expert weights dominate the ~960 GB footprint, FP8 on attention/norm/
+# router, FP8 KV cache at runtime). InferenceX classifies this as the
+# fp4 variant.
+#
+# Serving flags follow the validated MI355X recipe from
+# vllm-project/recipes#433 (DeepSeek-V4-Pro, TP=8). DEP probes reuse the
+# same ROCm recipe while switching parallelism to vLLM's DP+EP form.
+# Image-pin details live in amd-master.yaml.
+#
+# --moe-backend triton_unfused is required for the FP4 MoE expert
+# weight format used by deepseek-ai/DeepSeek-V4-Pro. Letting --moe-backend
+# default to auto picks a backend that doesn't register the FP4 scale
+# parameters (w13_weight_scale / w2_weight_scale), so safetensors
+# loading raises KeyError.
+#
+# --quantization deepseek_v4_fp8 forces the FP4-aware
+# DeepseekV4FP8Config instead of relying on model_type auto-detection.
+# That keeps the mixed-precision checkpoint on the intended MoE path
+# and avoids falling back to plain Fp8Config, which rejects
+# triton_unfused.
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    MODEL \
+    TP \
+    DP_ATTENTION \
+    CONC \
+    ISL \
+    OSL \
+    MAX_MODEL_LEN \
+    RANDOM_RANGE_RATIO \
+    RESULT_FILENAME
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+
+if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
+    export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
+fi
+
+export VLLM_ROCM_USE_AITER=1
+export VLLM_ROCM_USE_AITER_LINEAR=1
+# Loading the ~960 GB checkpoint can exceed the default engine-ready
+# timeout on a first run from a cold HF cache, so allow up to an hour.
+export VLLM_ENGINE_READY_TIMEOUT_S=3600
+
+SERVER_LOG=/workspace/server.log
+PORT=${PORT:-8888}
+
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
+
+start_gpu_monitor
+
+PARALLEL_ARGS=(--tensor-parallel-size "$TP" --data-parallel-size 1)
+if [ "${DP_ATTENTION}" = "true" ]; then
+    PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP")
+fi
+
+EP_ARGS=()
+if [ "${EP_SIZE:-1}" -gt 1 ]; then
+    EP_ARGS=(--enable-expert-parallel)
+fi
+
+set -x
+vllm serve $MODEL --port $PORT \
+    "${PARALLEL_ARGS[@]}" \
+    "${EP_ARGS[@]}" \
+    --distributed-executor-backend mp \
+    --gpu-memory-utilization 0.6 \
+    --max-model-len $MAX_MODEL_LEN \
+    --max-num-seqs 128 \
+    --max-num-batched-tokens 8192 \
+    --kv-cache-dtype fp8 \
+    --trust-remote-code \
+    --enforce-eager \
+    --async-scheduling \
+    --quantization deepseek_v4_fp8 \
+    --moe-backend triton_unfused \
+    --no-enable-prefix-caching \
+    --tokenizer-mode deepseek_v4 \
+    --reasoning-parser deepseek_v4 > $SERVER_LOG 2>&1 &
+
+SERVER_PID=$!
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend vllm \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts "$((CONC * 10))" \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir /workspace/ \
+    --trust-remote-code
+
+if [ "${RUN_EVAL}" = "true" ]; then
+    run_eval --framework lm-eval --port "$PORT"
+    append_lm_eval_summary
+fi
+
+stop_gpu_monitor
+set +x