From 38961b04a68a195dcad07611cdfc31b3962bc202 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Wed, 13 May 2026 16:53:38 -0700 Subject: [PATCH 01/14] Rename dsv4-fp8-mi355x-vllm to dsv4-fp4-mi355x-vllm and adopt recipes#433 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The deepseek-ai/DeepSeek-V4-Pro checkpoint is FP4+FP8 mixed (FP4 MoE expert weights dominate the ~960 GB footprint, with FP8 only on attention/norm/router and FP8 KV cache). Reclassify the vLLM MI355X benchmark as fp4 — matching dsv4-fp4-mi355x-sglang and dsv4-fp4-mi355x-atom, which use the same checkpoint. Also apply the validated MI355X serving recipe from vllm-project/recipes#433 (DeepSeek-V4-Pro, TP=8): * Rename benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh -> dsv4_fp4_mi355x_vllm.sh; remove dsv4-fp8-mi355x-vllm from amd-master.yaml; add dsv4-fp4-mi355x-vllm next to its fp4 siblings * Add VLLM_ROCM_USE_AITER_LINEAR=1 env var * Add --distributed-executor-backend mp, --max-num-batched-tokens 8192, --async-scheduling server flags * Tune --gpu-memory-utilization 0.90 -> 0.6 and --max-num-seqs 32 -> 128 * Drop --tool-call-parser / --enable-auto-tool-choice (not in recipe, not exercised by throughput benchmarks) * Expand sweep from conc=1 to conc 4-64 to match dsv4-fp4-mi355x-sglang for vLLM<->SGLang comparability now that max-num-seqs=128 allows it --- .github/configs/amd-master.yaml | 18 ++++++++++----- ...mi355x_vllm.sh => dsv4_fp4_mi355x_vllm.sh} | 22 ++++++++++++++----- perf-changelog.yaml | 13 +++++++++++ 3 files changed, 43 insertions(+), 10 deletions(-) rename benchmarks/single_node/{dsv4_fp8_mi355x_vllm.sh => dsv4_fp4_mi355x_vllm.sh} (95%) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 692725bc1..1935eebe5 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1614,15 +1614,23 @@ dsv4-fp4-mi355x-sglang: # vLLM with AITER MLA decode 
for DSv4 on MI355X (vllm-project/vllm#40889, # stacked on #40871). Uses the ATOM MI355X image (ROCm 7.2.2, aiter with # MLA decode, MI355X GPU detection); vLLM is rebuilt from the PR branch -# at runtime by benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh at a +# at runtime by benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh at a # pinned SHA. Once both PRs merge into a release, switch to a vLLM ROCm # MI355X image and remove the build step. -dsv4-fp8-mi355x-vllm: +# +# DeepSeek-V4-Pro is FP4+FP8 mixed (FP4 MoE expert weights, FP8 for the +# rest); InferenceX classifies this as fp4 — same as the sister sglang +# and atom DSv4 mi355x entries below. Serving flags follow the validated +# recipe from vllm-project/recipes#433: AITER+AITER_LINEAR, mp executor, +# triton_unfused MoE, async scheduling, max-num-seqs=128, +# max-num-batched-tokens=8192, gpu-mem-util=0.6. Sweep matches the +# sister sglang config (conc 4-64) so vLLM↔SGLang are comparable. +dsv4-fp4-mi355x-vllm: image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: mi355x - precision: fp8 + precision: fp4 framework: vllm multinode: false scenarios: @@ -1630,11 +1638,11 @@ dsv4-fp8-mi355x-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 1, conc-end: 1 } + - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, conc-start: 1, conc-end: 1 } + - { tp: 8, conc-start: 4, conc-end: 64 } # Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650). 
# PR1 of the ATOM DSv4 series still uses torch sparse-attention fallbacks diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh similarity index 95% rename from benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh rename to benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh index 642700a52..66bf0d7c2 100755 --- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh +++ b/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh @@ -1,10 +1,20 @@ #!/usr/bin/env bash set -eo pipefail -# DeepSeek-V4-Pro FP8 on MI355X via vLLM with AITER MLA decode. +# DeepSeek-V4-Pro on MI355X via vLLM with AITER MLA decode. +# The DeepSeek-V4-Pro checkpoint is mixed-precision FP4+FP8: MoE expert +# weights are stored in FP4 (the dominant storage class for this +# ~960GB model), with attention/norm/router params in FP8 and KV cache +# in FP8 at runtime. InferenceX classifies this as the fp4 variant. +# # Based on vllm-project/vllm#40889 (AITER-accelerated sparse MLA decode, # stacked on #40871 which adds base DSv4 ROCm support). # +# Serving flags follow the validated MI355X recipe from +# vllm-project/recipes#433 (DeepSeek-V4-Pro, TP=8): AITER + AITER_LINEAR, +# triton_unfused MoE, mp executor, async scheduling, max-num-seqs=128, +# max-num-batched-tokens=8192, gpu-mem-util=0.6. +# # Uses the ATOM MI355X image as the base (ROCm 7.2.2, PyTorch 2.10, # aiter with MLA decode, MI355X GPU detection). vLLM is rebuilt from # the PR branch on top. 
Once both PRs merge into a release, switch to @@ -33,6 +43,7 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then fi export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_USE_AITER_LINEAR=1 export VLLM_TARGET_DEVICE=rocm export VLLM_ENGINE_READY_TIMEOUT_S=3600 export VLLM_PLUGINS="" @@ -487,17 +498,18 @@ start_gpu_monitor set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size $TP \ - --gpu-memory-utilization 0.90 \ + --distributed-executor-backend mp \ + --gpu-memory-utilization 0.6 \ --max-model-len $MAX_MODEL_LEN \ + --max-num-seqs 128 \ + --max-num-batched-tokens 8192 \ --kv-cache-dtype fp8 \ --trust-remote-code \ --enforce-eager \ + --async-scheduling \ --moe-backend "triton_unfused" \ --no-enable-prefix-caching \ - --max-num-seqs 32 \ --tokenizer-mode deepseek_v4 \ - --tool-call-parser deepseek_v4 \ - --enable-auto-tool-choice \ --reasoning-parser deepseek_v4 > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/perf-changelog.yaml b/perf-changelog.yaml index a7e673eb1..68cc38b91 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2454,3 +2454,16 @@ description: - "Update SGLang image from v0.5.10.post1-cu130 to v0.5.11-cu130" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1329 + +- config-keys: + - dsv4-fp4-mi355x-vllm + description: + - "Reclassify the DeepSeek-V4-Pro mi355x vLLM benchmark from fp8 to fp4 — the deepseek-ai/DeepSeek-V4-Pro checkpoint is FP4+FP8 mixed (FP4 MoE expert weights dominate), matching the sister dsv4-fp4-mi355x-sglang and dsv4-fp4-mi355x-atom entries" + - "Replace config key dsv4-fp8-mi355x-vllm with dsv4-fp4-mi355x-vllm; rename benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh -> dsv4_fp4_mi355x_vllm.sh" + - "Adopt validated MI355X serving recipe from vllm-project/recipes#433 (DeepSeek-V4-Pro, TP=8)" + - "Add env: VLLM_ROCM_USE_AITER_LINEAR=1 (alongside existing VLLM_ROCM_USE_AITER=1)" + - "Add server flags: --distributed-executor-backend mp, --max-num-batched-tokens 8192, --async-scheduling" + - "Tune: --gpu-memory-utilization 
0.90 -> 0.6, --max-num-seqs 32 -> 128" + - "Drop --tool-call-parser deepseek_v4 / --enable-auto-tool-choice (not in recipe; benchmark doesn't exercise tool calling)" + - "Expand search space from conc=1 to conc 4-64 to match dsv4-fp4-mi355x-sglang for vLLM<->SGLang comparability now that max-num-seqs=128 supports it" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX From 5480cf5538f1eef0fe0eed605df05c51a208d829 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Wed, 13 May 2026 16:54:06 -0700 Subject: [PATCH 02/14] Backfill PR #1374 link in perf-changelog --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 68cc38b91..3b4bd812b 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2466,4 +2466,4 @@ - "Tune: --gpu-memory-utilization 0.90 -> 0.6, --max-num-seqs 32 -> 128" - "Drop --tool-call-parser deepseek_v4 / --enable-auto-tool-choice (not in recipe; benchmark doesn't exercise tool calling)" - "Expand search space from conc=1 to conc 4-64 to match dsv4-fp4-mi355x-sglang for vLLM<->SGLang comparability now that max-num-seqs=128 supports it" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1374 From 15097327203312c8b1d0048ac3eac4227a854d93 Mon Sep 17 00:00:00 2001 From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com> Date: Wed, 13 May 2026 17:08:56 -0700 Subject: [PATCH 03/14] Update amd-master.yaml --- .github/configs/amd-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 1935eebe5..424396080 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1626,7 +1626,7 @@ dsv4-fp4-mi355x-sglang: # max-num-batched-tokens=8192, gpu-mem-util=0.6. 
Sweep matches the # sister sglang config (conc 4-64) so vLLM↔SGLang are comparable. dsv4-fp4-mi355x-vllm: - image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post + image: vllm/vllm-openai-rocm:nightly model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: mi355x From f0d3c48ee518c9429a8098f7d180a2f4c2cb275d Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Wed, 13 May 2026 17:33:47 -0700 Subject: [PATCH 04/14] Drop atom-image scaffolding from dsv4_fp4_mi355x_vllm.sh MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the image switched to vllm/vllm-openai-rocm:nightly (which already includes vllm-project/vllm#40871 DSv4 base ROCm support), the rebuild overlay and the workarounds that propped up the rocm/atom base image are no longer needed: * Remove the vLLM PR #40889 clone + editable rebuild block * Remove sanitize_stale_triton_test_metadata() (/triton-test was an atom-image metadata quirk; the new /install/torch...whl bug exposed in run 25833728949 stems entirely from this rebuild path, so dropping the rebuild removes both) * Remove ensure_amdsmi_python() — nightly ships the amdsmi Python wheel * Remove install_tilelang_runtime_deps() — only the rebuilt vLLM needed it * Remove patch_vllm_rocm_platform_detection() — nightly detects ROCm correctly without the amdsmi/torch fallback patches * Remove triton_kernels install — only needed by PR #40889's MoE path * Drop VLLM_TARGET_DEVICE / VLLM_PLUGINS env vars (atom-specific) Keep env vars (VLLM_ROCM_USE_AITER, VLLM_ROCM_USE_AITER_LINEAR, VLLM_ENGINE_READY_TIMEOUT_S for the slow cold-cache load), the recipe vllm serve invocation, and the benchmark/eval driver calls. Also refresh the amd-master.yaml comment block above the entry to drop the rebuild references. Script: 539 -> 94 lines. 
--- .github/configs/amd-master.yaml | 14 +- .../single_node/dsv4_fp4_mi355x_vllm.sh | 468 +----------------- 2 files changed, 17 insertions(+), 465 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 424396080..b99dbbb86 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1611,18 +1611,14 @@ dsv4-fp4-mi355x-sglang: - { tp: 8, dp-attn: true, conc-start: 16, conc-end: 256 } - { tp: 8, dp-attn: false, conc-start: 1, conc-end: 16 } -# vLLM with AITER MLA decode for DSv4 on MI355X (vllm-project/vllm#40889, -# stacked on #40871). Uses the ATOM MI355X image (ROCm 7.2.2, aiter with -# MLA decode, MI355X GPU detection); vLLM is rebuilt from the PR branch -# at runtime by benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh at a -# pinned SHA. Once both PRs merge into a release, switch to a vLLM ROCm -# MI355X image and remove the build step. +# DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm:nightly +# image (DSv4 base ROCm support is in via vllm-project/vllm#40871). # # DeepSeek-V4-Pro is FP4+FP8 mixed (FP4 MoE expert weights, FP8 for the # rest); InferenceX classifies this as fp4 — same as the sister sglang -# and atom DSv4 mi355x entries below. Serving flags follow the validated -# recipe from vllm-project/recipes#433: AITER+AITER_LINEAR, mp executor, -# triton_unfused MoE, async scheduling, max-num-seqs=128, +# and atom DSv4 mi355x entries below. Image and serving flags follow the +# validated recipe from vllm-project/recipes#433: AITER+AITER_LINEAR, mp +# executor, triton_unfused MoE, async scheduling, max-num-seqs=128, # max-num-batched-tokens=8192, gpu-mem-util=0.6. Sweep matches the # sister sglang config (conc 4-64) so vLLM↔SGLang are comparable. 
dsv4-fp4-mi355x-vllm: diff --git a/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh index 66bf0d7c2..922dbc1c8 100755 --- a/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh +++ b/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh @@ -1,24 +1,17 @@ #!/usr/bin/env bash set -eo pipefail -# DeepSeek-V4-Pro on MI355X via vLLM with AITER MLA decode. -# The DeepSeek-V4-Pro checkpoint is mixed-precision FP4+FP8: MoE expert -# weights are stored in FP4 (the dominant storage class for this -# ~960GB model), with attention/norm/router params in FP8 and KV cache -# in FP8 at runtime. InferenceX classifies this as the fp4 variant. +# DeepSeek-V4-Pro on MI355X via vLLM. +# The DeepSeek-V4-Pro checkpoint is mixed-precision FP4+FP8 (FP4 MoE +# expert weights dominate the ~960 GB footprint, FP8 on attention/norm/ +# router, FP8 KV cache at runtime). InferenceX classifies this as the +# fp4 variant. # -# Based on vllm-project/vllm#40889 (AITER-accelerated sparse MLA decode, -# stacked on #40871 which adds base DSv4 ROCm support). -# -# Serving flags follow the validated MI355X recipe from -# vllm-project/recipes#433 (DeepSeek-V4-Pro, TP=8): AITER + AITER_LINEAR, -# triton_unfused MoE, mp executor, async scheduling, max-num-seqs=128, -# max-num-batched-tokens=8192, gpu-mem-util=0.6. -# -# Uses the ATOM MI355X image as the base (ROCm 7.2.2, PyTorch 2.10, -# aiter with MLA decode, MI355X GPU detection). vLLM is rebuilt from -# the PR branch on top. Once both PRs merge into a release, switch to -# a vLLM ROCm MI355X image and remove the build. +# Image and serving flags follow the validated MI355X recipe from +# vllm-project/recipes#433 (DeepSeek-V4-Pro, TP=8), which uses the +# official vllm/vllm-openai-rocm:nightly image. DSv4 base ROCm support +# (vllm-project/vllm#40871) is already in that image, so no source +# rebuild is needed. 
source "$(dirname "$0")/../benchmark_lib.sh" @@ -44,446 +37,9 @@ fi export VLLM_ROCM_USE_AITER=1 export VLLM_ROCM_USE_AITER_LINEAR=1 -export VLLM_TARGET_DEVICE=rocm +# Loading the ~960 GB checkpoint into KV/weights can exceed the default +# engine-ready timeout on first run from cold HF cache. export VLLM_ENGINE_READY_TIMEOUT_S=3600 -export VLLM_PLUGINS="" - -# Build vLLM from PR #40889 branch (includes #40871 base). The ATOM -# image provides ROCm 7.2.2 toolchain (hipcc, cmake, ninja, torch, -# aiter with MLA decode); we rebuild vLLM in-place. -# Bump VLLM_PR_SHA when the PR moves. -VLLM_PR_SHA="b3a4a44f01e565219dd353611712d0ea2e8d11ee" -VLLM_PR_DIR="/tmp/vllm-pr40889" - -sanitize_stale_triton_test_metadata() { - # The ATOM image was built with local /triton-test packages and the final - # layer removed that directory. Pip's resolver follows those metadata refs - # when installing unrelated deps, so remove only the stale metadata lines. - python3 - <<'PY' -import importlib.metadata -import site -import sys -from pathlib import Path - -STALE = "/triton-test" -metadata_files = ("direct_url.json", "METADATA", "requires.txt") -changed = False - -for dist in importlib.metadata.distributions(): - dist_path = Path(str(dist._path)) - name = dist.metadata.get("Name") or dist_path.name - for relpath in metadata_files: - path = dist_path / relpath - if not path.exists(): - continue - text = path.read_text(errors="replace") - if STALE not in text: - continue - changed = True - if relpath == "direct_url.json": - path.unlink() - print(f"Removed stale editable metadata for {name}: {path}") - continue - lines = text.splitlines(keepends=True) - kept = [line for line in lines if STALE not in line] - path.write_text("".join(kept)) - print( - f"Removed {len(lines) - len(kept)} stale {STALE} metadata " - f"line(s) for {name}: {path}" - ) - -for dist in importlib.metadata.distributions(): - dist_path = Path(str(dist._path)) - name = (dist.metadata.get("Name") or 
dist_path.name).lower().replace("_", "-") - if name != "torch": - continue - for relpath in ("METADATA", "requires.txt"): - path = dist_path / relpath - if not path.exists(): - continue - lines = path.read_text(errors="replace").splitlines(keepends=True) - kept = [] - for line in lines: - normalized = line.strip().lower() - is_triton_req = ( - relpath == "METADATA" - and normalized.startswith("requires-dist: triton") - ) or ( - relpath == "requires.txt" - and normalized.startswith("triton") - ) - if not is_triton_req: - kept.append(line) - if len(kept) == len(lines): - continue - changed = True - path.write_text("".join(kept)) - print( - f"Removed {len(lines) - len(kept)} torch triton dependency " - f"metadata line(s): {path}" - ) - -roots = set() -for getter in (site.getsitepackages,): - try: - roots.update(Path(p) for p in getter()) - except Exception: - pass -try: - roots.add(Path(site.getusersitepackages())) -except Exception: - pass -roots.update(Path(p) for p in sys.path if "site-packages" in p or "dist-packages" in p) - -for root in roots: - if not root.exists(): - continue - for pattern in ("*.egg-link", "*.pth"): - for path in root.glob(pattern): - text = path.read_text(errors="replace") - if STALE not in text: - continue - changed = True - kept = [line for line in text.splitlines(keepends=True) if STALE not in line] - if kept: - path.write_text("".join(kept)) - print(f"Removed stale {STALE} line(s): {path}") - else: - path.unlink() - print(f"Removed stale {STALE} link file: {path}") - -remaining = [] -for dist in importlib.metadata.distributions(): - dist_path = Path(str(dist._path)) - for relpath in metadata_files: - path = dist_path / relpath - if path.exists() and STALE in path.read_text(errors="replace"): - remaining.append(str(path)) -for root in roots: - if root.exists(): - for pattern in ("*.egg-link", "*.pth"): - for path in root.glob(pattern): - if STALE in path.read_text(errors="replace"): - remaining.append(str(path)) - -if remaining: - 
print("Stale /triton-test metadata remains:") - for path in remaining: - print(f" {path}") - raise SystemExit(1) -if not changed: - print("No stale /triton-test package metadata found.") -PY -} - -ensure_amdsmi_python() { - if python3 - <<'PY' -import amdsmi - -print(f"amdsmi already importable from {amdsmi.__file__}") -PY - then - return - fi - - # ROCm ships the Python binding under /opt/rocm/share/amd_smi. Prefer - # that over PyPI so the Python wrapper matches the image's ROCm runtime. - if [ -d /opt/rocm/share/amd_smi ]; then - if ! python3 -m pip install --no-deps /opt/rocm/share/amd_smi; then - python3 -m pip install --no-deps amdsmi - fi - else - python3 -m pip install --no-deps amdsmi - fi - - python3 - <<'PY' -import amdsmi - -print(f"amdsmi installed from {amdsmi.__file__}") -PY -} - -install_tilelang_runtime_deps() { - # DeepSeek-V4 mHC kernels import tilelang lazily during the vLLM profile - # run. vLLM's ROCm requirements do not include it yet, while the unpinned - # package can fall back to a source build or try to resolve CUDA torch - # dependencies. Use binary wheels only, skip dependency resolution, and - # install the small direct runtime deps we need. TileLang 0.1.9 is required - # for T.pdl_sync used by mhc.py. Do not install torch-c-dlpack-ext on ROCm; - # its wheel expects CUDA libraries. - python3 -m pip install \ - -c /tmp/rocm-pins.txt \ - --no-deps \ - --only-binary=:all: \ - apache-tvm-ffi==0.1.9 \ - z3-solver==4.15.4.0 \ - tilelang==0.1.9 - - python3 - <<'PY' -import tilelang -import tilelang.language as T - -print(f"tilelang {tilelang.__version__} imported from {tilelang.__file__}") -if not hasattr(T, "pdl_sync"): - raise SystemExit("tilelang.language.pdl_sync is required by vLLM mhc.py") -PY -} - -patch_vllm_rocm_platform_detection() { - # vLLM detects ROCm with amdsmi. On this MI355X/ATOM stack, amdsmi can be - # unavailable or return no handles even when PyTorch sees HIP devices. 
Fall - # back to torch ROCm visibility so current_platform is RocmPlatform. Also - # avoid rocm.py's warning_once path during module import; it imports - # distributed modules while current_platform is still being initialized. - python3 - <<'PY' -from pathlib import Path - -path = Path("vllm/platforms/__init__.py") -text = path.read_text() -start = text.index("def rocm_platform_plugin() -> str | None:") -end = text.index("\n\ndef xpu_platform_plugin() -> str | None:", start) -new = '''def rocm_platform_plugin() -> str | None: - is_rocm = False - logger.debug("Checking if ROCm platform is available.") - try: - import amdsmi - - amdsmi.amdsmi_init() - try: - if len(amdsmi.amdsmi_get_processor_handles()) > 0: - is_rocm = True - logger.debug("Confirmed ROCm platform is available via amdsmi.") - else: - logger.debug("ROCm platform is not available because no GPU is found by amdsmi.") - finally: - amdsmi.amdsmi_shut_down() - except Exception as e: - logger.debug("ROCm platform is not available via amdsmi because: %s", str(e)) - - if not is_rocm: - try: - import torch - - is_rocm = ( - torch.version.hip is not None - and torch.cuda.is_available() - and torch.cuda.device_count() > 0 - ) - if is_rocm: - logger.debug("Confirmed ROCm platform is available via torch HIP.") - else: - logger.debug("ROCm platform is not available via torch HIP.") - except Exception as e: - logger.debug("ROCm torch HIP fallback failed because: %s", str(e)) - - return "vllm.platforms.rocm.RocmPlatform" if is_rocm else None -''' -path.write_text(text[:start] + new + text[end:]) -print(f"Patched ROCm platform detection fallback in {path}") - -path = Path("vllm/platforms/rocm.py") -text = path.read_text() -start = text.index("def _get_gcn_arch() -> str:") -end = text.index("\n\n# Resolve once at module load.", start) -new = '''def _get_gcn_arch() -> str: - """ - Get GCN arch via amdsmi when available, otherwise use torch.cuda. 
- Avoid warning_once during module import because it can re-enter - vllm.platforms.current_platform initialization. - """ - try: - return _query_gcn_arch_from_amdsmi() - except Exception as e: - logger.debug("Failed to get GCN arch via amdsmi: %s", e) - - try: - props = torch.cuda.get_device_properties(0) - gcn_arch = getattr(props, "gcnArchName", "") - if gcn_arch: - logger.debug("Got GCN arch via torch.cuda: %s", gcn_arch) - return gcn_arch - except Exception as e: - logger.debug("Failed to get GCN arch via torch.cuda: %s", e) - - # This benchmark is MI355X-only. Keep a deterministic fallback instead of - # failing ROCm platform import when amdsmi is absent. - logger.warning("Falling back to gfx950 for MI355X ROCm platform detection.") - return "gfx950" -''' -path.write_text(text[:start] + new + text[end:]) -print(f"Patched ROCm GCN arch fallback in {path}") - -text = path.read_text() - -def replace_block(text: str, start_marker: str, end_marker: str, replacement: str) -> str: - start = text.index(start_marker) - end = text.index(end_marker, start) - return text[:start] + replacement + text[end:] - -text = replace_block( - text, - " @classmethod\n @with_amdsmi_context\n def is_fully_connected", - " @classmethod\n @with_amdsmi_context\n @lru_cache(maxsize=8)\n def get_device_name", - ''' @classmethod - def is_fully_connected(cls, physical_device_ids: list[int]) -> bool: - """ - Query if the set of GPUs are fully connected by XGMI (1 hop). - Fall back to disabling custom allreduce when amdsmi is unavailable. - """ - if "amdsmi_init" not in globals(): - logger.warning( - "amdsmi is unavailable; treating ROCm GPU topology as not " - "fully connected for custom allreduce." 
- ) - return False - - try: - amdsmi_init() - try: - handles = [ - amdsmi_get_processor_handles()[i] for i in physical_device_ids - ] - for i, handle in enumerate(handles): - for j, peer_handle in enumerate(handles): - if i < j: - link_type = amdsmi_topo_get_link_type( - handle, peer_handle - ) - # type is 2 for XGMI - if link_type["hops"] != 1 or link_type["type"] != 2: - return False - return True - finally: - amdsmi_shut_down() - except Exception as error: - logger.warning( - "AMD 1 hop XGMI detection failed; treating ROCm GPU topology " - "as not fully connected for custom allreduce.", - exc_info=error, - ) - return False - -''', -) - -text = replace_block( - text, - " @classmethod\n @with_amdsmi_context\n @lru_cache(maxsize=8)\n def get_device_name", - " @classmethod\n @with_amdsmi_context\n def get_device_uuid", - ''' @classmethod - @lru_cache(maxsize=8) - def get_device_name(cls, device_id: int = 0) -> str: - if "amdsmi_init" in globals(): - try: - amdsmi_init() - try: - physical_device_id = cls.device_id_to_physical_device_id(device_id) - handle = amdsmi_get_processor_handles()[physical_device_id] - asic_info = amdsmi_get_gpu_asic_info(handle) - asic_info_device_id: str = asic_info["device_id"] - if asic_info_device_id in _ROCM_DEVICE_ID_NAME_MAP: - return _ROCM_DEVICE_ID_NAME_MAP[asic_info_device_id] - return asic_info["market_name"] - finally: - amdsmi_shut_down() - except Exception as error: - logger.debug( - "amdsmi device name query failed; falling back to torch.cuda.", - exc_info=error, - ) - - return torch.cuda.get_device_name(device_id) - -''', -) - -text = replace_block( - text, - " @classmethod\n @with_amdsmi_context\n def get_device_uuid", - " @classmethod\n def get_device_total_memory", - ''' @classmethod - def get_device_uuid(cls, device_id: int = 0) -> str: - if "amdsmi_init" in globals(): - try: - amdsmi_init() - try: - device = amdsmi_get_processor_handles()[device_id] - return amdsmi_get_gpu_device_uuid(device) - finally: - 
amdsmi_shut_down() - except Exception as error: - logger.debug( - "amdsmi device uuid query failed; falling back to torch.cuda.", - exc_info=error, - ) - - try: - props = torch.cuda.get_device_properties(device_id) - device_uuid = getattr(props, "uuid", None) - if device_uuid: - return str(device_uuid) - except Exception as error: - logger.debug("torch.cuda device uuid fallback failed.", exc_info=error) - return f"cuda:{device_id}" - -''', -) - -path.write_text(text) -print(f"Patched ROCm amdsmi runtime fallbacks in {path}") -PY -} - -check_vllm_rocm_platform_detection() { - VLLM_LOGGING_LEVEL=DEBUG python3 - <<'PY' -import torch -from vllm.platforms import current_platform - -print(f"torch.version.hip={torch.version.hip}") -print(f"torch.cuda.is_available={torch.cuda.is_available()}") -print(f"torch.cuda.device_count={torch.cuda.device_count()}") -print( - "vllm.current_platform=" - f"{current_platform.__class__.__module__}.{current_platform.__class__.__name__} " - f"device_type={current_platform.device_type}" -) -if not current_platform.is_rocm(): - raise SystemExit("vLLM did not detect ROCm platform") -PY -} - -if [ ! -d "$VLLM_PR_DIR/.git" ]; then - git clone --filter=blob:none https://github.com/ChuanLi1101/vllm.git "$VLLM_PR_DIR" -fi -( - cd "$VLLM_PR_DIR" - git fetch --depth=1 origin "$VLLM_PR_SHA" 2>/dev/null \ - || git fetch --depth=1 origin rocm/aiter-mla-dsv4-decode - git checkout --force "$VLLM_PR_SHA" - test "$(git rev-parse HEAD)" = "$VLLM_PR_SHA" - - patch_vllm_rocm_platform_detection - sanitize_stale_triton_test_metadata - ensure_amdsmi_python - - # Pin ROCm packages so pip's resolver can't replace them with - # CUDA builds from PyPI (torch, torchvision, aiter, triton, etc.). 
- pip freeze | grep -iE '^(torch|aiter|triton|mori)' > /tmp/rocm-pins.txt - if grep -n "/triton-test" /tmp/rocm-pins.txt; then - echo "Stale /triton-test reference found in ROCm constraints" - exit 1 - fi - - pip install setuptools-scm - # Install vLLM code + build C++ extensions (no deps to avoid touching ROCm) - pip install --no-build-isolation --no-deps --force-reinstall -e . - # Install runtime deps separately, constrained to keep ROCm packages intact. - pip install -c /tmp/rocm-pins.txt -r requirements/rocm.txt - install_tilelang_runtime_deps -) - -python3 -c "import vllm; print(f'vLLM {vllm.__version__} from {vllm.__path__[0]}')" -check_vllm_rocm_platform_detection SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} From 5899b9e2409de930a6fa9e052a38fa94e794375b Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Wed, 13 May 2026 17:46:41 -0700 Subject: [PATCH 05/14] Drop --moe-backend triton_unfused (not in vLLM main) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The recipe (vllm-project/recipes#433) specifies --moe-backend triton_unfused, but that choice was never accepted into vLLM main — likely it lived on the #40871 PR branch and was renamed/removed before merge. In vllm/vllm-openai-rocm:nightly (which the recipe itself uses), the legal choices are: aiter, auto, cutlass, deep_gemm, emulation, flashinfer_cutedsl, flashinfer_cutlass, flashinfer_trtllm, marlin, triton. Drop the flag entirely and let vLLM's `auto` selector pick the backend. With VLLM_ROCM_USE_AITER=1 set, that resolves to the AITER MoE path on ROCm — the same kernel family the recipe was steering toward. All other remaining flags and env vars verified valid in vLLM 0.20.2. 
--- .github/configs/amd-master.yaml | 8 +++++--- benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh | 7 ++++++- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index b99dbbb86..282af88b9 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1618,9 +1618,11 @@ dsv4-fp4-mi355x-sglang: # rest); InferenceX classifies this as fp4 — same as the sister sglang # and atom DSv4 mi355x entries below. Image and serving flags follow the # validated recipe from vllm-project/recipes#433: AITER+AITER_LINEAR, mp -# executor, triton_unfused MoE, async scheduling, max-num-seqs=128, -# max-num-batched-tokens=8192, gpu-mem-util=0.6. Sweep matches the -# sister sglang config (conc 4-64) so vLLM↔SGLang are comparable. +# executor, async scheduling, max-num-seqs=128, max-num-batched-tokens=8192, +# gpu-mem-util=0.6. (Recipe's --moe-backend triton_unfused was never +# accepted into vLLM main, so we let auto pick the AITER MoE path.) Sweep +# matches the sister sglang config (conc 4-64) so vLLM↔SGLang are +# comparable. dsv4-fp4-mi355x-vllm: image: vllm/vllm-openai-rocm:nightly model: deepseek-ai/DeepSeek-V4-Pro diff --git a/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh index 922dbc1c8..d8d4b9133 100755 --- a/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh +++ b/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh @@ -12,6 +12,12 @@ set -eo pipefail # official vllm/vllm-openai-rocm:nightly image. DSv4 base ROCm support # (vllm-project/vllm#40871) is already in that image, so no source # rebuild is needed. +# +# Note: the recipe specifies --moe-backend triton_unfused, but that +# choice was never accepted into vLLM main (likely added on the #40871 +# PR branch and renamed before merge). 
Leaving --moe-backend unset so +# vLLM's auto selector picks the right path; with VLLM_ROCM_USE_AITER=1 +# set, that resolves to the AITER MoE backend on ROCm. source "$(dirname "$0")/../benchmark_lib.sh" @@ -63,7 +69,6 @@ vllm serve $MODEL --port $PORT \ --trust-remote-code \ --enforce-eager \ --async-scheduling \ - --moe-backend "triton_unfused" \ --no-enable-prefix-caching \ --tokenizer-mode deepseek_v4 \ --reasoning-parser deepseek_v4 > $SERVER_LOG 2>&1 & From f9791cbb33b44944c3a0b5b76661faecb9d78474 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Wed, 13 May 2026 18:04:09 -0700 Subject: [PATCH 06/14] Pin nightly-dcacdf9a... to bypass runner squashfs cache staleness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous run errored with: Model architectures ['DeepseekV4ForCausalLM'] are not supported for now. Supported architectures: dict_keys([..., 'DeepseekV32ForCausalLM', ...]) even though vllm-project/vllm#40871 (which registers DeepseekV4ForCausalLM) merged on 2026-05-05 and vllm/vllm-openai-rocm:nightly has been bumped multiple times since. Root cause: runners/launch_mi355x-amds.sh caches enroot squashfs files keyed on the image string and short-circuits re-import if the squash already exists. The runner's cached squash for ':nightly' predates the #40871 merge (the container reported vllm 0.19.2rc1.dev212 ~ Apr 25), so docker hub updates never reached the runner. Switch to an immutable digest-suffixed tag — the squash cache key now changes whenever we bump, forcing a fresh import. Picking nightly-dcacdf9a8860a86401127d1c8f93ebf3cfbfd026 (2026-05-13, most recent at time of pin), which is well past the #40871 merge. Also update the script header and yaml comment block to document the caching pitfall so the next bumper doesn't revert to ':nightly'. 
--- .github/configs/amd-master.yaml | 14 +++++++++++--- benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh | 10 ++++++---- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 282af88b9..1e2e1e878 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1611,8 +1611,16 @@ dsv4-fp4-mi355x-sglang: - { tp: 8, dp-attn: true, conc-start: 16, conc-end: 256 } - { tp: 8, dp-attn: false, conc-start: 1, conc-end: 16 } -# DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm:nightly -# image (DSv4 base ROCm support is in via vllm-project/vllm#40871). +# DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm +# nightly image. DSv4 base ROCm support (vllm-project/vllm#40871) merged +# on 2026-05-05, so any nightly built after that includes the +# DeepseekV4ForCausalLM model class. +# +# IMPORTANT: pin to a commit-SHA-suffixed nightly tag rather than the +# floating `:nightly`. launch_mi355x-amds.sh caches enroot squashfs +# files keyed on the image string and short-circuits re-import if the +# file already exists, so the floating tag silently keeps a stale build +# even after Docker Hub updates `:nightly`. # # DeepSeek-V4-Pro is FP4+FP8 mixed (FP4 MoE expert weights, FP8 for the # rest); InferenceX classifies this as fp4 — same as the sister sglang @@ -1624,7 +1632,7 @@ dsv4-fp4-mi355x-sglang: # matches the sister sglang config (conc 4-64) so vLLM↔SGLang are # comparable.
dsv4-fp4-mi355x-vllm: - image: vllm/vllm-openai-rocm:nightly + image: vllm/vllm-openai-rocm:nightly-dcacdf9a8860a86401127d1c8f93ebf3cfbfd026 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: mi355x diff --git a/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh index d8d4b9133..87c59181e 100755 --- a/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh +++ b/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh @@ -8,10 +8,12 @@ set -eo pipefail # fp4 variant. # # Image and serving flags follow the validated MI355X recipe from -# vllm-project/recipes#433 (DeepSeek-V4-Pro, TP=8), which uses the -# official vllm/vllm-openai-rocm:nightly image. DSv4 base ROCm support -# (vllm-project/vllm#40871) is already in that image, so no source -# rebuild is needed. +# vllm-project/recipes#433 (DeepSeek-V4-Pro, TP=8). DSv4 base ROCm +# support (vllm-project/vllm#40871) merged into vLLM main on 2026-05-05, +# so any vllm/vllm-openai-rocm nightly built after that date includes +# the DeepseekV4ForCausalLM model class. The amd-master.yaml entry pins +# a digest-suffixed nightly tag (not the floating :nightly) to bypass +# the runner's squashfs-cache, which otherwise keeps a stale build. # # Note: the recipe specifies --moe-backend triton_unfused, but that # choice was never accepted into vLLM main (likely added on the #40871 From 606c5be274fc2c1651b171d03b3fd64d96540611 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Wed, 13 May 2026 18:19:03 -0700 Subject: [PATCH 07/14] Restore --moe-backend triton_unfused (required for FP4 expert format) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I dropped --moe-backend triton_unfused based on a stale error message ("invalid choice ... choose from aiter, auto, ...") from the previous run, but that error came from the cached squashfs of an April 25 build that pre-dated #40871. 
The pinned nightly-dcacdf9a8860a8640 DOES have triton_unfused in MoEBackend — verified by reading vllm/config/kernel.py at that exact commit on GitHub. Without --moe-backend triton_unfused, vLLM's auto selector picks a backend that doesn't register w13_weight_scale / w2_weight_scale on the FP4 expert layers, so safetensors loading throws: KeyError: 'layers.0.ffn.experts.w13_weight_scale' at vllm/model_executor/models/deepseek_v4.py:1492 This matches the recipe (vllm-project/recipes#433) line-for-line now, with the only intentional deviations being InferenceX conventions: * --max-model-len $MAX_MODEL_LEN (sized to ISL+OSL+256) * --no-enable-prefix-caching (fair benchmark comparisons) * VLLM_ENGINE_READY_TIMEOUT_S=3600 (cold HF-cache tolerance) None of those interact with weight loading; they were not implicated in either failure. --- .github/configs/amd-master.yaml | 9 ++++----- benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh | 12 +++++++----- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index b314a250d..99c3052ca 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1626,11 +1626,10 @@ dsv4-fp4-mi355x-sglang: # rest); InferenceX classifies this as fp4 — same as the sister sglang # and atom DSv4 mi355x entries below. Image and serving flags follow the # validated recipe from vllm-project/recipes#433: AITER+AITER_LINEAR, mp -# executor, async scheduling, max-num-seqs=128, max-num-batched-tokens=8192, -# gpu-mem-util=0.6. (Recipe's --moe-backend triton_unfused was never -# accepted into vLLM main, so we let auto pick the AITER MoE path.) Sweep -# matches the sister sglang config (conc 4-64) so vLLM↔SGLang are -# comparable. +# executor, triton_unfused MoE (required for the FP4 expert format), +# async scheduling, max-num-seqs=128, max-num-batched-tokens=8192, +# gpu-mem-util=0.6. 
Sweep matches the sister sglang config (conc 4-64) +# so vLLM↔SGLang are comparable. dsv4-fp4-mi355x-vllm: image: vllm/vllm-openai-rocm:nightly-dcacdf9a8860a86401127d1c8f93ebf3cfbfd026 model: deepseek-ai/DeepSeek-V4-Pro diff --git a/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh index 87c59181e..d10ed881e 100755 --- a/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh +++ b/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh @@ -15,11 +15,12 @@ set -eo pipefail # a digest-suffixed nightly tag (not the floating :nightly) to bypass # the runner's squashfs-cache, which otherwise keeps a stale build. # -# Note: the recipe specifies --moe-backend triton_unfused, but that -# choice was never accepted into vLLM main (likely added on the #40871 -# PR branch and renamed before merge). Leaving --moe-backend unset so -# vLLM's auto selector picks the right path; with VLLM_ROCM_USE_AITER=1 -# set, that resolves to the AITER MoE backend on ROCm. +# --moe-backend triton_unfused is required for the FP4 MoE expert +# weight format used by deepseek-ai/DeepSeek-V4-Pro. Letting --moe-backend +# default to auto picks a backend that doesn't register the FP4 scale +# parameters (w13_weight_scale / w2_weight_scale), so safetensors +# loading raises KeyError. The choice was added by #40871 alongside the +# model class; the pinned nightly-dcacdf9a includes it. 
source "$(dirname "$0")/../benchmark_lib.sh" @@ -71,6 +72,7 @@ vllm serve $MODEL --port $PORT \ --trust-remote-code \ --enforce-eager \ --async-scheduling \ + --moe-backend triton_unfused \ --no-enable-prefix-caching \ --tokenizer-mode deepseek_v4 \ --reasoning-parser deepseek_v4 > $SERVER_LOG 2>&1 & From 413df52caf4f63a73713c98726492f01b01d987c Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Wed, 13 May 2026 18:56:48 -0700 Subject: [PATCH 08/14] Add --quantization deepseek_v4_fp8 to force MXFP4 dispatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous run errored with: ValueError: moe_backend='triton_unfused' is not supported for FP8 MoE. Expected one of ['triton','deep_gemm','cutlass','flashinfer_trtllm', 'flashinfer_cutlass','marlin','aiter'] even though the DeepSeek-V4-Pro config explicitly declares `expert_dtype: "fp4"`. The cause is vLLM's auto-detection of the DSv4-aware quant config: DeepseekV4FP8Config.override_quantization_method returns "deepseek_v4_fp8" only when: hf_quant_cfg.quant_method in ("fp8","deepseek_v4_fp8") AND (hf_config.model_type == "deepseek_v4" OR user_quant == "deepseek_v4_fp8") The HF config has model_type=deepseek_v4, but the sister SGLang script (dsv4_fp8_mi355x.sh) documents that the bundled transformers in these container images does NOT recognize that model_type and the cached config has to be patched. When the auto-detection silently fails, vLLM falls back to plain Fp8Config, which: * Treats the FusedMoE layer as FP8 block-quantized (registers weight_scale_inv params instead of FP4 w13_weight_scale / w2_weight_scale → KeyError on load_weights — the prior failure) * Routes through select_fp8_moe_backend, which doesn't accept triton_unfused as a valid choice (the current failure) Pass --quantization deepseek_v4_fp8 to take the user_quant branch explicitly and bypass the model_type check entirely. 
This is the only remaining recipe-vs-runtime deviation needed to make recipes#433 work on this container; document the why in the script header. --- benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh index d10ed881e..cdeeaec2a 100755 --- a/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh +++ b/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh @@ -21,6 +21,20 @@ set -eo pipefail # parameters (w13_weight_scale / w2_weight_scale), so safetensors # loading raises KeyError. The choice was added by #40871 alongside the # model class; the pinned nightly-dcacdf9a includes it. +# +# --quantization deepseek_v4_fp8 is required to make vLLM route the +# MoE through the FP4-aware quant config (DeepseekV4FP8Config) and +# honor `expert_dtype: "fp4"` from the checkpoint config. The recipe +# omits this flag because it relies on auto-detection via +# `model_type == "deepseek_v4"`. That auto-path is fragile in our +# container — the SGLang sister script (dsv4_fp8_mi355x.sh) documents +# that the bundled transformers doesn't recognize the deepseek_v4 +# model_type and the cached config has to be patched. Whenever the +# auto-detection silently misses, vLLM falls back to plain Fp8Config, +# which treats MoE as FP8 and rejects triton_unfused. Passing +# --quantization deepseek_v4_fp8 satisfies the explicit-user branch in +# DeepseekV4FP8Config.override_quantization_method and bypasses the +# model_type check entirely. 
source "$(dirname "$0")/../benchmark_lib.sh" @@ -72,6 +86,7 @@ vllm serve $MODEL --port $PORT \ --trust-remote-code \ --enforce-eager \ --async-scheduling \ + --quantization deepseek_v4_fp8 \ --moe-backend triton_unfused \ --no-enable-prefix-caching \ --tokenizer-mode deepseek_v4 \ From 3c761bf1a92c83359d1010de4bd17781541d0273 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sun, 17 May 2026 20:10:15 -0700 Subject: [PATCH 09/14] Update image --- .github/configs/amd-master.yaml | 2 +- .../single_node/dsv4_fp4_mi355x_vllm.sh | 31 ++++++------------- perf-changelog.yaml | 6 ++++ 3 files changed, 16 insertions(+), 23 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 14766d73e..73a0cfd51 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1770,7 +1770,7 @@ dsv4-fp4-mi355x-sglang: # gpu-mem-util=0.6. Sweep matches the sister sglang config (conc 4-64) # so vLLM↔SGLang are comparable. dsv4-fp4-mi355x-vllm: - image: vllm/vllm-openai-rocm:nightly-dcacdf9a8860a86401127d1c8f93ebf3cfbfd026 + image: vllm/vllm-openai-rocm:nightly-ff712f6447093d07747c88680b9d006b119f5890 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: mi355x diff --git a/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh index cdeeaec2a..07b8a15a9 100755 --- a/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh +++ b/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh @@ -7,34 +7,21 @@ set -eo pipefail # router, FP8 KV cache at runtime). InferenceX classifies this as the # fp4 variant. # -# Image and serving flags follow the validated MI355X recipe from -# vllm-project/recipes#433 (DeepSeek-V4-Pro, TP=8). DSv4 base ROCm -# support (vllm-project/vllm#40871) merged into vLLM main on 2026-05-05, -# so any vllm/vllm-openai-rocm nightly built after that date includes -# the DeepseekV4ForCausalLM model class. 
The amd-master.yaml entry pins -# a digest-suffixed nightly tag (not the floating :nightly) to bypass -# the runner's squashfs-cache, which otherwise keeps a stale build. +# Serving flags follow the validated MI355X recipe from +# vllm-project/recipes#433 (DeepSeek-V4-Pro, TP=8). Image-pin details +# live in amd-master.yaml. # # --moe-backend triton_unfused is required for the FP4 MoE expert # weight format used by deepseek-ai/DeepSeek-V4-Pro. Letting --moe-backend # default to auto picks a backend that doesn't register the FP4 scale # parameters (w13_weight_scale / w2_weight_scale), so safetensors -# loading raises KeyError. The choice was added by #40871 alongside the -# model class; the pinned nightly-dcacdf9a includes it. +# loading raises KeyError. # -# --quantization deepseek_v4_fp8 is required to make vLLM route the -# MoE through the FP4-aware quant config (DeepseekV4FP8Config) and -# honor `expert_dtype: "fp4"` from the checkpoint config. The recipe -# omits this flag because it relies on auto-detection via -# `model_type == "deepseek_v4"`. That auto-path is fragile in our -# container — the SGLang sister script (dsv4_fp8_mi355x.sh) documents -# that the bundled transformers doesn't recognize the deepseek_v4 -# model_type and the cached config has to be patched. Whenever the -# auto-detection silently misses, vLLM falls back to plain Fp8Config, -# which treats MoE as FP8 and rejects triton_unfused. Passing -# --quantization deepseek_v4_fp8 satisfies the explicit-user branch in -# DeepseekV4FP8Config.override_quantization_method and bypasses the -# model_type check entirely. +# --quantization deepseek_v4_fp8 forces the FP4-aware +# DeepseekV4FP8Config instead of relying on model_type auto-detection. +# That keeps the mixed-precision checkpoint on the intended MoE path +# and avoids falling back to plain Fp8Config, which rejects +# triton_unfused. 
source "$(dirname "$0")/../benchmark_lib.sh" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 69b819fbe..f73265698 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2676,3 +2676,9 @@ - "Update SGLang image from v0.5.11-cu130 to v0.5.12-cu130" - "Temporarily disable agentic-coding scenario (blocked by e2e-tests.yml artifact-name mismatch)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1415 + +- config-keys: + - dsv4-fp4-mi355x-vllm + description: + - "Update vLLM ROCm image from nightly-dcacdf9a8860a86401127d1c8f93ebf3cfbfd026 to nightly-ff712f6447093d07747c88680b9d006b119f5890" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1374 From 7f4874ea1c429315faaff99c27475f228f768850 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Mon, 18 May 2026 13:35:44 -0700 Subject: [PATCH 10/14] Update DSv4 MI355X vLLM ROCm nightly image --- .github/configs/amd-master.yaml | 2 +- perf-changelog.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 73a0cfd51..d60b6238d 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1770,7 +1770,7 @@ dsv4-fp4-mi355x-sglang: # gpu-mem-util=0.6. Sweep matches the sister sglang config (conc 4-64) # so vLLM↔SGLang are comparable. 
dsv4-fp4-mi355x-vllm: - image: vllm/vllm-openai-rocm:nightly-ff712f6447093d07747c88680b9d006b119f5890 + image: vllm/vllm-openai-rocm:nightly-b50646e5effd7cb5884cd96fdff4c53c18521198 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: mi355x diff --git a/perf-changelog.yaml b/perf-changelog.yaml index f73265698..3201a3d14 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2680,5 +2680,5 @@ - config-keys: - dsv4-fp4-mi355x-vllm description: - - "Update vLLM ROCm image from nightly-dcacdf9a8860a86401127d1c8f93ebf3cfbfd026 to nightly-ff712f6447093d07747c88680b9d006b119f5890" + - "Update vLLM ROCm image from nightly-dcacdf9a8860a86401127d1c8f93ebf3cfbfd026 to nightly-b50646e5effd7cb5884cd96fdff4c53c18521198" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1374 From ec268e13e6bd3f4be57b097ac8175608f5ee6e39 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 19 May 2026 08:59:00 -0700 Subject: [PATCH 11/14] Add MI355X DSv4 vLLM DEP validation probes --- .github/configs/amd-master.yaml | 6 ++++-- .../single_node/dsv4_fp4_mi355x_vllm.sh | 19 ++++++++++++++++--- perf-changelog.yaml | 6 ++++++ 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 43305d60e..23d05b726 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1773,8 +1773,8 @@ dsv4-fp4-mi355x-sglang: # validated recipe from vllm-project/recipes#433: AITER+AITER_LINEAR, mp # executor, triton_unfused MoE (required for the FP4 expert format), # async scheduling, max-num-seqs=128, max-num-batched-tokens=8192, -# gpu-mem-util=0.6. Sweep matches the sister sglang config (conc 4-64) -# so vLLM↔SGLang are comparable. +# gpu-mem-util=0.6. TP8 sweeps conc 4-64; DEP8 has a single conc=64 +# probe to validate the ROCm DP+EP path. 
dsv4-fp4-mi355x-vllm: image: vllm/vllm-openai-rocm:nightly-b50646e5effd7cb5884cd96fdff4c53c18521198 model: deepseek-ai/DeepSeek-V4-Pro @@ -1789,10 +1789,12 @@ dsv4-fp4-mi355x-vllm: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 } - isl: 8192 osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 } # Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650). # PR1 of the ATOM DSv4 series still uses torch sparse-attention fallbacks diff --git a/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh index 07b8a15a9..2502cfdc9 100755 --- a/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh +++ b/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh @@ -8,8 +8,9 @@ set -eo pipefail # fp4 variant. # # Serving flags follow the validated MI355X recipe from -# vllm-project/recipes#433 (DeepSeek-V4-Pro, TP=8). Image-pin details -# live in amd-master.yaml. +# vllm-project/recipes#433 (DeepSeek-V4-Pro, TP=8). DEP probes reuse the +# same ROCm recipe while switching parallelism to vLLM's DP+EP form. +# Image-pin details live in amd-master.yaml. # # --moe-backend triton_unfused is required for the FP4 MoE expert # weight format used by deepseek-ai/DeepSeek-V4-Pro. 
Letting --moe-backend @@ -28,6 +29,7 @@ source "$(dirname "$0")/../benchmark_lib.sh" check_env_vars \ MODEL \ TP \ + DP_ATTENTION \ CONC \ ISL \ OSL \ @@ -61,9 +63,20 @@ fi start_gpu_monitor +PARALLEL_ARGS=(--tensor-parallel-size "$TP" --data-parallel-size 1) +if [ "${DP_ATTENTION}" = "true" ]; then + PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP") +fi + +EP_ARGS=() +if [ "${EP_SIZE:-1}" -gt 1 ]; then + EP_ARGS=(--enable-expert-parallel) +fi + set -x vllm serve $MODEL --port $PORT \ - --tensor-parallel-size $TP \ + "${PARALLEL_ARGS[@]}" \ + "${EP_ARGS[@]}" \ --distributed-executor-backend mp \ --gpu-memory-utilization 0.6 \ --max-model-len $MAX_MODEL_LEN \ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index dba453951..c29c8669f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2957,3 +2957,9 @@ description: - "Update SGLang ROCm image from v0.5.11/v0.5.10rc0 to v0.5.12-rocm720-mi35x-20260517" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1440 + +- config-keys: + - dsv4-fp4-mi355x-vllm + description: + - "Add DEP8 dp-attn=true validation probes at conc=64 for 1k1k and 8k1k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1374 From 5ed629e8e5402e79c4a2ee37d7db25996b160cc2 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 19 May 2026 09:18:40 -0700 Subject: [PATCH 12/14] disable DPA --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 23d05b726..b5056e6b5 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1789,12 +1789,12 @@ dsv4-fp4-mi355x-vllm: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 64, conc-end: 64 } - isl: 8192 osl: 1024 search-space: - { tp: 8, conc-start: 4, 
conc-end: 64 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 64, conc-end: 64 } # Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650). # PR1 of the ATOM DSv4 series still uses torch sparse-attention fallbacks From ac04da00630f0fc1cc127ac62eae9783c89ed6ac Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 19 May 2026 09:27:28 -0700 Subject: [PATCH 13/14] disable EP --- .github/configs/amd-master.yaml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index b5056e6b5..efc9c3471 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1788,13 +1788,11 @@ dsv4-fp4-mi355x-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 64, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 128 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 64, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 128 } # Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650). 
# PR1 of the ATOM DSv4 series still uses torch sparse-attention fallbacks From 7c2f1f4aaabb0d5d8851892cb0e4b4fa2b0f119c Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 19 May 2026 13:10:29 -0700 Subject: [PATCH 14/14] final --- perf-changelog.yaml | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index c29c8669f..562bda078 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2473,15 +2473,6 @@ - "Add DeepSeek-V4-Pro FP4 MI355X ATOM benchmark config; bump image to rocm/atom-dev:nightly_202605101539, expand concurrency range (conc 4–1024), and simplify runtime script" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1311 -- config-keys: - - dsv4-fp4-mi355x-vllm - description: - - "Adopt validated MI355X serving recipe from vllm-project/recipes#433 (DeepSeek-V4-Pro, TP=8)" - - "Add env: VLLM_ROCM_USE_AITER_LINEAR=1 (alongside existing VLLM_ROCM_USE_AITER=1)" - - "Add server flags: --distributed-executor-backend mp, --max-num-batched-tokens 8192, --async-scheduling" - - "Expand search space from conc=1 to conc 4-64 to match dsv4-fp4-mi355x-sglang for vLLM<->SGLang comparability now that max-num-seqs=128 supports it" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1374 - - config-keys: - glm5-fp8-mi355x-sglang description: @@ -2685,12 +2676,6 @@ - "Temporarily disable agentic-coding scenario (blocked by e2e-tests.yml artifact-name mismatch)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1415 -- config-keys: - - dsv4-fp4-mi355x-vllm - description: - - "Update vLLM ROCm image from nightly-dcacdf9a8860a86401127d1c8f93ebf3cfbfd026 to nightly-b50646e5effd7cb5884cd96fdff4c53c18521198" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1374 - - config-keys: - qwen3.5-bf16-b200-sglang - qwen3.5-bf16-b200-sglang-mtp @@ -2961,5 +2946,6 @@ - config-keys: - dsv4-fp4-mi355x-vllm description: + - 
"Following recipe from https://github.com/vllm-project/recipes/pull/433" - "Add DEP8 dp-attn=true validation probes at conc=64 for 1k1k and 8k1k" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1374