Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 11 additions & 15 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1999,26 +1999,22 @@ dsr1-fp8-b300-sglang:
- { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
- { tp: 4, ep: 1, conc-start: 4, conc-end: 32 }

# NOTE: https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
# lists B200 (not B300) as the Blackwell target. This config reuses the
# B200 Pro FP4 Max-Throughput recipe (DP=8 + DeepEP, no MTP) on B300
# until a B300-specific recipe ships. Prefix caching is disabled.
# Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm.
# DeepSeek-V4-Pro on B300 with sglang (non-MTP).
# Uses nightly image with megamoe backend for high-concurrency profiles.
dsv4-fp4-b300-sglang:
image: lmsysorg/sglang:deepseek-v4-b300@sha256:2fec8d7958bb0d53b50d7bf04d6ae6a7de8a35503775826e0550a45dd8c3ee15
image: lmsysorg/sglang:nightly-dev-cu13-20260520-425dffbd
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: b300
precision: fp4
framework: sglang
multinode: false
# Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
# are selected inside benchmarks/single_node/dsv4_fp4_b300_sglang.sh by CONC:
# low-latency (CONC <= 32): TP-only
# balanced (32 < CONC <= 128): + DP-attn
# max-throughput (CONC > 128): + DP-attn
# Split so result filenames (ep=, dpa=) accurately reflect the recipe.
# ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size,
# Recipes are selected inside benchmarks/single_node/dsv4_fp4_b300_sglang.sh
# by CONC:
# CONC 1 or 32: TP-only, flashinfer_mxfp4
# CONC 512: DP-attn, flashinfer_mxfp4
# CONC 2048, 4096, or 8192: DP-attn, megamoe
# Any other CONC value is rejected by the script with an error.
# ep is implicit in sglang: --moe-a2a-backend megamoe forces ep_size=tp_size,
# while the flashinfer_mxfp4 profiles (CONC 1, 32, and 512) leave ep_size at
# the default of 1.
scenarios:
fixed-seq-len:
Expand All @@ -2027,14 +2023,14 @@ dsv4-fp4-b300-sglang:
search-space:
- { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
- { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
- { tp: 4, ep: 1, dp-attn: true, conc-start: 512, conc-end: 512 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 8192, conc-end: 8192 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
- { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
- { tp: 4, ep: 1, dp-attn: true, conc-start: 512, conc-end: 512 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 }

Expand Down
201 changes: 89 additions & 112 deletions benchmarks/single_node/dsv4_fp4_b300_sglang.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,20 +24,9 @@ fi

nvidia-smi

# Common SGLANG env vars (apply to every config).
# ─── Common env vars (all profiles) ───────────────────────────────────────────
export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1
export SGLANG_OPT_USE_JIT_NORM=1
export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1
export SGLANG_OPT_USE_TOPK_V2=1
export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1

# TODO(Cam): the deepseek-v4 sglang images install sglang editable at
# /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
# The runner mounts our repo at a non-/workspace path for these images so the
# editable install stays visible. Paths in this script are $PWD-relative for
# that reason. Drop the runner conditional once lmsys moves sglang back out of
# /workspace.

SERVER_LOG="$PWD/server.log"
PORT=${PORT:-8888}
Expand All @@ -52,114 +41,101 @@ fi

start_gpu_monitor --output "$PWD/gpu_metrics.csv"

# 1k inputs need more SWA cache headroom on B300 than 8k inputs do; 0.5 was
# ─── Per-concurrency launch profile ──────────────────────────────────────────
# Each block sets: PARALLEL_ARGS, MEM_FRACTION_STATIC, SWA_FULL_TOKENS_RATIO,
# and optionally MAX_RUNNING_REQUESTS plus profile-specific env vars.
#
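# SWA ratio (flashinfer_mxfp4 profiles, CONC 1/32/512): 1k inputs need more SWA
# cache headroom than 8k inputs; 0.5 was tuned empirically for the 1k1k recipe,
# while 0.1 is the cookbook default. The megamoe profiles (CONC 2048/4096/8192)
# each set their own fixed ratio regardless of ISL.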
if [[ "$ISL" == "1024" ]]; then
SWA_FULL_TOKENS_RATIO=0.5
else
SWA_FULL_TOKENS_RATIO=0.1
fi

# Pick the parallelism + MoE backend based on DP_ATTENTION (mirrors the vllm
# script's pattern). DP-attention runs the empirically-tuned high-concurrency
# recipe (flashinfer_mxfp4 runner + halved prefill chunks + prefill-delayer);
# single-instance uses flashinfer_mxfp4 with the cookbook defaults.
DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'

# Default; the DP-attn branch below overrides to 0.94.
MEM_FRACTION_STATIC=0.90
if [ "$CONC" = "1" ] || [ "$CONC" = "32" ]; then
# TP-only, no DP attention
MEM_FRACTION_STATIC=0.90
SWA_FULL_TOKENS_RATIO=$([[ "$ISL" == "1024" ]] && echo 0.5 || echo 0.1)
PARALLEL_ARGS=(
--moe-runner-backend flashinfer_mxfp4
--chunked-prefill-size 8192
--disable-flashinfer-autotune
)

if [ "${DP_ATTENTION}" = "true" ]; then
elif [ "$CONC" = "512" ]; then
# DP attention, flashinfer_mxfp4
export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
export SGLANG_OPT_USE_FAST_MASK_EP=1
export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
# ep=8 in the yaml signals the mega_moe deepep backend; check high-conc
# recipes first (they also have ep=8) so they aren't shadowed by the
# medium-conc EP_SIZE=8 branch below.
if [ "$CONC" = "2048" ] || [ "$CONC" = "4096" ] || [ "$CONC" = "8192" ]; then
export NVSHMEM_DISABLE_IB=1
export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
if [ "$CONC" = "2048" ]; then
export SGLANG_LOG_FORWARD_ITERS=1
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
CUDA_GRAPH_MAX_BS=288
MAX_RUNNING_REQUESTS=2560
MEM_FRACTION_STATIC=0.87
SWA_FULL_TOKENS_RATIO=0.06
TOKENIZER_WORKER_NUM=4
elif [ "$CONC" = "4096" ]; then
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
CUDA_GRAPH_MAX_BS=544
MAX_RUNNING_REQUESTS=4352
MEM_FRACTION_STATIC=0.835
SWA_FULL_TOKENS_RATIO=0.075
TOKENIZER_WORKER_NUM=8
else
export SGLANG_OPT_USE_ONLINE_COMPRESS=1
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8256
CUDA_GRAPH_MAX_BS=1088
MAX_RUNNING_REQUESTS=8192
MEM_FRACTION_STATIC=0.80
SWA_FULL_TOKENS_RATIO=0.3
TOKENIZER_WORKER_NUM=16
fi
PARALLEL_ARGS=(
--dp-size "$TP"
--enable-dp-attention
--moe-a2a-backend deepep
--cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS"
--deepep-config "$DEEPEP_CONFIG"
--chunked-prefill-size 65536
--tokenizer-worker-num "$TOKENIZER_WORKER_NUM"
--enable-prefill-delayer
)
if [ "$CONC" = "4096" ]; then
PARALLEL_ARGS+=(--decode-log-interval 5)
fi
if [ "$CONC" = "8192" ]; then
PARALLEL_ARGS+=(--stream-interval 30)
fi
elif [ "${EP_SIZE}" = "8" ]; then
export NVSHMEM_DISABLE_IB=1
export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=550
PARALLEL_ARGS=(
--dp-size "$TP"
--enable-dp-attention
--moe-a2a-backend deepep
--cuda-graph-max-bs 550
--deepep-config "$DEEPEP_CONFIG"
--chunked-prefill-size 16384
--enable-prefill-delayer
)
MAX_RUNNING_REQUESTS=768
MEM_FRACTION_STATIC=0.94
else
export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0
export SGLANG_OPT_FIX_HASH_MEGA_MOE=0
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
PARALLEL_ARGS=(
--dp-size "$TP"
--enable-dp-attention
--moe-runner-backend flashinfer_mxfp4
--disable-flashinfer-autotune
--deepep-config "$DEEPEP_CONFIG"
--chunked-prefill-size 16384
--enable-prefill-delayer
)
MEM_FRACTION_STATIC=0.94
fi
else
MEM_FRACTION_STATIC=0.94
SWA_FULL_TOKENS_RATIO=$([[ "$ISL" == "1024" ]] && echo 0.5 || echo 0.1)
PARALLEL_ARGS=(
--dp-size "$TP"
--enable-dp-attention
--moe-runner-backend flashinfer_mxfp4
--chunked-prefill-size 8192
--disable-flashinfer-autotune
--chunked-prefill-size 16384
--enable-prefill-delayer
)

elif [ "$CONC" = "2048" ]; then
# DP attention, megamoe
export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
export NVSHMEM_DISABLE_IB=1
export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
export SGLANG_LOG_FORWARD_ITERS=1
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
MEM_FRACTION_STATIC=0.87
SWA_FULL_TOKENS_RATIO=0.06
MAX_RUNNING_REQUESTS=2560
PARALLEL_ARGS=(
--dp-size "$TP"
--enable-dp-attention
--moe-a2a-backend megamoe
--cuda-graph-max-bs 288
--chunked-prefill-size 65536
--tokenizer-worker-num 4
--enable-prefill-delayer
)

elif [ "$CONC" = "4096" ]; then
# DP attention, megamoe
export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
export NVSHMEM_DISABLE_IB=1
export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
MEM_FRACTION_STATIC=0.835
SWA_FULL_TOKENS_RATIO=0.075
MAX_RUNNING_REQUESTS=4352
PARALLEL_ARGS=(
--dp-size "$TP"
--enable-dp-attention
--moe-a2a-backend megamoe
--cuda-graph-max-bs 544
--chunked-prefill-size 65536
--tokenizer-worker-num 8
--enable-prefill-delayer
--decode-log-interval 5
)

elif [ "$CONC" = "8192" ]; then
# DP attention, megamoe
export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
export NVSHMEM_DISABLE_IB=1
export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
export SGLANG_OPT_USE_ONLINE_COMPRESS=1
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8256
MEM_FRACTION_STATIC=0.80
SWA_FULL_TOKENS_RATIO=0.3
MAX_RUNNING_REQUESTS=8192
PARALLEL_ARGS=(
--dp-size "$TP"
--enable-dp-attention
--moe-a2a-backend megamoe
--cuda-graph-max-bs 1088
--chunked-prefill-size 65536
--tokenizer-worker-num 16
--enable-prefill-delayer
--stream-interval 30
)

else
echo "ERROR: unsupported CONC=$CONC" >&2
exit 1
fi

# Print all SGLANG_* env vars to both the CI step log and server.log so the
Expand Down Expand Up @@ -187,6 +163,7 @@ SERVER_PID=$!
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

pip install -q datasets pandas
pip install -q --upgrade transformers

run_benchmark_serving \
--model "$MODEL" \
Expand Down
13 changes: 13 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3062,3 +3062,16 @@
description:
- "Bump vLLM ROCm image from nightly-b50646e5effd7cb5884cd96fdff4c53c18521198 to nightly-4f940896a32c9e2a0eba7f50d521bf5f6b4de458"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1546

- config-keys:
- dsv4-fp4-b300-sglang
description:
- "Update sglang image from deepseek-v4-b300 to nightly-dev-cu13-20260520-425dffbd"
- "Refactor benchmark script to dispatch by CONC instead of nested DP_ATTENTION/CONC/EP_SIZE"
- "Switch CONC 2048/4096/8192 from --moe-a2a-backend deepep to megamoe"
- "Remove env vars deleted from sglang main (SGLANG_OPT_USE_JIT_NORM, SGLANG_OPT_USE_FAST_MASK_EP, SGLANG_OPT_FIX_NEXTN_MEGA_MOE, SGLANG_OPT_FIX_HASH_MEGA_MOE)"
- "Remove env vars redundant with sglang defaults (SGLANG_OPT_USE_JIT_INDEXER_METADATA, SGLANG_OPT_USE_TOPK_V2, SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2)"
- "Remove env vars auto-set by megamoe backend (SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE, SGLANG_OPT_FIX_MEGA_MOE_MEMORY)"
- "Remove --deepep-config and SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK (unused by megamoe/StandardDispatcher)"
- "Fix CONC=512 yaml ep from 4 to 1 (flashinfer_mxfp4 does not set ep=tp)"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1506
Loading