diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 4a683d372..7955aba4e 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1999,26 +1999,22 @@ dsr1-fp8-b300-sglang: - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } - { tp: 4, ep: 1, conc-start: 4, conc-end: 32 } -# NOTE: https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 -# lists B200 (not B300) as the Blackwell target. This config reuses the -# B200 Pro FP4 Max-Throughput recipe (DP=8 + DeepEP, no MTP) on B300 -# until a B300-specific recipe ships. Prefix caching is disabled. -# Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm. +# DeepSeek-V4-Pro on B300 with sglang (non-MTP). +# Uses nightly image with megamoe backend for high-concurrency profiles. dsv4-fp4-b300-sglang: - image: lmsysorg/sglang:deepseek-v4-b300@sha256:2fec8d7958bb0d53b50d7bf04d6ae6a7de8a35503775826e0550a45dd8c3ee15 + image: lmsysorg/sglang:nightly-dev-cu13-20260520-425dffbd model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 precision: fp4 framework: sglang multinode: false - # Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 - # are selected inside benchmarks/single_node/dsv4_fp4_b300_sglang.sh by CONC: - # low-latency (CONC <= 32): TP-only - # balanced (32 < CONC <= 128): + DP-attn - # max-throughput (CONC > 128): + DP-attn - # Split so result filenames (ep=, dpa=) accurately reflect the recipe. - # ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size, + # Recipes are selected inside benchmarks/single_node/dsv4_fp4_b300_sglang.sh + # by CONC: + # CONC 1|32: TP-only, flashinfer_mxfp4 + # CONC 512: DP-attn, flashinfer_mxfp4 + # CONC 2048-8192: DP-attn, megamoe + # ep is implicit in sglang: --moe-a2a-backend megamoe forces ep_size=tp_size, # while low-latency leaves ep_size at the default of 1. scenarios: fixed-seq-len: @@ -2027,14 +2023,14 @@ dsv4-fp4-b300-sglang: search-space: - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } + - { tp: 4, ep: 1, dp-attn: true, conc-start: 512, conc-end: 512 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 8192, conc-end: 8192 } - isl: 8192 osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } + - { tp: 4, ep: 1, dp-attn: true, conc-start: 512, conc-end: 512 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 } diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh index 8f43ea8a3..5fb8499d2 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh @@ -24,20 +24,9 @@ fi nvidia-smi -# Common SGLANG env vars (apply to every config). +# ─── Common env vars (all profiles) ─────────────────────────────────────────── export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 -export SGLANG_OPT_USE_JIT_NORM=1 -export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 -export SGLANG_OPT_USE_TOPK_V2=1 -export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 - -# TODO(Cam): the deepseek-v4 sglang images install sglang editable at -# /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang. -# The runner mounts our repo at a non-/workspace path for these images so the -# editable install stays visible. Paths in this script are $PWD-relative for -# that reason. Drop the runner conditional once lmsys moves sglang back out of -# /workspace. SERVER_LOG="$PWD/server.log" PORT=${PORT:-8888} @@ -52,114 +41,101 @@ fi start_gpu_monitor --output "$PWD/gpu_metrics.csv" -# 1k inputs need more SWA cache headroom on B300 than 8k inputs do; 0.5 was +# ─── Per-concurrency launch profile ────────────────────────────────────────── +# Each block sets: PARALLEL_ARGS, MEM_FRACTION_STATIC, SWA_FULL_TOKENS_RATIO, +# and optionally MAX_RUNNING_REQUESTS plus profile-specific env vars. +# +# SWA ratio: 1k inputs need more SWA cache headroom than 8k inputs; 0.5 was # tuned empirically for the 1k1k recipe, while 0.1 is the cookbook default. -if [[ "$ISL" == "1024" ]]; then - SWA_FULL_TOKENS_RATIO=0.5 -else - SWA_FULL_TOKENS_RATIO=0.1 -fi - -# Pick the parallelism + MoE backend based on DP_ATTENTION (mirrors the vllm -# script's pattern). DP-attention runs the empirically-tuned high-concurrency -# recipe (flashinfer_mxfp4 runner + halved prefill chunks + prefill-delayer); -# single-instance uses flashinfer_mxfp4 with the cookbook defaults. -DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' -# Default; the DP-attn branch below overrides to 0.94. -MEM_FRACTION_STATIC=0.90 +if [ "$CONC" = "1" ] || [ "$CONC" = "32" ]; then + # TP-only, no DP attention + MEM_FRACTION_STATIC=0.90 + SWA_FULL_TOKENS_RATIO=$([[ "$ISL" == "1024" ]] && echo 0.5 || echo 0.1) + PARALLEL_ARGS=( + --moe-runner-backend flashinfer_mxfp4 + --chunked-prefill-size 8192 + --disable-flashinfer-autotune + ) -if [ "${DP_ATTENTION}" = "true" ]; then +elif [ "$CONC" = "512" ]; then + # DP attention, flashinfer_mxfp4 export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 - export SGLANG_OPT_USE_FAST_MASK_EP=1 - export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 - export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 - export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 - # ep=8 in the yaml signals the mega_moe deepep backend; check high-conc - # recipes first (they also have ep=8) so they aren't shadowed by the - # medium-conc EP_SIZE=8 branch below. - if [ "$CONC" = "2048" ] || [ "$CONC" = "4096" ] || [ "$CONC" = "8192" ]; then - export NVSHMEM_DISABLE_IB=1 - export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1 - export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 - export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 - if [ "$CONC" = "2048" ]; then - export SGLANG_LOG_FORWARD_ITERS=1 - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320 - CUDA_GRAPH_MAX_BS=288 - MAX_RUNNING_REQUESTS=2560 - MEM_FRACTION_STATIC=0.87 - SWA_FULL_TOKENS_RATIO=0.06 - TOKENIZER_WORKER_NUM=4 - elif [ "$CONC" = "4096" ]; then - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320 - CUDA_GRAPH_MAX_BS=544 - MAX_RUNNING_REQUESTS=4352 - MEM_FRACTION_STATIC=0.835 - SWA_FULL_TOKENS_RATIO=0.075 - TOKENIZER_WORKER_NUM=8 - else - export SGLANG_OPT_USE_ONLINE_COMPRESS=1 - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8256 - CUDA_GRAPH_MAX_BS=1088 - MAX_RUNNING_REQUESTS=8192 - MEM_FRACTION_STATIC=0.80 - SWA_FULL_TOKENS_RATIO=0.3 - TOKENIZER_WORKER_NUM=16 - fi - PARALLEL_ARGS=( - --dp-size "$TP" - --enable-dp-attention - --moe-a2a-backend deepep - --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS" - --deepep-config "$DEEPEP_CONFIG" - --chunked-prefill-size 65536 - --tokenizer-worker-num "$TOKENIZER_WORKER_NUM" - --enable-prefill-delayer - ) - if [ "$CONC" = "4096" ]; then - PARALLEL_ARGS+=(--decode-log-interval 5) - fi - if [ "$CONC" = "8192" ]; then - PARALLEL_ARGS+=(--stream-interval 30) - fi - elif [ "${EP_SIZE}" = "8" ]; then - export NVSHMEM_DISABLE_IB=1 - export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 - export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=550 - PARALLEL_ARGS=( - --dp-size "$TP" - --enable-dp-attention - --moe-a2a-backend deepep - --cuda-graph-max-bs 550 - --deepep-config "$DEEPEP_CONFIG" - --chunked-prefill-size 16384 - --enable-prefill-delayer - ) - MAX_RUNNING_REQUESTS=768 - MEM_FRACTION_STATIC=0.94 - else - export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0 - export SGLANG_OPT_FIX_HASH_MEGA_MOE=0 - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 - PARALLEL_ARGS=( - --dp-size "$TP" - --enable-dp-attention - --moe-runner-backend flashinfer_mxfp4 - --disable-flashinfer-autotune - --deepep-config "$DEEPEP_CONFIG" - --chunked-prefill-size 16384 - --enable-prefill-delayer - ) - MEM_FRACTION_STATIC=0.94 - fi -else + MEM_FRACTION_STATIC=0.94 + SWA_FULL_TOKENS_RATIO=$([[ "$ISL" == "1024" ]] && echo 0.5 || echo 0.1) PARALLEL_ARGS=( + --dp-size "$TP" + --enable-dp-attention --moe-runner-backend flashinfer_mxfp4 - --chunked-prefill-size 8192 --disable-flashinfer-autotune + --chunked-prefill-size 16384 + --enable-prefill-delayer + ) + +elif [ "$CONC" = "2048" ]; then + # DP attention, megamoe + export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 + export NVSHMEM_DISABLE_IB=1 + export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1 + export SGLANG_LOG_FORWARD_ITERS=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320 + MEM_FRACTION_STATIC=0.87 + SWA_FULL_TOKENS_RATIO=0.06 + MAX_RUNNING_REQUESTS=2560 + PARALLEL_ARGS=( + --dp-size "$TP" + --enable-dp-attention + --moe-a2a-backend megamoe + --cuda-graph-max-bs 288 + --chunked-prefill-size 65536 + --tokenizer-worker-num 4 + --enable-prefill-delayer + ) + +elif [ "$CONC" = "4096" ]; then + # DP attention, megamoe + export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 + export NVSHMEM_DISABLE_IB=1 + export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320 + MEM_FRACTION_STATIC=0.835 + SWA_FULL_TOKENS_RATIO=0.075 + MAX_RUNNING_REQUESTS=4352 + PARALLEL_ARGS=( + --dp-size "$TP" + --enable-dp-attention + --moe-a2a-backend megamoe + --cuda-graph-max-bs 544 + --chunked-prefill-size 65536 + --tokenizer-worker-num 8 + --enable-prefill-delayer + --decode-log-interval 5 ) + +elif [ "$CONC" = "8192" ]; then + # DP attention, megamoe + export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 + export NVSHMEM_DISABLE_IB=1 + export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1 + export SGLANG_OPT_USE_ONLINE_COMPRESS=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8256 + MEM_FRACTION_STATIC=0.80 + SWA_FULL_TOKENS_RATIO=0.3 + MAX_RUNNING_REQUESTS=8192 + PARALLEL_ARGS=( + --dp-size "$TP" + --enable-dp-attention + --moe-a2a-backend megamoe + --cuda-graph-max-bs 1088 + --chunked-prefill-size 65536 + --tokenizer-worker-num 16 + --enable-prefill-delayer + --stream-interval 30 + ) + +else + echo "ERROR: unsupported CONC=$CONC" >&2 + exit 1 fi # Print all SGLANG_* env vars to both the CI step log and server.log so the @@ -187,6 +163,7 @@ SERVER_PID=$! wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" pip install -q datasets pandas +pip install -q --upgrade transformers run_benchmark_serving \ --model "$MODEL" \ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 47eba24e1..9de972440 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3062,3 +3062,16 @@ description: - "Bump vLLM ROCm image from nightly-b50646e5effd7cb5884cd96fdff4c53c18521198 to nightly-4f940896a32c9e2a0eba7f50d521bf5f6b4de458" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1546 + +- config-keys: + - dsv4-fp4-b300-sglang + description: + - "Update sglang image from deepseek-v4-b300 to nightly-dev-cu13-20260520-425dffbd" + - "Refactor benchmark script to dispatch by CONC instead of nested DP_ATTENTION/CONC/EP_SIZE" + - "Switch CONC 2048/4096/8192 from --moe-a2a-backend deepep to megamoe" + - "Remove env vars deleted from sglang main (SGLANG_OPT_USE_JIT_NORM, SGLANG_OPT_USE_FAST_MASK_EP, SGLANG_OPT_FIX_NEXTN_MEGA_MOE, SGLANG_OPT_FIX_HASH_MEGA_MOE)" + - "Remove env vars redundant with sglang defaults (SGLANG_OPT_USE_JIT_INDEXER_METADATA, SGLANG_OPT_USE_TOPK_V2, SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2)" + - "Remove env vars auto-set by megamoe backend (SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE, SGLANG_OPT_FIX_MEGA_MOE_MEMORY)" + - "Remove --deepep-config and SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK (unused by megamoe/StandardDispatcher)" + - "Fix CONC=512 yaml ep from 4 to 1 (flashinfer_mxfp4 does not set ep=tp)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1506