SemiAnalysisAI · yhyang201 · May 18, 2026 · May 18, 2026 · May 19, 2026
@@ -1999,26 +1999,22 @@ dsr1-fp8-b300-sglang:
       - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
       - { tp: 4, ep: 1, conc-start: 4, conc-end: 32 }
 
-# NOTE: https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
-# lists B200 (not B300) as the Blackwell target. This config reuses the
-# B200 Pro FP4 Max-Throughput recipe (DP=8 + DeepEP, no MTP) on B300
-# until a B300-specific recipe ships. Prefix caching is disabled.
-# Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm.
+# DeepSeek-V4-Pro on B300 with sglang (non-MTP).
+# Uses nightly image with megamoe backend for high-concurrency profiles.
 dsv4-fp4-b300-sglang:
-  image: lmsysorg/sglang:deepseek-v4-b300@sha256:2fec8d7958bb0d53b50d7bf04d6ae6a7de8a35503775826e0550a45dd8c3ee15
+  image: lmsysorg/sglang:nightly-dev-cu13-20260520-425dffbd
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300
   precision: fp4
   framework: sglang
   multinode: false
-  # Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
-  # are selected inside benchmarks/single_node/dsv4_fp4_b300_sglang.sh by CONC:
-  #   low-latency    (CONC <= 32):       TP-only
-  #   balanced       (32 < CONC <= 128): + DP-attn
-  #   max-throughput (CONC > 128):       + DP-attn
-  # Split so result filenames (ep=, dpa=) accurately reflect the recipe.
-  # ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size,
+  # Recipes are selected inside benchmarks/single_node/dsv4_fp4_b300_sglang.sh
+  # by CONC:
+  #   CONC 1|32 (low-latency): TP-only, flashinfer_mxfp4
+  #   CONC 512:                DP-attn, flashinfer_mxfp4 (ep stays at 1)
+  #   CONC 2048|4096|8192:     DP-attn, megamoe
+  # ep is implicit in sglang: --moe-a2a-backend megamoe forces ep_size=tp_size,
   # while low-latency leaves ep_size at the default of 1.
   scenarios:
     fixed-seq-len:
@@ -2027,14 +2023,14 @@ dsv4-fp4-b300-sglang:
       search-space:
       - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
       - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
-      - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
+      - { tp: 4, ep: 1, dp-attn: true, conc-start: 512, conc-end: 512 }
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 8192, conc-end: 8192 }
     - isl: 8192
       osl: 1024
       search-space:
       - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
       - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
-      - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
+      - { tp: 4, ep: 1, dp-attn: true, conc-start: 512, conc-end: 512 }
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 }
 

diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
@@ -24,20 +24,9 @@ fi
 
 nvidia-smi
 
-# Common SGLANG env vars (apply to every config).
+# ─── Common env vars (all profiles) ───────────────────────────────────────────
 export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
 export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1
-export SGLANG_OPT_USE_JIT_NORM=1
-export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1
-export SGLANG_OPT_USE_TOPK_V2=1
-export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1
-
-# TODO(Cam): the deepseek-v4 sglang images install sglang editable at
-# /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
-# The runner mounts our repo at a non-/workspace path for these images so the
-# editable install stays visible. Paths in this script are $PWD-relative for
-# that reason. Drop the runner conditional once lmsys moves sglang back out of
-# /workspace.
 
 SERVER_LOG="$PWD/server.log"
 PORT=${PORT:-8888}
@@ -52,114 +41,101 @@ fi
 
 start_gpu_monitor --output "$PWD/gpu_metrics.csv"
 
-# 1k inputs need more SWA cache headroom on B300 than 8k inputs do; 0.5 was
+# ─── Per-concurrency launch profile ──────────────────────────────────────────
+# Each block sets: PARALLEL_ARGS, MEM_FRACTION_STATIC, SWA_FULL_TOKENS_RATIO,
+# and optionally MAX_RUNNING_REQUESTS plus profile-specific env vars.
+#
+# SWA ratio (CONC 1/32/512 only): 1k inputs need more SWA cache headroom than 8k inputs; 0.5 was
 # tuned empirically for the 1k1k recipe, while 0.1 is the cookbook default.
-if [[ "$ISL" == "1024" ]]; then
-    SWA_FULL_TOKENS_RATIO=0.5
-else
-    SWA_FULL_TOKENS_RATIO=0.1
-fi
-
-# Pick the parallelism + MoE backend based on DP_ATTENTION (mirrors the vllm
-# script's pattern). DP-attention runs the empirically-tuned high-concurrency
-# recipe (flashinfer_mxfp4 runner + halved prefill chunks + prefill-delayer);
-# single-instance uses flashinfer_mxfp4 with the cookbook defaults.
-DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
 
-# Default; the DP-attn branch below overrides to 0.94.
-MEM_FRACTION_STATIC=0.90
+if [ "$CONC" = "1" ] || [ "$CONC" = "32" ]; then
+    # TP-only, no DP attention
+    MEM_FRACTION_STATIC=0.90
+    SWA_FULL_TOKENS_RATIO=$([[ "$ISL" == "1024" ]] && echo 0.5 || echo 0.1)
+    PARALLEL_ARGS=(
+        --moe-runner-backend flashinfer_mxfp4
+        --chunked-prefill-size 8192
+        --disable-flashinfer-autotune
+    )
 
-if [ "${DP_ATTENTION}" = "true" ]; then
+elif [ "$CONC" = "512" ]; then
+    # DP attention, flashinfer_mxfp4
     export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
-    export SGLANG_OPT_USE_FAST_MASK_EP=1
-    export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
-    export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
-    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
-    # ep=8 in the yaml signals the mega_moe deepep backend; check high-conc
-    # recipes first (they also have ep=8) so they aren't shadowed by the
-    # medium-conc EP_SIZE=8 branch below.
-    if [ "$CONC" = "2048" ] || [ "$CONC" = "4096" ] || [ "$CONC" = "8192" ]; then
-        export NVSHMEM_DISABLE_IB=1
-        export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
-        export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
-        export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
-        if [ "$CONC" = "2048" ]; then
-            export SGLANG_LOG_FORWARD_ITERS=1
-            export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
-            CUDA_GRAPH_MAX_BS=288
-            MAX_RUNNING_REQUESTS=2560
-            MEM_FRACTION_STATIC=0.87
-            SWA_FULL_TOKENS_RATIO=0.06
-            TOKENIZER_WORKER_NUM=4
-        elif [ "$CONC" = "4096" ]; then
-            export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
-            CUDA_GRAPH_MAX_BS=544
-            MAX_RUNNING_REQUESTS=4352
-            MEM_FRACTION_STATIC=0.835
-            SWA_FULL_TOKENS_RATIO=0.075
-            TOKENIZER_WORKER_NUM=8
-        else
-            export SGLANG_OPT_USE_ONLINE_COMPRESS=1
-            export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8256
-            CUDA_GRAPH_MAX_BS=1088
-            MAX_RUNNING_REQUESTS=8192
-            MEM_FRACTION_STATIC=0.80
-            SWA_FULL_TOKENS_RATIO=0.3
-            TOKENIZER_WORKER_NUM=16
-        fi
-        PARALLEL_ARGS=(
-            --dp-size "$TP"
-            --enable-dp-attention
-            --moe-a2a-backend deepep
-            --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS"
-            --deepep-config "$DEEPEP_CONFIG"
-            --chunked-prefill-size 65536
-            --tokenizer-worker-num "$TOKENIZER_WORKER_NUM"
-            --enable-prefill-delayer
-        )
-        if [ "$CONC" = "4096" ]; then
-            PARALLEL_ARGS+=(--decode-log-interval 5)
-        fi
-        if [ "$CONC" = "8192" ]; then
-            PARALLEL_ARGS+=(--stream-interval 30)
-        fi
-    elif [ "${EP_SIZE}" = "8" ]; then
-        export NVSHMEM_DISABLE_IB=1
-        export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
-        export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
-        export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=550
-        PARALLEL_ARGS=(
-            --dp-size "$TP"
-            --enable-dp-attention
-            --moe-a2a-backend deepep
-            --cuda-graph-max-bs 550
-            --deepep-config "$DEEPEP_CONFIG"
-            --chunked-prefill-size 16384
-            --enable-prefill-delayer
-        )
-        MAX_RUNNING_REQUESTS=768
-        MEM_FRACTION_STATIC=0.94
-    else
-        export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0
-        export SGLANG_OPT_FIX_HASH_MEGA_MOE=0
-        export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
-        PARALLEL_ARGS=(
-            --dp-size "$TP"
-            --enable-dp-attention
-            --moe-runner-backend flashinfer_mxfp4
-            --disable-flashinfer-autotune
-            --deepep-config "$DEEPEP_CONFIG"
-            --chunked-prefill-size 16384
-            --enable-prefill-delayer
-        )
-        MEM_FRACTION_STATIC=0.94
-    fi
-else
+    MEM_FRACTION_STATIC=0.94
+    SWA_FULL_TOKENS_RATIO=$([[ "$ISL" == "1024" ]] && echo 0.5 || echo 0.1)
     PARALLEL_ARGS=(
+        --dp-size "$TP"
+        --enable-dp-attention
         --moe-runner-backend flashinfer_mxfp4
-        --chunked-prefill-size 8192
         --disable-flashinfer-autotune
+        --chunked-prefill-size 16384
+        --enable-prefill-delayer
+    )
+
+elif [ "$CONC" = "2048" ]; then
+    # DP attention, megamoe
+    export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
+    export NVSHMEM_DISABLE_IB=1
+    export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
+    export SGLANG_LOG_FORWARD_ITERS=1
+    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
+    MEM_FRACTION_STATIC=0.87
+    SWA_FULL_TOKENS_RATIO=0.06
+    MAX_RUNNING_REQUESTS=2560
+    PARALLEL_ARGS=(
+        --dp-size "$TP"
+        --enable-dp-attention
+        --moe-a2a-backend megamoe
+        --cuda-graph-max-bs 288
+        --chunked-prefill-size 65536
+        --tokenizer-worker-num 4
+        --enable-prefill-delayer
+    )
+
+elif [ "$CONC" = "4096" ]; then
+    # DP attention, megamoe
+    export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
+    export NVSHMEM_DISABLE_IB=1
+    export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
+    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
+    MEM_FRACTION_STATIC=0.835
+    SWA_FULL_TOKENS_RATIO=0.075
+    MAX_RUNNING_REQUESTS=4352
+    PARALLEL_ARGS=(
+        --dp-size "$TP"
+        --enable-dp-attention
+        --moe-a2a-backend megamoe
+        --cuda-graph-max-bs 544
+        --chunked-prefill-size 65536
+        --tokenizer-worker-num 8
+        --enable-prefill-delayer
+        --decode-log-interval 5
     )
+
+elif [ "$CONC" = "8192" ]; then
+    # DP attention, megamoe
+    export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
+    export NVSHMEM_DISABLE_IB=1
+    export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
+    export SGLANG_OPT_USE_ONLINE_COMPRESS=1
+    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8256
+    MEM_FRACTION_STATIC=0.80
+    SWA_FULL_TOKENS_RATIO=0.3
+    MAX_RUNNING_REQUESTS=8192
+    PARALLEL_ARGS=(
+        --dp-size "$TP"
+        --enable-dp-attention
+        --moe-a2a-backend megamoe
+        --cuda-graph-max-bs 1088
+        --chunked-prefill-size 65536
+        --tokenizer-worker-num 16
+        --enable-prefill-delayer
+        --stream-interval 30
+    )
+
+else
+    echo "ERROR: unsupported CONC=$CONC" >&2
+    exit 1
 fi
 
 # Print all SGLANG_* env vars to both the CI step log and server.log so the
@@ -187,6 +163,7 @@ SERVER_PID=$!
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
 pip install -q datasets pandas
+pip install -q --upgrade transformers
 
 run_benchmark_serving \
     --model "$MODEL" \

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3062,3 +3062,16 @@
   description:
     - "Bump vLLM ROCm image from nightly-b50646e5effd7cb5884cd96fdff4c53c18521198 to nightly-4f940896a32c9e2a0eba7f50d521bf5f6b4de458"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1546
+
+- config-keys:
+    - dsv4-fp4-b300-sglang
+  description:
+    - "Update sglang image from deepseek-v4-b300 to nightly-dev-cu13-20260520-425dffbd"
+    - "Refactor benchmark script to dispatch by CONC instead of nested DP_ATTENTION/CONC/EP_SIZE; unsupported CONC values now exit with an error"
+    - "Switch CONC 2048/4096/8192 from --moe-a2a-backend deepep to megamoe"
+    - "Remove env vars deleted from sglang main (SGLANG_OPT_USE_JIT_NORM, SGLANG_OPT_USE_FAST_MASK_EP, SGLANG_OPT_FIX_NEXTN_MEGA_MOE, SGLANG_OPT_FIX_HASH_MEGA_MOE)"
+    - "Remove env vars redundant with sglang defaults (SGLANG_OPT_USE_JIT_INDEXER_METADATA, SGLANG_OPT_USE_TOPK_V2, SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2)"
+    - "Remove env vars auto-set by megamoe backend (SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE, SGLANG_OPT_FIX_MEGA_MOE_MEMORY)"
+    - "Remove --deepep-config and SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK (unused by megamoe/StandardDispatcher)"
+    - "Fix CONC=512 yaml ep from 4 to 1 (flashinfer_mxfp4 does not set ep=tp)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1506