From 12c9d3c7013d059a7e53e97366a6e8757d81216d Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Wed, 20 May 2026 16:29:31 +0800 Subject: [PATCH] Restore dpskv4 GB300 non-MTP disagg to staging image + deepep backend --- .github/configs/nvidia-master.yaml | 2 +- ...isagg-gb300-10p1d-dep4-dep16-14-c8192.yaml | 40 +++++++++++++------ ...sagg-gb300-12p1d-dep4-dep12-15-c21504.yaml | 40 +++++++++++++------ .../disagg-gb300-1p1d-dep4-dep16-5-c1024.yaml | 40 +++++++++++++------ .../8k1k/disagg-gb300-1p1d-tp4-tp4-2-c1.yaml | 26 ++++++------ .../disagg-gb300-4p1d-dep4-dep16-8-c1024.yaml | 40 +++++++++++++------ ...disagg-gb300-8p1d-dep4-dep16-12-c4096.yaml | 40 +++++++++++++------ perf-changelog.yaml | 5 +++ 8 files changed, 160 insertions(+), 73 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 43b8be880..2d67aa29b 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -8759,7 +8759,7 @@ dsv4-fp4-gb300-dynamo-vllm: dp-attn: true dsv4-fp4-gb300-dynamo-sglang: - image: lmsysorg/sglang:nightly-dev-cu13-20260519-dbac4647 + image: lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: gb300-cw diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-10p1d-dep4-dep16-14-c8192.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-10p1d-dep4-dep16-14-c8192.yaml index eca9a5d7d..1bcd793c1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-10p1d-dep4-dep16-14-c8192.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-10p1d-dep4-dep16-14-c8192.yaml @@ -33,7 +33,7 @@ name: "disagg-gb300-10p1d-dep4-dep16-14-c8192" model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260519-dbac4647" + container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" precision: "fp4" dynamo: @@ -74,14 +74,24 @@ backend: prefill_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_DEFAULT_THINKING: "1" - SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "8192" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + SGLANG_OPT_FP8_WO_A_GEMM: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" @@ -94,17 +104,24 @@ backend: SGLANG_LOG_FORWARD_ITERS: "1" SGLANG_LOG_MS: "1" SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" - SGLANG_OPT_FP8_WO_A_GEMM: "0" decode_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_DEFAULT_THINKING: "1" - SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1280" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" SGLANG_OPT_USE_ONLINE_COMPRESS: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" NCCL_MNNVL_ENABLE: "1" @@ -119,7 +136,7 @@ backend: SGLANG_LOG_FORWARD_ITERS: "1" SGLANG_LOG_MS: "1" SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" - SGLANG_OPT_FP8_WO_A_GEMM: "0" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 # is single-node only and corrupts results in 2-node decode setups. sglang_config: @@ -135,7 +152,7 @@ backend: expert-parallel-size: 4 enable-dp-attention: true - moe-a2a-backend: "megamoe" + moe-a2a-backend: "deepep" deepep-config: '{"normal_dispatch":{"num_sms":88,"num_max_nvl_chunked_send_tokens":28,"num_max_nvl_chunked_recv_tokens":512},"normal_combine": {"num_sms":88,"num_max_nvl_chunked_send_tokens":16,"num_max_nvl_chunked_recv_tokens":512}}' moe-dense-tp-size: 1 @@ -154,7 +171,7 @@ backend: stream-interval: 60 load-balance-method: "total_requests" - moe-a2a-backend: "megamoe" + moe-a2a-backend: "deepep" disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake @@ -179,4 +196,3 @@ benchmark: concurrencies: "8192" req_rate: "inf" use_chat_template: false - custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep12-15-c21504.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep12-15-c21504.yaml index 70f791e78..0fbab8d77 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep12-15-c21504.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep12-15-c21504.yaml @@ -33,7 +33,7 @@ name: "disagg-gb300-12p1d-dep4-dep12-15-c21504" model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260519-dbac4647" + container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" precision: "fp4" dynamo: @@ -74,14 +74,24 @@ backend: prefill_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_DEFAULT_THINKING: "1" - SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "8192" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + SGLANG_OPT_FP8_WO_A_GEMM: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" @@ -94,17 +104,24 @@ backend: SGLANG_LOG_FORWARD_ITERS: "1" SGLANG_LOG_MS: "1" SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" - SGLANG_OPT_FP8_WO_A_GEMM: "0" decode_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_DEFAULT_THINKING: "1" - SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1280" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" SGLANG_OPT_USE_ONLINE_COMPRESS: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" NCCL_MNNVL_ENABLE: "1" @@ -119,7 +136,7 @@ backend: SGLANG_LOG_FORWARD_ITERS: "1" SGLANG_LOG_MS: "1" SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" - SGLANG_OPT_FP8_WO_A_GEMM: "0" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 # is single-node only and corrupts results in 2-node decode setups. sglang_config: @@ -135,7 +152,7 @@ backend: expert-parallel-size: 4 enable-dp-attention: true - moe-a2a-backend: "megamoe" + moe-a2a-backend: "deepep" deepep-config: '{"normal_dispatch":{"num_sms":88,"num_max_nvl_chunked_send_tokens":28,"num_max_nvl_chunked_recv_tokens":512},"normal_combine": {"num_sms":88,"num_max_nvl_chunked_send_tokens":16,"num_max_nvl_chunked_recv_tokens":512}}' moe-dense-tp-size: 1 @@ -154,7 +171,7 @@ backend: stream-interval: 60 load-balance-method: "total_requests" - moe-a2a-backend: "megamoe" + moe-a2a-backend: "deepep" disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake @@ -179,4 +196,3 @@ benchmark: concurrencies: "21504" req_rate: "inf" use_chat_template: false - custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep16-5-c1024.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep16-5-c1024.yaml index 49cc976ee..99bad72bc 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep16-5-c1024.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep16-5-c1024.yaml @@ -33,7 +33,7 @@ name: "disagg-gb300-1p1d-dep4-dep16-5-c1024" model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260519-dbac4647" + container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" precision: "fp4" dynamo: @@ -74,14 +74,24 @@ backend: prefill_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_DEFAULT_THINKING: "1" - SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "8192" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + SGLANG_OPT_FP8_WO_A_GEMM: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" @@ -94,17 +104,24 @@ backend: SGLANG_LOG_FORWARD_ITERS: "1" SGLANG_LOG_MS: "1" SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" - SGLANG_OPT_FP8_WO_A_GEMM: "0" decode_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_DEFAULT_THINKING: "1" - SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1280" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" SGLANG_OPT_USE_ONLINE_COMPRESS: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" NCCL_MNNVL_ENABLE: "1" @@ -119,7 +136,7 @@ backend: SGLANG_LOG_FORWARD_ITERS: "1" SGLANG_LOG_MS: "1" SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" - SGLANG_OPT_FP8_WO_A_GEMM: "0" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 # is single-node only and corrupts results in 2-node decode setups. sglang_config: @@ -135,7 +152,7 @@ backend: expert-parallel-size: 4 enable-dp-attention: true - moe-a2a-backend: "megamoe" + moe-a2a-backend: "deepep" deepep-config: '{"normal_dispatch":{"num_sms":88,"num_max_nvl_chunked_send_tokens":28,"num_max_nvl_chunked_recv_tokens":512},"normal_combine": {"num_sms":88,"num_max_nvl_chunked_send_tokens":16,"num_max_nvl_chunked_recv_tokens":512}}' moe-dense-tp-size: 1 @@ -154,7 +171,7 @@ backend: stream-interval: 60 load-balance-method: "total_requests" - moe-a2a-backend: "megamoe" + moe-a2a-backend: "deepep" disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake @@ -179,4 +196,3 @@ benchmark: concurrencies: "1024" req_rate: "inf" use_chat_template: false - custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-tp4-tp4-2-c1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-tp4-tp4-2-c1.yaml index a7c1ed0e9..1e6d8cc37 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-tp4-tp4-2-c1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-tp4-tp4-2-c1.yaml @@ -33,7 +33,7 @@ name: "disagg-gb300-1p1d-tp4-tp4-2-c1" model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260519-dbac4647" + container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" precision: "fp4" # See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin @@ -80,12 +80,14 @@ backend: prefill_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_DEFAULT_THINKING: "1" - SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" @@ -93,16 +95,17 @@ backend: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" - SGLANG_OPT_FP8_WO_A_GEMM: "0" decode_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_DEFAULT_THINKING: "1" - SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" @@ -110,7 +113,7 @@ backend: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" - SGLANG_OPT_FP8_WO_A_GEMM: "0" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 # is single-node only and corrupts results in 2-node decode setups. sglang_config: @@ -164,4 +167,3 @@ benchmark: concurrencies: "1" req_rate: "inf" use_chat_template: false - custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep16-8-c1024.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep16-8-c1024.yaml index 56c3f432d..2a7cf4d28 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep16-8-c1024.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep16-8-c1024.yaml @@ -33,7 +33,7 @@ name: "disagg-gb300-4p1d-dep4-dep16-8-c1024" model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260519-dbac4647" + container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" precision: "fp4" dynamo: @@ -74,14 +74,24 @@ backend: prefill_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_DEFAULT_THINKING: "1" - SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "8192" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + SGLANG_OPT_FP8_WO_A_GEMM: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" @@ -94,17 +104,24 @@ backend: SGLANG_LOG_FORWARD_ITERS: "1" SGLANG_LOG_MS: "1" SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" - SGLANG_OPT_FP8_WO_A_GEMM: "0" decode_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_DEFAULT_THINKING: "1" - SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1280" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" SGLANG_OPT_USE_ONLINE_COMPRESS: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" NCCL_MNNVL_ENABLE: "1" @@ -119,7 +136,7 @@ backend: SGLANG_LOG_FORWARD_ITERS: "1" SGLANG_LOG_MS: "1" SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" - SGLANG_OPT_FP8_WO_A_GEMM: "0" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 # is single-node only and corrupts results in 2-node decode setups. sglang_config: @@ -135,7 +152,7 @@ backend: expert-parallel-size: 4 enable-dp-attention: true - moe-a2a-backend: "megamoe" + moe-a2a-backend: "deepep" deepep-config: '{"normal_dispatch":{"num_sms":88,"num_max_nvl_chunked_send_tokens":28,"num_max_nvl_chunked_recv_tokens":512},"normal_combine": {"num_sms":88,"num_max_nvl_chunked_send_tokens":16,"num_max_nvl_chunked_recv_tokens":512}}' moe-dense-tp-size: 1 @@ -154,7 +171,7 @@ backend: stream-interval: 60 load-balance-method: "total_requests" - moe-a2a-backend: "megamoe" + moe-a2a-backend: "deepep" disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake @@ -179,4 +196,3 @@ benchmark: concurrencies: "1024" req_rate: "inf" use_chat_template: false - custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-8p1d-dep4-dep16-12-c4096.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-8p1d-dep4-dep16-12-c4096.yaml index eee88f200..cf7061eca 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-8p1d-dep4-dep16-12-c4096.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-8p1d-dep4-dep16-12-c4096.yaml @@ -33,7 +33,7 @@ name: "disagg-gb300-8p1d-dep4-dep16-12-c4096" model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260519-dbac4647" + container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" precision: "fp4" dynamo: @@ -74,14 +74,24 @@ backend: prefill_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_DEFAULT_THINKING: "1" - SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "8192" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + SGLANG_OPT_FP8_WO_A_GEMM: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" @@ -94,17 +104,24 @@ backend: SGLANG_LOG_FORWARD_ITERS: "1" SGLANG_LOG_MS: "1" SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" - SGLANG_OPT_FP8_WO_A_GEMM: "0" decode_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_DEFAULT_THINKING: "1" - SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1280" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" SGLANG_OPT_USE_ONLINE_COMPRESS: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" NCCL_MNNVL_ENABLE: "1" @@ -119,7 +136,7 @@ backend: SGLANG_LOG_FORWARD_ITERS: "1" SGLANG_LOG_MS: "1" SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" - SGLANG_OPT_FP8_WO_A_GEMM: "0" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 # is single-node only and corrupts results in 2-node decode setups. sglang_config: @@ -135,7 +152,7 @@ backend: expert-parallel-size: 4 enable-dp-attention: true - moe-a2a-backend: "megamoe" + moe-a2a-backend: "deepep" deepep-config: '{"normal_dispatch":{"num_sms":88,"num_max_nvl_chunked_send_tokens":28,"num_max_nvl_chunked_recv_tokens":512},"normal_combine": {"num_sms":88,"num_max_nvl_chunked_send_tokens":16,"num_max_nvl_chunked_recv_tokens":512}}' moe-dense-tp-size: 1 @@ -154,7 +171,7 @@ backend: stream-interval: 60 load-balance-method: "total_requests" - moe-a2a-backend: "megamoe" + moe-a2a-backend: "deepep" disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake @@ -179,4 +196,3 @@ benchmark: concurrencies: "4096" req_rate: "inf" use_chat_template: false - custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index c62d94781..3a626a695 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3043,3 +3043,8 @@ description: - "Update SGLang image from v0.5.11-cu130 (5d old) to v0.5.12-cu130" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1475 + +- config-keys: + - dsv4-fp4-gb300-dynamo-sglang + description: + - "Restore `sglang-staging:deepseek-v4-grace-blackwell-dev` image, `deepep` moe-a2a-backend, and previous env vars for all non-MTP disagg configs"