diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml index accb5e56f..0efd8c224 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml @@ -6,12 +6,12 @@ frontend: num_additional_frontends: 8 dynamo: - hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c" + hash: "81d0555ee23519cea80a42b4fe824e30368b7300" install: true model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e" + container: "lmsysorg/sglang:nightly-dev-20260522-c9153da5" precision: "mxfp4" sbatch_directives: @@ -31,14 +31,10 @@ backend: prefill_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_DEFAULT_THINKING: "1" SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" @@ -49,14 +45,10 @@ backend: decode_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_DEFAULT_THINKING: "1" SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml index 3e8fca87b..b502ec587 100644 --- 
a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml @@ -6,12 +6,12 @@ frontend: num_additional_frontends: 8 dynamo: - hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c" + hash: "81d0555ee23519cea80a42b4fe824e30368b7300" install: true model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e" + container: "lmsysorg/sglang:nightly-dev-20260522-c9153da5" precision: "mxfp4" sbatch_directives: @@ -31,24 +31,14 @@ backend: prefill_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_DEFAULT_THINKING: "1" SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" - SGLANG_OPT_USE_FAST_MASK_EP: "1" - SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" - SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" - SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" - SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" @@ -60,14 +50,10 @@ backend: decode_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_DEFAULT_THINKING: "1" SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" @@ -95,8 +81,7 @@ backend: enable-dp-attention: true enable-dp-lm-head: true - moe-a2a-backend: "deepep" - deepep-config: 
'{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + moe-a2a-backend: "megamoe" mem-fraction-static: 0.9 max-running-requests: 128 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml index 6745aa37e..5f4b21583 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml @@ -6,12 +6,12 @@ frontend: num_additional_frontends: 8 dynamo: - hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c" + hash: "81d0555ee23519cea80a42b4fe824e30368b7300" install: true model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e" + container: "lmsysorg/sglang:nightly-dev-20260522-c9153da5" precision: "mxfp4" sbatch_directives: @@ -33,24 +33,14 @@ backend: prefill_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_DEFAULT_THINKING: "1" SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" - SGLANG_OPT_USE_FAST_MASK_EP: "1" - SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" - SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" - SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" - SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" @@ -62,23 +52,13 @@ backend: decode_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_DEFAULT_THINKING: "1" SGLANG_DSV4_REASONING_EFFORT: 
"max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" - SGLANG_OPT_USE_FAST_MASK_EP: "1" - SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" - SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "2048" - SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" - SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" @@ -106,8 +86,7 @@ backend: enable-dp-attention: true enable-dp-lm-head: true - moe-a2a-backend: "deepep" - deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + moe-a2a-backend: "megamoe" mem-fraction-static: 0.9 max-running-requests: 256 @@ -131,8 +110,7 @@ backend: enable-dp-attention: true enable-dp-lm-head: true - moe-a2a-backend: "deepep" - deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + moe-a2a-backend: "megamoe" speculative-algo: "EAGLE" speculative-num-steps: 3 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml index 842fbb556..84c4d6443 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml @@ -6,12 +6,12 @@ frontend: num_additional_frontends: 8 dynamo: - hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c" + hash: "81d0555ee23519cea80a42b4fe824e30368b7300" install: true model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e" + container: "lmsysorg/sglang:nightly-dev-20260522-c9153da5" precision: "mxfp4" sbatch_directives: @@ -33,24 +33,14 @@ backend: prefill_environment: 
PYTHONUNBUFFERED: "1" - SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_DEFAULT_THINKING: "1" SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" - SGLANG_OPT_USE_FAST_MASK_EP: "1" - SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" - SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" - SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" - SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" @@ -62,23 +52,13 @@ backend: decode_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_DEFAULT_THINKING: "1" SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" - SGLANG_OPT_USE_FAST_MASK_EP: "1" - SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" - SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "2048" - SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" - SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" @@ -106,8 +86,7 @@ backend: enable-dp-attention: true enable-dp-lm-head: true - moe-a2a-backend: "deepep" - deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + moe-a2a-backend: "megamoe" mem-fraction-static: 0.9 max-running-requests: 256 @@ -131,8 +110,7 @@ backend: enable-dp-attention: true enable-dp-lm-head: true - moe-a2a-backend: "deepep" - deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + moe-a2a-backend: "megamoe" speculative-algo: "EAGLE" 
speculative-num-steps: 3 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml index 49b0d31c3..ea64f91ec 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml @@ -6,12 +6,12 @@ frontend: num_additional_frontends: 8 dynamo: - hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c" + hash: "81d0555ee23519cea80a42b4fe824e30368b7300" install: true model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e" + container: "lmsysorg/sglang:nightly-dev-20260522-c9153da5" precision: "mxfp4" sbatch_directives: @@ -33,24 +33,14 @@ backend: prefill_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_DEFAULT_THINKING: "1" SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" - SGLANG_OPT_USE_FAST_MASK_EP: "1" - SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" - SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" - SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" - SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" @@ -62,23 +52,13 @@ backend: decode_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_DEFAULT_THINKING: "1" SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - 
SGLANG_OPT_USE_TOPK_V2: "1" SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" - SGLANG_OPT_USE_FAST_MASK_EP: "1" - SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" - SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" - SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" - SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" @@ -106,8 +86,7 @@ backend: enable-dp-attention: true enable-dp-lm-head: true - moe-a2a-backend: "deepep" - deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + moe-a2a-backend: "megamoe" mem-fraction-static: 0.9 max-running-requests: 512 @@ -131,8 +110,7 @@ backend: enable-dp-attention: true enable-dp-lm-head: true - moe-a2a-backend: "deepep" - deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + moe-a2a-backend: "megamoe" speculative-algo: "EAGLE" speculative-num-steps: 3 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml index d907c369e..ad6c54b0e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml @@ -6,12 +6,12 @@ frontend: num_additional_frontends: 8 dynamo: - hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c" + hash: "81d0555ee23519cea80a42b4fe824e30368b7300" install: true model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e" + container: "lmsysorg/sglang:nightly-dev-20260522-c9153da5" precision: "mxfp4" sbatch_directives: @@ -33,24 +33,14 @@ backend: prefill_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_DEFAULT_THINKING: "1" 
SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" - SGLANG_OPT_USE_FAST_MASK_EP: "1" - SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" - SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" - SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" - SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" @@ -62,23 +52,13 @@ backend: decode_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_DEFAULT_THINKING: "1" SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" - SGLANG_OPT_USE_FAST_MASK_EP: "1" - SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" - SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" - SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" - SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" @@ -106,8 +86,7 @@ backend: enable-dp-attention: true enable-dp-lm-head: true - moe-a2a-backend: "deepep" - deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + moe-a2a-backend: "megamoe" mem-fraction-static: 0.9 max-running-requests: 1024 @@ -131,8 +110,7 @@ backend: enable-dp-attention: true enable-dp-lm-head: true - moe-a2a-backend: "deepep" - deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + moe-a2a-backend: "megamoe" speculative-algo: "EAGLE" speculative-num-steps: 3 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index f34b5741f..947190413 100644 --- 
a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3139,6 +3139,13 @@ - "Add --use-chat-template to run_benchmark_serving so prompts are formatted with the Qwen chat template (matching the other Qwen MTP recipes)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1555 +- config-keys: + - dsv4-fp4-gb300-dynamo-sglang-mtp + description: + - "Update SGLang image from nightly-dev-cu13-20260510-2473659e to nightly-dev-20260522-c9153da5; bump the dynamo hash from 34d55a596fb8d3d44daefe425ec1e303131f4d2c to 81d0555ee23519cea80a42b4fe824e30368b7300; switch moe-a2a-backend from deepep (removing deepep-config) to megamoe in the 1p6d low-latency and mid-curve 8k1k disagg recipes" + - "Clean up obsolete environment variables in the 8k1k disagg recipes: drop SGLANG_OPT_USE_JIT_NORM / SGLANG_OPT_USE_JIT_INDEXER_METADATA / SGLANG_OPT_USE_TOPK_V2 (now default-on); drop the auto-set MegaMoE companions (SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE, SGLANG_OPT_FIX_HASH_MEGA_MOE, SGLANG_OPT_FIX_MEGA_MOE_MEMORY, SGLANG_OPT_FIX_NEXTN_MEGA_MOE, SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK); drop SGLANG_RADIX_DISABLE_REUSE / SGLANG_OPT_USE_FAST_MASK_EP, which no longer exist in sglang environ.py" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1559 + - config-keys: - minimaxm2.5-fp8-h200-vllm description: