diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 43b8be880..4a683d372 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -8759,7 +8759,7 @@ dsv4-fp4-gb300-dynamo-vllm: dp-attn: true dsv4-fp4-gb300-dynamo-sglang: - image: lmsysorg/sglang:nightly-dev-cu13-20260519-dbac4647 + image: lmsysorg/sglang:nightly-dev-cu13-20260520-425dffbd model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: gb300-cw diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-10p1d-dep4-dep16-14-c8192.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-10p1d-dep4-dep16-14-c8192.yaml index eca9a5d7d..bfd5f6ca3 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-10p1d-dep4-dep16-14-c8192.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-10p1d-dep4-dep16-14-c8192.yaml @@ -33,7 +33,7 @@ name: "disagg-gb300-10p1d-dep4-dep16-14-c8192" model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260519-dbac4647" + container: "lmsysorg/sglang:nightly-dev-cu13-20260520-425dffbd" precision: "fp4" dynamo: @@ -94,7 +94,6 @@ backend: SGLANG_LOG_FORWARD_ITERS: "1" SGLANG_LOG_MS: "1" SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" - SGLANG_OPT_FP8_WO_A_GEMM: "0" decode_environment: PYTHONUNBUFFERED: "1" @@ -119,7 +118,6 @@ backend: SGLANG_LOG_FORWARD_ITERS: "1" SGLANG_LOG_MS: "1" SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" - SGLANG_OPT_FP8_WO_A_GEMM: "0" # is single-node only and corrupts results in 2-node decode setups. sglang_config: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep12-15-c21504.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep12-15-c21504.yaml index 70f791e78..7978a5fde 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep12-15-c21504.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep12-15-c21504.yaml @@ -33,7 +33,7 @@ name: "disagg-gb300-12p1d-dep4-dep12-15-c21504" model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260519-dbac4647" + container: "lmsysorg/sglang:nightly-dev-cu13-20260520-425dffbd" precision: "fp4" dynamo: @@ -94,7 +94,6 @@ backend: SGLANG_LOG_FORWARD_ITERS: "1" SGLANG_LOG_MS: "1" SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" - SGLANG_OPT_FP8_WO_A_GEMM: "0" decode_environment: PYTHONUNBUFFERED: "1" @@ -119,7 +118,6 @@ backend: SGLANG_LOG_FORWARD_ITERS: "1" SGLANG_LOG_MS: "1" SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" - SGLANG_OPT_FP8_WO_A_GEMM: "0" # is single-node only and corrupts results in 2-node decode setups. sglang_config: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep16-5-c1024.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep16-5-c1024.yaml index 49cc976ee..a4f55ca71 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep16-5-c1024.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep16-5-c1024.yaml @@ -33,7 +33,7 @@ name: "disagg-gb300-1p1d-dep4-dep16-5-c1024" model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260519-dbac4647" + container: "lmsysorg/sglang:nightly-dev-cu13-20260520-425dffbd" precision: "fp4" dynamo: @@ -94,7 +94,6 @@ backend: SGLANG_LOG_FORWARD_ITERS: "1" SGLANG_LOG_MS: "1" SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" - SGLANG_OPT_FP8_WO_A_GEMM: "0" decode_environment: PYTHONUNBUFFERED: "1" @@ -119,7 +118,6 @@ backend: SGLANG_LOG_FORWARD_ITERS: "1" SGLANG_LOG_MS: "1" SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" - SGLANG_OPT_FP8_WO_A_GEMM: "0" # is single-node only and corrupts results in 2-node decode setups. sglang_config: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-tp4-tp4-2-c1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-tp4-tp4-2-c1.yaml index a7c1ed0e9..52848627f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-tp4-tp4-2-c1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-tp4-tp4-2-c1.yaml @@ -33,7 +33,7 @@ name: "disagg-gb300-1p1d-tp4-tp4-2-c1" model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260519-dbac4647" + container: "lmsysorg/sglang:nightly-dev-cu13-20260520-425dffbd" precision: "fp4" # See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin @@ -93,7 +93,6 @@ backend: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" - SGLANG_OPT_FP8_WO_A_GEMM: "0" decode_environment: PYTHONUNBUFFERED: "1" @@ -110,7 +109,6 @@ backend: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" - SGLANG_OPT_FP8_WO_A_GEMM: "0" # is single-node only and corrupts results in 2-node decode setups. sglang_config: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep16-8-c1024.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep16-8-c1024.yaml index 56c3f432d..becb97bfb 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep16-8-c1024.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep16-8-c1024.yaml @@ -33,7 +33,7 @@ name: "disagg-gb300-4p1d-dep4-dep16-8-c1024" model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260519-dbac4647" + container: "lmsysorg/sglang:nightly-dev-cu13-20260520-425dffbd" precision: "fp4" dynamo: @@ -94,7 +94,6 @@ backend: SGLANG_LOG_FORWARD_ITERS: "1" SGLANG_LOG_MS: "1" SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" - SGLANG_OPT_FP8_WO_A_GEMM: "0" decode_environment: PYTHONUNBUFFERED: "1" @@ -119,7 +118,6 @@ backend: SGLANG_LOG_FORWARD_ITERS: "1" SGLANG_LOG_MS: "1" SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" - SGLANG_OPT_FP8_WO_A_GEMM: "0" # is single-node only and corrupts results in 2-node decode setups. sglang_config: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-8p1d-dep4-dep16-12-c4096.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-8p1d-dep4-dep16-12-c4096.yaml index eee88f200..2a35dea5b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-8p1d-dep4-dep16-12-c4096.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-8p1d-dep4-dep16-12-c4096.yaml @@ -33,7 +33,7 @@ name: "disagg-gb300-8p1d-dep4-dep16-12-c4096" model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260519-dbac4647" + container: "lmsysorg/sglang:nightly-dev-cu13-20260520-425dffbd" precision: "fp4" dynamo: @@ -94,7 +94,6 @@ backend: SGLANG_LOG_FORWARD_ITERS: "1" SGLANG_LOG_MS: "1" SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" - SGLANG_OPT_FP8_WO_A_GEMM: "0" decode_environment: PYTHONUNBUFFERED: "1" @@ -119,7 +118,6 @@ backend: SGLANG_LOG_FORWARD_ITERS: "1" SGLANG_LOG_MS: "1" SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" - SGLANG_OPT_FP8_WO_A_GEMM: "0" # is single-node only and corrupts results in 2-node decode setups. sglang_config: diff --git a/perf-changelog.yaml b/perf-changelog.yaml index c62d94781..9c4894c47 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3035,7 +3035,14 @@ - "Bump ATOM image to rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511" - "TP=4 shows +3.2% to +16.3% throughput improvement across 1k1k and 8k1k workloads (concurrency 4-256)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1411 - + +- config-keys: + - dsv4-fp4-gb300-dynamo-sglang + description: + - "Update SGLang image from nightly-dev-cu13-20260519-dbac4647 to nightly-dev-cu13-20260520-425dffbd for all non-MTP disagg configs" + - "Remove SGLANG_OPT_FP8_WO_A_GEMM=0 workaround (topk_v2 crash fixed upstream in sgl-project/sglang#25805)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1528 + - config-keys: - qwen3.5-fp4-b300-sglang