diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 418ad5ab9..96d70c7fa 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1350,8 +1350,9 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=2" + dsr1-fp4-mi355x-sglang-disagg: - image: lmsysorg/sglang-rocm:v0.5.10.post1-rocm720-mi35x-20260501 + image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg @@ -1540,6 +1541,25 @@ dsr1-fp4-mi355x-sglang-disagg: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=0" + # 1*DEP8 + 1*DEP8 + - spec-decoding: "none" + conc-list: [ 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + # 2*DEP8 + 1*DEP8 - spec-decoding: "none" conc-list: [ 1024, 2048, 4096 ] @@ -1560,7 +1580,7 @@ dsr1-fp4-mi355x-sglang-disagg: - "DECODE_MTP_SIZE=0" dsr1-fp4-mi355x-sglang-disagg-mtp: - image: lmsysorg/sglang-rocm:v0.5.10.post1-rocm720-mi35x-20260501 + image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg @@ -1730,24 +1750,43 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=2" - # 1P2D TP4 + # 1*DEP8 + 1*DEP8 - spec-decoding: "mtp" - conc-list: [ 64, 128, 256 ] + conc-list: [ 128, 512 ] prefill: num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false + tp: 8 + ep: 8 + dp-attn: true additional-settings: - "PREFILL_NODES=1" decode: - num-worker: 2 + num-worker: 1 tp: 8 - ep: 1 - dp-attn: false + ep: 8 + dp-attn: true additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=2" + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + + # 1*DEP8 + 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 64, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + 
additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" # 2*DEP8 + 1*DEP8 - spec-decoding: "mtp" diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index d0b99eddc..904576003 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -40,7 +40,8 @@ export NCCL_IB_HCA=$IBDEVICES export SGLANG_USE_AITER=1 export SGLANG_MORI_DISPATCH_DTYPE=auto -export SGLANG_MORI_FP8_COMB=true +export MORI_COMBINE_DTYPE_PREFILL=fp8_direct_cast +export MORI_COMBINE_DTYPE_DECODE=fp8 export SGLANG_MORI_QP_PER_TRANSFER=4 export SGLANG_MORI_NUM_WORKERS=4 export MORI_IO_SQ_BACKOFF_TIMEOUT_US=50000 @@ -59,7 +60,7 @@ export MORI_SHMEM_MODE=ISOLATION # Enable spec v2 export SGLANG_ENABLE_SPEC_V2=1 -export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1 +export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=0 export SGLANG_LOG_MS=true export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32 diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index bbe8de6aa..7eb7414a6 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -33,6 +33,9 @@ BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" +# Extract the largest value from the x-delimited BENCH_MAX_CONCURRENCY list (split on 'x', numeric sort, keep the last entry) +BENCH_MAX_CONC_VALUE=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) + # Dry Run for debugging purpose DRY_RUN="${DRY_RUN:-0}" @@ -184,6 +187,15 @@ else prefill_enable_two_batch_overlap="false" fi +# When both prefill DP and EP are enabled, override prefill max-running-requests with the max bench concurrency and recompute the prefill MoE max input tokens +if [[ "$PREFILL_ENABLE_DP" == "true" ]] && [[ "$PREFILL_ENABLE_EP" == "true" ]]; then + prefill_max_running_requests=$BENCH_MAX_CONC_VALUE + 
prefill_dp_ranks=$PREFILL_TP_SIZE + # MORI_MAX_DISPATCH_TOKENS_PREFILL stays at 8192 (no change) + MORI_MOE_MAX_INPUT_TOKENS_PREFILL=$((MORI_MAX_DISPATCH_TOKENS_PREFILL * prefill_dp_ranks / 2)) + echo "[DP+EP override] Prefill: max-running-requests=$prefill_max_running_requests, MOE_MAX_INPUT=$MORI_MOE_MAX_INPUT_TOKENS_PREFILL" +fi + # Compute DP-dependent decode parameters (3-way: DP > EP-only > no_dp) if [[ "$DECODE_ENABLE_DP" == "true" ]]; then decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_DP_START $DECODE_CUDA_GRAPH_BS_DP_END)) @@ -196,6 +208,18 @@ else decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP fi +# When both decode DP and EP are enabled, override max-running-requests and size per-rank dispatch tokens as ceil(max bench concurrency / DP ranks) so the value is never 0 and covers non-multiples +if [[ "$DECODE_ENABLE_DP" == "true" ]] && [[ "$DECODE_ENABLE_EP" == "true" ]]; then + decode_max_running_requests=$BENCH_MAX_CONC_VALUE + decode_dp_ranks=$DECODE_TP_SIZE + MORI_MAX_DISPATCH_TOKENS_DECODE=$(( (BENCH_MAX_CONC_VALUE + decode_dp_ranks - 1) / decode_dp_ranks )) + MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * decode_dp_ranks * 7 / 10)) + # Update derived variable + SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) + export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD + echo "[DP+EP override] Decode: max-running-requests=$decode_max_running_requests, DISPATCH_TOKENS=$MORI_MAX_DISPATCH_TOKENS_DECODE, MOE_MAX_INPUT=$MORI_MOE_MAX_INPUT_TOKENS_DECODE, INTER_KERNEL_SWITCH=$SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD" +fi + # Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS) PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} " if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then @@ -343,11 +367,6 @@ if [[ 
"${EVAL_ONLY:-false}" == "true" ]] || [[ "${RUN_EVAL:-false}" == "true" ]] DECODE_SERVER_CONFIG=$(echo "$DECODE_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g') unset MORI_MOE_MAX_INPUT_TOKENS_PREFILL unset MORI_MOE_MAX_INPUT_TOKENS_DECODE - # NOTE: that currently with fp8_combine set, the evals do not pass on InferenceX eval harness - # or on SGLang native harness for high concurrency 4k and gets no where near the golden score of - # 0.95 on even basic GSM8k grade school math as confirmed by @billishyahao from AMD - # and as confirmed by @Oseltamivir. This was initally merged with @billishyahao promising - # that an fast follow PR to fix the evals via having quant correction in the fp8 combine fi # ============================================================================= @@ -398,7 +417,7 @@ if [ "$NODE_RANK" -eq 0 ]; then PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}" fi set +x - PREFILL_CMD="${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ + PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ --model-path $MODEL_DIR/$MODEL_NAME \ --disaggregation-mode prefill \ --disaggregation-ib-device ${IBDEVICES} \ @@ -630,7 +649,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}" fi set +x - PREFILL_CMD="${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ + PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} 
SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ --model-path $MODEL_DIR/${MODEL_NAME} \ --disaggregation-mode prefill \ --disaggregation-ib-device ${IBDEVICES} \ @@ -698,7 +717,7 @@ else DECODE_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE}" fi set +x - DECODE_CMD="${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \ + DECODE_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_DECODE} ${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \ --model-path ${MODEL_DIR}/${MODEL_NAME} \ --disaggregation-mode decode \ --disaggregation-ib-device ${IBDEVICES} \ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 55e1d1243..f34b5741f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2449,6 +2449,15 @@ - "Update SGLang image from v0.5.10.post1-cu130 to v0.5.11-cu130" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1346 +- config-keys: + - dsr1-fp4-mi355x-sglang-disagg + - dsr1-fp4-mi355x-sglang-disagg-mtp + description: + - "Fix the eval result of dsr1 fp4 with fp8 blockwise combine" + - "Bump the image to May 19" + - "Add conc 512 new sweep point" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1566 + - config-keys: - kimik2.5-int4-h200-vllm description: