Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 51 additions & 12 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1350,8 +1350,9 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=2"


dsr1-fp4-mi355x-sglang-disagg:
image: lmsysorg/sglang-rocm:v0.5.10.post1-rocm720-mi35x-20260501
image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519
model: amd/DeepSeek-R1-0528-MXFP4-v2
model-prefix: dsr1
runner: mi355x-disagg
Expand Down Expand Up @@ -1540,6 +1541,25 @@ dsr1-fp4-mi355x-sglang-disagg:
- "DECODE_NODES=2"
- "DECODE_MTP_SIZE=0"

# 1*DEP8 + 1*DEP8
- spec-decoding: "none"
conc-list: [ 128, 256, 512 ]
prefill:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=0"

# 2*DEP8 + 1*DEP8
- spec-decoding: "none"
conc-list: [ 1024, 2048, 4096 ]
Expand All @@ -1560,7 +1580,7 @@ dsr1-fp4-mi355x-sglang-disagg:
- "DECODE_MTP_SIZE=0"

dsr1-fp4-mi355x-sglang-disagg-mtp:
image: lmsysorg/sglang-rocm:v0.5.10.post1-rocm720-mi35x-20260501
image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519
model: amd/DeepSeek-R1-0528-MXFP4-v2
model-prefix: dsr1
runner: mi355x-disagg
Expand Down Expand Up @@ -1730,24 +1750,43 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
- "DECODE_NODES=2"
- "DECODE_MTP_SIZE=2"

# 1P2D TP4
# 1*DEP8 + 1*DEP8
- spec-decoding: "mtp"
conc-list: [ 64, 128, 256 ]
conc-list: [ 128, 512 ]
prefill:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 2
num-worker: 1
tp: 8
ep: 1
dp-attn: false
ep: 8
dp-attn: true
additional-settings:
- "DECODE_NODES=2"
- "DECODE_MTP_SIZE=2"
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=1"

# 1*DEP8 + 1*DEP8
- spec-decoding: "mtp"
conc-list: [ 64, 256 ]
prefill:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=1"

# 2*DEP8 + 1*DEP8
- spec-decoding: "mtp"
Expand Down
5 changes: 3 additions & 2 deletions benchmarks/multi_node/amd_utils/env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ export NCCL_IB_HCA=$IBDEVICES
export SGLANG_USE_AITER=1

export SGLANG_MORI_DISPATCH_DTYPE=auto
export SGLANG_MORI_FP8_COMB=true
export MORI_COMBINE_DTYPE_PREFILL=fp8_direct_cast
export MORI_COMBINE_DTYPE_DECODE=fp8
export SGLANG_MORI_QP_PER_TRANSFER=4
export SGLANG_MORI_NUM_WORKERS=4
export MORI_IO_SQ_BACKOFF_TIMEOUT_US=50000
Expand All @@ -59,7 +60,7 @@ export MORI_SHMEM_MODE=ISOLATION

# Enable spec v2
export SGLANG_ENABLE_SPEC_V2=1
export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1
export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=0

export SGLANG_LOG_MS=true
export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32
Expand Down
35 changes: 27 additions & 8 deletions benchmarks/multi_node/amd_utils/server.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}"
BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}"
BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}"

# BENCH_MAX_CONCURRENCY may carry several concurrency values joined by 'x'.
# Put each value on its own line, sort them numerically, and keep the last
# (largest) entry as the single maximum used further down.
BENCH_MAX_CONC_VALUE=$(
  echo "$BENCH_MAX_CONCURRENCY" \
    | tr 'x' '\n' \
    | sort -n \
    | tail -n 1
)

# Dry Run for debugging purpose
DRY_RUN="${DRY_RUN:-0}"

Expand Down Expand Up @@ -184,6 +187,15 @@ else
prefill_enable_two_batch_overlap="false"
fi

# Prefill override for the combined DP + EP case: the running-request limit is
# replaced by the largest benchmark concurrency, and the MoE input-token cap is
# recomputed as (per-rank dispatch limit * number of DP ranks) / 2, where the
# number of DP ranks is taken from PREFILL_TP_SIZE.
if [[ "$PREFILL_ENABLE_DP" == "true" && "$PREFILL_ENABLE_EP" == "true" ]]; then
  prefill_dp_ranks=$PREFILL_TP_SIZE
  prefill_max_running_requests=$BENCH_MAX_CONC_VALUE
  # MORI_MAX_DISPATCH_TOKENS_PREFILL is deliberately not reassigned here
  # (per the original note it stays at 8192).
  MORI_MOE_MAX_INPUT_TOKENS_PREFILL=$(( MORI_MAX_DISPATCH_TOKENS_PREFILL * prefill_dp_ranks / 2 ))
  echo "[DP+EP override] Prefill: max-running-requests=$prefill_max_running_requests, MOE_MAX_INPUT=$MORI_MOE_MAX_INPUT_TOKENS_PREFILL"
fi

# Compute DP-dependent decode parameters (3-way: DP > EP-only > no_dp)
if [[ "$DECODE_ENABLE_DP" == "true" ]]; then
decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_DP_START $DECODE_CUDA_GRAPH_BS_DP_END))
Expand All @@ -196,6 +208,18 @@ else
decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP
fi

# Decode override for the combined DP + EP case. Everything is derived from the
# largest benchmark concurrency and the number of DP ranks (DECODE_TP_SIZE):
#   - running-request limit   = max concurrency
#   - per-rank dispatch limit = max concurrency / ranks (integer division)
#   - MoE input-token cap     = dispatch limit * ranks * 7 / 10
#   - inter-kernel switch threshold = dispatch limit * 2 (exported)
if [[ "$DECODE_ENABLE_DP" == "true" && "$DECODE_ENABLE_EP" == "true" ]]; then
  decode_dp_ranks=$DECODE_TP_SIZE
  decode_max_running_requests=$BENCH_MAX_CONC_VALUE
  MORI_MAX_DISPATCH_TOKENS_DECODE=$(( BENCH_MAX_CONC_VALUE / decode_dp_ranks ))
  MORI_MOE_MAX_INPUT_TOKENS_DECODE=$(( MORI_MAX_DISPATCH_TOKENS_DECODE * decode_dp_ranks * 7 / 10 ))
  # Keep the derived threshold in step with the new dispatch limit and make it
  # visible to child processes.
  export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$(( MORI_MAX_DISPATCH_TOKENS_DECODE * 2 ))
  echo "[DP+EP override] Decode: max-running-requests=$decode_max_running_requests, DISPATCH_TOKENS=$MORI_MAX_DISPATCH_TOKENS_DECODE, MOE_MAX_INPUT=$MORI_MOE_MAX_INPUT_TOKENS_DECODE, INTER_KERNEL_SWITCH=$SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD"
fi

# Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS)
PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} "
if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then
Expand Down Expand Up @@ -343,11 +367,6 @@ if [[ "${EVAL_ONLY:-false}" == "true" ]] || [[ "${RUN_EVAL:-false}" == "true" ]]
DECODE_SERVER_CONFIG=$(echo "$DECODE_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g')
unset MORI_MOE_MAX_INPUT_TOKENS_PREFILL
unset MORI_MOE_MAX_INPUT_TOKENS_DECODE
# NOTE: currently, with fp8_combine set, the evals do not pass on the InferenceX eval harness
# or on the SGLang native harness for high-concurrency 4k, and get nowhere near the golden score of
# 0.95 on even basic GSM8K grade-school math, as confirmed by @billishyahao from AMD
# and by @Oseltamivir. This was initially merged with @billishyahao promising
# a fast-follow PR to fix the evals by adding quant correction to the fp8 combine.
fi

# =============================================================================
Expand Down Expand Up @@ -398,7 +417,7 @@ if [ "$NODE_RANK" -eq 0 ]; then
PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}"
fi
set +x
PREFILL_CMD="${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
--model-path $MODEL_DIR/$MODEL_NAME \
--disaggregation-mode prefill \
--disaggregation-ib-device ${IBDEVICES} \
Expand Down Expand Up @@ -630,7 +649,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then
PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}"
fi
set +x
PREFILL_CMD="${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
--model-path $MODEL_DIR/${MODEL_NAME} \
--disaggregation-mode prefill \
--disaggregation-ib-device ${IBDEVICES} \
Expand Down Expand Up @@ -698,7 +717,7 @@ else
DECODE_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE}"
fi
set +x
DECODE_CMD="${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \
DECODE_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_DECODE} ${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \
--model-path ${MODEL_DIR}/${MODEL_NAME} \
--disaggregation-mode decode \
--disaggregation-ib-device ${IBDEVICES} \
Expand Down
9 changes: 9 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2449,6 +2449,15 @@
- "Update SGLang image from v0.5.10.post1-cu130 to v0.5.11-cu130"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1346

- config-keys:
- dsr1-fp4-mi355x-sglang-disagg
- dsr1-fp4-mi355x-sglang-disagg-mtp
description:
- "Fix the eval result of dsr1 fp4 with fp8 blockwise combine"
- "Bump the image to May 19"
- "Add conc 512 new sweep point"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1566

- config-keys:
- kimik2.5-int4-h200-vllm
description:
Expand Down
Loading