From fbddaa30d0fa5405779e55839a39081b926d20cd Mon Sep 17 00:00:00 2001 From: thomawan Date: Wed, 27 May 2026 10:12:59 +0800 Subject: [PATCH 1/7] Update config --- .github/configs/amd-master.yaml | 6 +++--- benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 418ad5ab9..bea585dab 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1774,7 +1774,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: # image tag, so bumping sglang is just an image tag bump here. Sweeps # DP-attention on/off and EP=8. dsv4-fp4-mi355x-sglang: - image: rocm/sgl-dev:rocm720-mi35x-8c3b5aa-20260521-DSv4 + image: rocm/sgl-dev:rocm720-mi35x-f96ac98-20260526-DSv4 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: mi355x @@ -1786,12 +1786,12 @@ dsv4-fp4-mi355x-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 1024 } + - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 4096 } - { tp: 8, dp-attn: false, conc-start: 1 , conc-end: 32 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 512 } + - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 2048 } - { tp: 8, dp-attn: false, conc-start: 1, conc-end: 32 } # DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm diff --git a/benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh index 571132a54..b26fa6033 100755 --- a/benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh @@ -116,6 +116,8 @@ python3 -m sglang.launch_server \ --disable-radix-cache \ --attention-backend compressed \ --max-running-requests ${CONC} \ + --mem-fraction-static 0.90 \ + --swa-full-tokens-ratio 0.1 \ --page-size 256 \ --context-length $MAX_MODEL_LEN \ --chunked-prefill-size 8192 \ From f3870db3865f97247365da81f4e811bc727d435b Mon Sep 17 00:00:00 2001 From: thomawan Date: Wed, 27 May 2026 10:24:41 +0800 Subject: [PATCH 2/7] Update change log --- perf-changelog.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 614b6104e..4838ddf5e 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3137,3 +3137,9 @@ - "Set vLLM serving knobs in benchmarks/single_node/minimaxm2.5_fp8_h200.sh: generated benchmark max-model-len, previous eval max-model-len handling, fp8 KV cache, FlashInfer attention/autotune, Triton MoE, and MiniMax QK norm fusion" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1354 +- config-keys: + - dsv4-fp4-mi355x-sglang + description: + - "Bump image to rocm/sgl-dev:rocm720-mi35x-f96ac98-20260526-DSv4" + - "Add args to avoid kvcache pool full issue on high conc" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1568 From 11daefb38650b271ccafc800ebbec8344f21e969 Mon Sep 17 00:00:00 2001 From: thomawan Date: Wed, 27 May 2026 10:29:04 +0800 Subject: [PATCH 3/7] Update change log --- perf-changelog.yaml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index a3c697a75..e84cb9688 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3138,16 +3138,15 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1354 - config-keys: -<<<<<<< dsv4-mi355-sgl-0526 - dsv4-fp4-mi355x-sglang description: - "Bump image to rocm/sgl-dev:rocm720-mi35x-f96ac98-20260526-DSv4" - "Add args to avoid kvcache pool full issue on high conc" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1568 -======= + +- config-keys: - qwen3.5-fp8-h200-sglang - dsr1-fp8-mi355x-sglang description: - "Validates measured-power aggregation pipeline (PR #1558) on both NVIDIA (H200) and AMD (MI355X) hardware — different SMI tools (nvidia-smi vs amd-smi), different CSV schemas (power.draw [W] vs socket_power), same aggregator. No config change. Entry intentionally kept past merge so run-sweep produces canonical agg JSONs with avg_power_w + joules_per_output_token on main for both vendors, seeding the dashboard's day-zero data." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1558 ->>>>>>> main From 44f19c08e765ed3282238c13e1defa5306296e6b Mon Sep 17 00:00:00 2001 From: thomawan Date: Wed, 27 May 2026 12:57:27 +0800 Subject: [PATCH 4/7] Update config --- benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh index b26fa6033..ec9ebce02 100755 --- a/benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh @@ -117,7 +117,7 @@ python3 -m sglang.launch_server \ --attention-backend compressed \ --max-running-requests ${CONC} \ --mem-fraction-static 0.90 \ - --swa-full-tokens-ratio 0.1 \ + --swa-full-tokens-ratio 0.2 \ --page-size 256 \ --context-length $MAX_MODEL_LEN \ --chunked-prefill-size 8192 \ From a618b9e76659290c62a0aee884cfb89df84140b0 Mon Sep 17 00:00:00 2001 From: thomawan Date: Wed, 27 May 2026 12:58:31 +0800 Subject: [PATCH 5/7] Update config --- .github/configs/amd-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index bea585dab..fc4ba15a7 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1786,7 +1786,7 @@ dsv4-fp4-mi355x-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 4096 } + - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 2048 } - { tp: 8, dp-attn: false, conc-start: 1 , conc-end: 32 } - isl: 8192 osl: 1024 From 6deed239b272e943e00bc8a34a1de63cc2e3fea3 Mon Sep 17 00:00:00 2001 From: thomawan Date: Wed, 27 May 2026 13:54:25 +0800 Subject: [PATCH 6/7] Update config --- benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh index ec9ebce02..3a307ed0b 100755 --- a/benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh @@ -117,7 +117,7 @@ python3 -m sglang.launch_server \ --attention-backend compressed \ --max-running-requests ${CONC} \ --mem-fraction-static 0.90 \ - --swa-full-tokens-ratio 0.2 \ + --swa-full-tokens-ratio 0.15 \ --page-size 256 \ --context-length $MAX_MODEL_LEN \ --chunked-prefill-size 8192 \ From 3ad3afeb8039f107cc625021d73fc8908af7df36 Mon Sep 17 00:00:00 2001 From: thomawan Date: Wed, 27 May 2026 14:03:42 +0800 Subject: [PATCH 7/7] Update config --- benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh index 3a307ed0b..a4976bdb0 100755 --- a/benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh @@ -77,7 +77,7 @@ export SGLANG_FORCE_TRITON_MOE_FP8=0 export SGLANG_HACK_FLASHMLA_BACKEND=triton export SGLANG_OPT_USE_TILELANG_INDEXER=true export SGLANG_OPT_USE_TRITON_SWA_PREPARE=true -export AITER_BF16_FP8_MOE_BOUND=1 +export AITER_BF16_FP8_MOE_BOUND=0 export SGLANG_OPT_FUSE_WQA_WKV=true export SGLANG_OPT_USE_FUSED_PAGED_COMPRESS=true export SGLANG_OPT_USE_MULTI_STREAM_OVERLAP=0