From dd551f3b7144673127076fab687f91c7a1af7086 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Wed, 20 May 2026 23:00:30 -0700 Subject: [PATCH 01/37] fix(profile): upload staged trace path --- .github/workflows/profile.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/profile.yml b/.github/workflows/profile.yml index 8152d47a5..cd2a1d24d 100644 --- a/.github/workflows/profile.yml +++ b/.github/workflows/profile.yml @@ -171,7 +171,7 @@ jobs: exit 1 fi - trace_path="profile_${res_name}.trace.json.gz" + trace_path="/workspace/profile_${res_name}.trace.json.gz" if [ -f "$trace_path" ]; then echo "trace=$trace_path" >> "$GITHUB_OUTPUT" if [ "${FRAMEWORK}" = "sglang" ]; then @@ -206,7 +206,7 @@ jobs: uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: profile_${{ env.RESULT_FILENAME }} - path: profile_${{ env.RESULT_FILENAME }}.trace.json.gz + path: ${{ steps.run.outputs.trace }} if-no-files-found: ignore - name: Upload TP-0-DECODE trace as artifact From 7c0cb092f81e7f8d248b737fb34ba3351721bcca Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Wed, 20 May 2026 23:01:33 -0700 Subject: [PATCH 02/37] feat(profile): add B300 DeepSeek V4 Flash config --- .github/configs/nvidia-master.yaml | 18 ++++++++++++++++++ runners/launch_b300-nv.sh | 2 +- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 4a683d372..904fa19e3 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2038,6 +2038,24 @@ dsv4-fp4-b300-sglang: - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 } +# Targeted single-point Flash config for profile.yml. Keep the existing Pro +# sweep entry above unchanged; this profile-only key reuses the same B300 +# SGLang launch path at the 1k1k, conc=64 point. +dsv4-flash-fp4-b300-sglang: + image: lmsysorg/sglang:deepseek-v4-b300@sha256:2fec8d7958bb0d53b50d7bf04d6ae6a7de8a35503775826e0550a45dd8c3ee15 + model: deepseek-ai/DeepSeek-V4-Flash + model-prefix: dsv4 + runner: b300 + precision: fp4 + framework: sglang + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 64, conc-end: 64 } + # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by # DP_ATTENTION: diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index cca8b4ab0..fcc630db9 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -287,7 +287,7 @@ else HF_HUB_CACHE_MOUNT="/data/models" if [[ "$MODEL" == "Qwen/Qwen3.5-397B-A17B-FP8" ]]; then export MODEL="$HF_HUB_CACHE_MOUNT/${MODEL#*/}" - elif [[ "$MODEL_PREFIX" == "dsv4" ]]; then + elif [[ "$MODEL_PREFIX" == "dsv4" && "$MODEL" == "deepseek-ai/DeepSeek-V4-Pro" ]]; then export MODEL="$HF_HUB_CACHE_MOUNT/dsv4-pro" fi SQUASH_FILE="/data/home/sa-shared/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" From e2639e50340d871e40fdca015549862892fc5dc6 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Wed, 20 May 2026 23:28:25 -0700 Subject: [PATCH 03/37] fix(profile): stage relay trace in checkout --- .github/workflows/profile.yml | 2 +- benchmarks/benchmark_lib.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/profile.yml b/.github/workflows/profile.yml index cd2a1d24d..ac6d66bd9 100644 --- a/.github/workflows/profile.yml +++ b/.github/workflows/profile.yml @@ -171,7 +171,7 @@ jobs: exit 1 fi - trace_path="/workspace/profile_${res_name}.trace.json.gz" + trace_path="profile_${res_name}.trace.json.gz" if [ -f "$trace_path" ]; then echo "trace=$trace_path" >> "$GITHUB_OUTPUT" if [ "${FRAMEWORK}" = "sglang" ]; then diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index cfd30cd04..23e3e016f 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -508,7 +508,7 @@ move_profile_trace_for_relay() { return 0 fi - local dest_trace="/workspace/profile_${RESULT_FILENAME}.trace.json.gz" + local dest_trace="$PWD/profile_${RESULT_FILENAME}.trace.json.gz" if [[ "$trace_file" == *.gz ]]; then cp -f "$trace_file" "$dest_trace" else From 5e87c8c1e9d66998a92a23fcb52e27da681d9305 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Wed, 20 May 2026 23:42:33 -0700 Subject: [PATCH 04/37] feat(profile): add Flash DEP MTP3 profile --- .github/configs/nvidia-master.yaml | 18 ++++++++++++++++++ .../single_node/dsv4_fp4_b300_sglang_mtp.sh | 10 ++++++++-- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 904fa19e3..bee175be0 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2056,6 +2056,24 @@ dsv4-flash-fp4-b300-sglang: search-space: - { tp: 4, ep: 1, conc-start: 64, conc-end: 64 } +# Targeted Flash MTP profile: DEP4 at the same 1k1k conc=64 point as the +# non-MTP Flash profile above. The shared SGLang MTP launcher selects the +# Flash-only (steps=2, draft-tokens=3) speculative settings for this model. +dsv4-flash-fp4-b300-sglang-mtp: + image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 + model: deepseek-ai/DeepSeek-V4-Flash + model-prefix: dsv4 + runner: b300 + precision: fp4 + framework: sglang + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 64, spec-decoding: mtp } + # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by # DP_ATTENTION: diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh index 03102778d..b7aad47f7 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh @@ -77,11 +77,17 @@ if [ "${DP_ATTENTION}" = "true" ]; then export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 + SPECULATIVE_NUM_STEPS=1 + SPECULATIVE_NUM_DRAFT_TOKENS=2 + if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then + SPECULATIVE_NUM_STEPS=2 + SPECULATIVE_NUM_DRAFT_TOKENS=3 + fi SPEC_FLAGS=( --speculative-algorithm EAGLE - --speculative-num-steps 1 + --speculative-num-steps "$SPECULATIVE_NUM_STEPS" --speculative-eagle-topk 1 - --speculative-num-draft-tokens 2 + --speculative-num-draft-tokens "$SPECULATIVE_NUM_DRAFT_TOKENS" ) PARALLEL_ARGS=( --dp-size "$TP" From b00f85508bccea78beabf6c2a910fb97f39f7b13 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Wed, 20 May 2026 23:45:26 -0700 Subject: [PATCH 05/37] fix(profile): push traces with repo token --- .github/workflows/profile.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/profile.yml b/.github/workflows/profile.yml index ac6d66bd9..b99f25d1f 100644 --- a/.github/workflows/profile.yml +++ b/.github/workflows/profile.yml @@ -240,7 +240,7 @@ jobs: repository: SemiAnalysisAI/InferenceX-trace-storage path: storage ref: master - ssh-key: ${{ secrets.PROFILER_STORAGE_DEPLOY_KEY }} + token: ${{ secrets.REPO_PAT }} fetch-depth: 0 - name: Push profile to storage repo From a8df66c8cf97f25eed16cb671f1b23cf11eedc61 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Thu, 21 May 2026 00:09:33 -0700 Subject: [PATCH 06/37] fix(profile): align Flash MTP profiling steps --- benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh index b7aad47f7..04cbb6fd1 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh @@ -80,7 +80,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then SPECULATIVE_NUM_STEPS=1 SPECULATIVE_NUM_DRAFT_TOKENS=2 if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then - SPECULATIVE_NUM_STEPS=2 + SPECULATIVE_NUM_STEPS=3 SPECULATIVE_NUM_DRAFT_TOKENS=3 fi SPEC_FLAGS=( @@ -117,6 +117,11 @@ else MAX_RUNNING_REQUESTS="$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" fi +PROFILE_ARGS=() +if [[ "${PROFILE:-}" == "1" && "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then + PROFILE_ARGS+=(--num-continuous-decode-steps 2) +fi + # Print all SGLANG_* env vars to both the CI step log and server.log so the # launch config is auditable from the result artifact alone. { @@ -138,7 +143,8 @@ PYTHONNOUSERSITE=1 sglang serve \ --mem-fraction-static "$MEM_FRACTION_STATIC" \ --swa-full-tokens-ratio 0.1 \ "${SPEC_FLAGS[@]}" \ - "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 & + "${PARALLEL_ARGS[@]}" \ + "${PROFILE_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 & SERVER_PID=$! From e78383e8422b8d7b71401fca34148ca789d0394b Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Thu, 21 May 2026 00:11:44 -0700 Subject: [PATCH 07/37] fix(profile): capture two SGL profiling steps --- benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh | 8 +------- utils/bench_serving/benchmark_serving.py | 2 +- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh index 04cbb6fd1..0ce1f016f 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh @@ -117,11 +117,6 @@ else MAX_RUNNING_REQUESTS="$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" fi -PROFILE_ARGS=() -if [[ "${PROFILE:-}" == "1" && "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then - PROFILE_ARGS+=(--num-continuous-decode-steps 2) -fi - # Print all SGLANG_* env vars to both the CI step log and server.log so the # launch config is auditable from the result artifact alone. { @@ -143,8 +138,7 @@ PYTHONNOUSERSITE=1 sglang serve \ --mem-fraction-static "$MEM_FRACTION_STATIC" \ --swa-full-tokens-ratio 0.1 \ "${SPEC_FLAGS[@]}" \ - "${PARALLEL_ARGS[@]}" \ - "${PROFILE_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 & + "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/utils/bench_serving/benchmark_serving.py b/utils/bench_serving/benchmark_serving.py index 011b413ac..0a79033fe 100644 --- a/utils/bench_serving/benchmark_serving.py +++ b/utils/bench_serving/benchmark_serving.py @@ -538,7 +538,7 @@ async def warmup_limited_req_fn(): api_url=base_url + "/start_profile", prompt_len=test_prompt_len, output_len=test_output_len, - extra_body={"num_steps": 1, "merge_profiles": True, "profile_by_stage": True}, + extra_body={"num_steps": 2, "merge_profiles": True, "profile_by_stage": True}, logprobs=logprobs, best_of=best_of, multi_modal_content=test_mm_content, From e3393afe4b3332d5b9c797a186884412570cccff Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Thu, 21 May 2026 10:41:02 -0700 Subject: [PATCH 08/37] fix(profile): enable B300 Flash vLLM traces --- .github/configs/nvidia-master.yaml | 19 ++++++++++++++++++- benchmarks/single_node/dsv4_fp4_b300_vllm.sh | 9 +++++++++ .../single_node/dsv4_fp4_b300_vllm_mtp.sh | 9 +++++++++ 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index bee175be0..5d98e5cf5 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2056,9 +2056,26 @@ dsv4-flash-fp4-b300-sglang: search-space: - { tp: 4, ep: 1, conc-start: 64, conc-end: 64 } +# Targeted single-point Flash vLLM profile matching the SGLang profile point +# above. Keep this narrow so profile.yml dispatches only the intended 1k1k run. +dsv4-flash-fp4-b300-vllm: + image: vllm/vllm-openai:v0.21.0 + model: deepseek-ai/DeepSeek-V4-Flash + model-prefix: dsv4 + runner: b300 + precision: fp4 + framework: vllm + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 64, conc-end: 64 } + # Targeted Flash MTP profile: DEP4 at the same 1k1k conc=64 point as the # non-MTP Flash profile above. The shared SGLang MTP launcher selects the -# Flash-only (steps=2, draft-tokens=3) speculative settings for this model. +# Flash-only (steps=3, draft-tokens=3) speculative settings for this model. dsv4-flash-fp4-b300-sglang-mtp: image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 model: deepseek-ai/DeepSeek-V4-Flash diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm.sh index 92d4bf4ad..8bf458ae4 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_vllm.sh @@ -47,6 +47,14 @@ if [ "${DP_ATTENTION}" = "true" ]; then MOE_ARGS=(--moe-backend deep_gemm_mega_moe) fi +PROFILE_ARGS=() +if [[ "${PROFILE:-}" == "1" ]]; then + PROFILE_ARGS=( + --profiler-config + "{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\"}" + ) +fi + if [ "${DP_ATTENTION}" = "true" ]; then MAX_NUM_BATCHED_TOKENS=2048 else @@ -76,6 +84,7 @@ vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \ --no-enable-prefix-caching \ "${EP_ARGS[@]}" \ "${MOE_ARGS[@]}" \ + "${PROFILE_ARGS[@]}" \ --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' \ --attention_config.use_fp4_indexer_cache True \ --tokenizer-mode deepseek_v4 \ diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh index cb41a9eb1..f91a62e12 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh @@ -44,6 +44,14 @@ else MAX_NUM_BATCHED_TOKENS=$(( ISL * 2 )) fi +PROFILE_ARGS=() +if [[ "${PROFILE:-}" == "1" ]]; then + PROFILE_ARGS=( + --profiler-config + "{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\"}" + ) +fi + BENCHMARK_MAX_MODEL_LEN=$MAX_MODEL_LEN if [ "${EVAL_ONLY}" = "true" ]; then @@ -69,6 +77,7 @@ vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \ --no-enable-prefix-caching \ "${EP_ARGS[@]}" \ "${MOE_ARGS[@]}" \ + "${PROFILE_ARGS[@]}" \ --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' \ --attention_config.use_fp4_indexer_cache True \ --tokenizer-mode deepseek_v4 \ From 39f914bf06ed2066afb614b612d2ac6a55d9f93e Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Thu, 21 May 2026 13:37:39 -0700 Subject: [PATCH 09/37] feat(profile): add Flash vLLM MTP3 run --- .github/configs/nvidia-master.yaml | 17 +++++++++++++++++ .../single_node/dsv4_fp4_b300_vllm_mtp.sh | 6 +++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 5d98e5cf5..0df7372bd 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2073,6 +2073,23 @@ dsv4-flash-fp4-b300-vllm: search-space: - { tp: 4, ep: 1, conc-start: 64, conc-end: 64 } +# Targeted Flash vLLM MTP profile at the same single-point profile location. +# The shared vLLM MTP launcher selects 3 speculative tokens for this model. +dsv4-flash-fp4-b300-vllm-mtp: + image: vllm/vllm-openai:v0.21.0 + model: deepseek-ai/DeepSeek-V4-Flash + model-prefix: dsv4 + runner: b300 + precision: fp4 + framework: vllm + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 64, conc-end: 64, spec-decoding: mtp } + # Targeted Flash MTP profile: DEP4 at the same 1k1k conc=64 point as the # non-MTP Flash profile above. The shared SGLang MTP launcher selects the # Flash-only (steps=3, draft-tokens=3) speculative settings for this model. diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh index f91a62e12..44fe207d7 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh @@ -62,8 +62,12 @@ else SERVE_MAX_MODEL_LEN="$BENCHMARK_MAX_MODEL_LEN" fi -# use 2 speculative tokens for all configs for now +# Keep the existing Pro MTP profile at 2 speculative tokens; Flash uses the +# requested 3-token MTP profile. NUM_SPEC_TOKENS=2 +if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then + NUM_SPEC_TOKENS=3 +fi start_gpu_monitor From f9d6523bb456324a1adf8912150024f64a718c39 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Thu, 21 May 2026 13:38:00 -0700 Subject: [PATCH 10/37] fix(profile): capture one profiling step --- utils/bench_serving/benchmark_serving.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/bench_serving/benchmark_serving.py b/utils/bench_serving/benchmark_serving.py index 0a79033fe..011b413ac 100644 --- a/utils/bench_serving/benchmark_serving.py +++ b/utils/bench_serving/benchmark_serving.py @@ -538,7 +538,7 @@ async def warmup_limited_req_fn(): api_url=base_url + "/start_profile", prompt_len=test_prompt_len, output_len=test_output_len, - extra_body={"num_steps": 2, "merge_profiles": True, "profile_by_stage": True}, + extra_body={"num_steps": 1, "merge_profiles": True, "profile_by_stage": True}, logprobs=logprobs, best_of=best_of, multi_modal_content=test_mm_content, From 2e2f87623c422325c9a3ef0b0a8b3f864653c135 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Thu, 21 May 2026 13:45:06 -0700 Subject: [PATCH 11/37] fix(profile): switch Flash vLLM MTP to DEP8 --- .github/configs/nvidia-master.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 0df7372bd..6605dd90b 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2073,8 +2073,9 @@ dsv4-flash-fp4-b300-vllm: search-space: - { tp: 4, ep: 1, conc-start: 64, conc-end: 64 } -# Targeted Flash vLLM MTP profile at the same single-point profile location. -# The shared vLLM MTP launcher selects 3 speculative tokens for this model. +# Targeted Flash vLLM MTP DEP8 profile at the same single-point profile +# location. The shared launcher maps dp-attn=true to DP without TP, and selects +# 3 speculative tokens for this model. dsv4-flash-fp4-b300-vllm-mtp: image: vllm/vllm-openai:v0.21.0 model: deepseek-ai/DeepSeek-V4-Flash @@ -2088,7 +2089,7 @@ dsv4-flash-fp4-b300-vllm-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 4, ep: 1, conc-start: 64, conc-end: 64, spec-decoding: mtp } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64, spec-decoding: mtp } # Targeted Flash MTP profile: DEP4 at the same 1k1k conc=64 point as the # non-MTP Flash profile above. The shared SGLang MTP launcher selects the From cd160ee5b3cf1cd7570737835561ea35a8668109 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Thu, 21 May 2026 14:53:48 -0700 Subject: [PATCH 12/37] fix(profile): rerun Flash vLLM MTP at conc8 --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 6605dd90b..346c27531 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2089,7 +2089,7 @@ dsv4-flash-fp4-b300-vllm-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64, spec-decoding: mtp } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 8, conc-end: 8, spec-decoding: mtp } # Targeted Flash MTP profile: DEP4 at the same 1k1k conc=64 point as the # non-MTP Flash profile above. The shared SGLang MTP launcher selects the From 9b534f7b1311ba009d427345884a42ca9a13cda4 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Thu, 21 May 2026 16:07:01 -0700 Subject: [PATCH 13/37] fix(profile): disable Flash vLLM MTP cudagraphs --- benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh index 44fe207d7..25a7f8c15 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh @@ -52,6 +52,14 @@ if [[ "${PROFILE:-}" == "1" ]]; then ) fi +COMPILATION_ARGS=( + --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' + --max-cudagraph-capture-size 2048 +) +if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then + COMPILATION_ARGS=(--compilation-config '{"cudagraph_mode":"NONE","custom_ops":["all"]}') +fi + BENCHMARK_MAX_MODEL_LEN=$MAX_MODEL_LEN if [ "${EVAL_ONLY}" = "true" ]; then @@ -82,13 +90,12 @@ vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \ "${EP_ARGS[@]}" \ "${MOE_ARGS[@]}" \ "${PROFILE_ARGS[@]}" \ - --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' \ + "${COMPILATION_ARGS[@]}" \ --attention_config.use_fp4_indexer_cache True \ --tokenizer-mode deepseek_v4 \ --tool-call-parser deepseek_v4 \ --enable-auto-tool-choice \ --reasoning-parser deepseek_v4 \ - --max-cudagraph-capture-size 2048 \ --speculative-config "{\"method\": \"mtp\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS}" \ --max-model-len "$SERVE_MAX_MODEL_LEN" \ --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" > "$SERVER_LOG" 2>&1 & From 4f1f0fa8e0ead6ff4cecf831e64f1549161de2cc Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Thu, 21 May 2026 19:05:08 -0700 Subject: [PATCH 14/37] fix(profile): limit Flash vLLM trace to decode steps --- benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh index 25a7f8c15..0beb9e8c9 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh @@ -46,9 +46,13 @@ fi PROFILE_ARGS=() if [[ "${PROFILE:-}" == "1" ]]; then + PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\"}" + if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then + PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\",\"ignore_frontend\":true,\"delay_iterations\":1,\"max_iterations\":2,\"active_iterations\":2,\"torch_profiler_with_stack\":false}" + fi PROFILE_ARGS=( --profiler-config - "{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\"}" + "$PROFILER_CONFIG" ) fi From fc21e40e15cd092b249902a39fdc42b628c9a144 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Thu, 21 May 2026 20:48:26 -0700 Subject: [PATCH 15/37] fix(profile): disable Flash vLLM torch compile --- benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh index 0beb9e8c9..d334dc222 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh @@ -61,7 +61,7 @@ COMPILATION_ARGS=( --max-cudagraph-capture-size 2048 ) if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then - COMPILATION_ARGS=(--compilation-config '{"cudagraph_mode":"NONE","custom_ops":["all"]}') + COMPILATION_ARGS=(--compilation-config '{"mode":0,"cudagraph_mode":"NONE","custom_ops":["all"]}') fi BENCHMARK_MAX_MODEL_LEN=$MAX_MODEL_LEN From 5eb4b6535ca5231e5ba607f05ae7aa76bb49d9d1 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Thu, 21 May 2026 21:06:26 -0700 Subject: [PATCH 16/37] fix(profile): capture three Flash vLLM decode steps --- benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh index d334dc222..831d78e77 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh @@ -48,7 +48,7 @@ PROFILE_ARGS=() if [[ "${PROFILE:-}" == "1" ]]; then PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\"}" if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then - PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\",\"ignore_frontend\":true,\"delay_iterations\":1,\"max_iterations\":2,\"active_iterations\":2,\"torch_profiler_with_stack\":false}" + PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\",\"ignore_frontend\":true,\"delay_iterations\":1,\"max_iterations\":3,\"active_iterations\":3,\"torch_profiler_with_stack\":false}" fi PROFILE_ARGS=( --profiler-config From 39f3b7c6354b1dab5719b97fa82820a303169f06 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Thu, 21 May 2026 21:18:16 -0700 Subject: [PATCH 17/37] fix(profile): enable Flash vLLM cudagraphs --- benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh index 831d78e77..bd5cdc4ba 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh @@ -61,7 +61,10 @@ COMPILATION_ARGS=( --max-cudagraph-capture-size 2048 ) if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then - COMPILATION_ARGS=(--compilation-config '{"mode":0,"cudagraph_mode":"NONE","custom_ops":["all"]}') + COMPILATION_ARGS=( + --compilation-config '{"mode":0,"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' + --max-cudagraph-capture-size 2048 + ) fi BENCHMARK_MAX_MODEL_LEN=$MAX_MODEL_LEN From ef11755fb9978a0e16ae1f129296f274f81db015 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Thu, 21 May 2026 21:48:59 -0700 Subject: [PATCH 18/37] fix(profile): capture eight Flash vLLM decode steps --- benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh index bd5cdc4ba..e7dc28348 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh @@ -48,7 +48,7 @@ PROFILE_ARGS=() if [[ "${PROFILE:-}" == "1" ]]; then PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\"}" if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then - PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\",\"ignore_frontend\":true,\"delay_iterations\":1,\"max_iterations\":3,\"active_iterations\":3,\"torch_profiler_with_stack\":false}" + PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\",\"ignore_frontend\":true,\"delay_iterations\":1,\"max_iterations\":8,\"active_iterations\":8,\"torch_profiler_with_stack\":false}" fi PROFILE_ARGS=( --profiler-config From 22db6e26906d802111cd9356cb268c04b8be7bb3 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Thu, 21 May 2026 22:42:54 -0700 Subject: [PATCH 19/37] fix(profile): use compatible Flash cudagraph config --- benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh index e7dc28348..250e62c01 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh @@ -62,7 +62,7 @@ COMPILATION_ARGS=( ) if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then COMPILATION_ARGS=( - --compilation-config '{"mode":0,"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' + --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' --max-cudagraph-capture-size 2048 ) fi From 81f5a8aaff10c6a09e0025250f9856df98849431 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Thu, 21 May 2026 23:11:07 -0700 Subject: [PATCH 20/37] fix(profile): run five Flash vLLM decode steps --- benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh index 250e62c01..9fed4a3b5 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh @@ -67,6 +67,11 @@ if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then ) fi +SCHEDULER_ARGS=() +if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then + SCHEDULER_ARGS=(--num-scheduler-steps 5) +fi + BENCHMARK_MAX_MODEL_LEN=$MAX_MODEL_LEN if [ "${EVAL_ONLY}" = "true" ]; then @@ -98,6 +103,7 @@ vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \ "${MOE_ARGS[@]}" \ "${PROFILE_ARGS[@]}" \ "${COMPILATION_ARGS[@]}" \ + "${SCHEDULER_ARGS[@]}" \ --attention_config.use_fp4_indexer_cache True \ --tokenizer-mode deepseek_v4 \ --tool-call-parser deepseek_v4 \ From a37cb7340b0a0138be7b6c4c56e6e28435418568 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 22 May 2026 08:38:10 -0700 Subject: [PATCH 21/37] fix(profile): use vLLM profiler window for Flash steps --- benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh index 9fed4a3b5..2a8001eb0 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh @@ -48,7 +48,7 @@ PROFILE_ARGS=() if [[ "${PROFILE:-}" == "1" ]]; then PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\"}" if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then - PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\",\"ignore_frontend\":true,\"delay_iterations\":1,\"max_iterations\":8,\"active_iterations\":8,\"torch_profiler_with_stack\":false}" + PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\",\"ignore_frontend\":true,\"delay_iterations\":1,\"max_iterations\":5,\"active_iterations\":5,\"torch_profiler_with_stack\":false}" fi PROFILE_ARGS=( --profiler-config @@ -67,11 +67,6 @@ if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then ) fi -SCHEDULER_ARGS=() -if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then - SCHEDULER_ARGS=(--num-scheduler-steps 5) -fi - BENCHMARK_MAX_MODEL_LEN=$MAX_MODEL_LEN if [ "${EVAL_ONLY}" = "true" ]; then @@ -103,7 +98,6 @@ vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \ "${MOE_ARGS[@]}" \ "${PROFILE_ARGS[@]}" \ "${COMPILATION_ARGS[@]}" \ - "${SCHEDULER_ARGS[@]}" \ --attention_config.use_fp4_indexer_cache True \ --tokenizer-mode deepseek_v4 \ --tool-call-parser deepseek_v4 \ From 6f89c90ec406c6ffc4e1084eefdc89be088ad59f Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 22 May 2026 08:59:06 -0700 Subject: [PATCH 22/37] fix(profile): limit Flash vLLM request length --- benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh index 2a8001eb0..22de88435 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh @@ -68,6 +68,15 @@ if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then fi BENCHMARK_MAX_MODEL_LEN=$MAX_MODEL_LEN +BENCHMARK_OUTPUT_LEN=$OSL +BENCHMARK_NUM_PROMPTS=$((CONC * 10)) +BENCHMARK_MAX_CONCURRENCY=$CONC + +if [[ "${PROFILE:-}" == "1" && "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then + BENCHMARK_OUTPUT_LEN=5 + BENCHMARK_NUM_PROMPTS=1 + BENCHMARK_MAX_CONCURRENCY=1 +fi if [ "${EVAL_ONLY}" = "true" ]; then EVAL_MAX_MODEL_LEN=$(compute_eval_context_length "$MODEL" "$BENCHMARK_MAX_MODEL_LEN") @@ -120,10 +129,10 @@ run_benchmark_serving \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ - --output-len "$OSL" \ + --output-len "$BENCHMARK_OUTPUT_LEN" \ --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts "$((CONC * 10))" \ - --max-concurrency "$CONC" \ + --num-prompts "$BENCHMARK_NUM_PROMPTS" \ + --max-concurrency "$BENCHMARK_MAX_CONCURRENCY" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ \ --trust-remote-code \ From e9bfbf9838687d4a3aa4b578590507e75fa24b41 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 22 May 2026 09:45:44 -0700 Subject: [PATCH 23/37] fix(results): guard zero tpot interactivity --- utils/process_result.py | 4 +++- utils/test_process_result.py | 16 ++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/utils/process_result.py b/utils/process_result.py index 4603287bc..2010c09ff 100644 --- a/utils/process_result.py +++ b/utils/process_result.py @@ -1,5 +1,6 @@ import sys import json +import math import os from pathlib import Path @@ -128,8 +129,9 @@ def get_required_env_vars(required_vars): if key.endswith('ms'): data[key.replace('_ms', '')] = float(value) / 1000.0 if 'tpot' in key: + tpot_ms = float(value) data[key.replace('_ms', '').replace( - 'tpot', 'intvty')] = 1000.0 / float(value) + 'tpot', 'intvty')] = 1000.0 / tpot_ms if math.isfinite(tpot_ms) and tpot_ms > 0 else 0.0 print(json.dumps(data, indent=2)) diff --git a/utils/test_process_result.py b/utils/test_process_result.py index e3903c6e6..edeba20be 100644 --- a/utils/test_process_result.py +++ b/utils/test_process_result.py @@ -342,6 +342,22 @@ def test_tpot_to_interactivity_conversion(self, tmp_path, single_node_env_vars): assert output_data["intvty_p50"] == pytest.approx(50.0) assert output_data["intvty_p99"] == pytest.approx(20.0) + def test_zero_tpot_interactivity_is_guarded(self, tmp_path, single_node_env_vars): + """Test that zero TPOT fields do not crash interactivity conversion.""" + benchmark_result = { + "model_id": "test-model", + "max_concurrency": 1, + "total_token_throughput": 1000.0, + "output_throughput": 800.0, + "mean_tpot_ms": 0.0, + } + + result = run_script(tmp_path, single_node_env_vars, benchmark_result) + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + assert output_data["mean_intvty"] == pytest.approx(0.0) + def test_throughput_per_gpu_single_node(self, tmp_path, single_node_env_vars): """Test throughput per GPU calculation for single node.""" benchmark_result = { From 64cbdc34b0cf097a69d1f5c1f827b4aec5154d71 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Mon, 25 May 2026 22:23:11 -0700 Subject: [PATCH 24/37] fix(profile): simulate Flash offline decode batch --- benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh index 22de88435..780a8427d 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh @@ -48,7 +48,7 @@ PROFILE_ARGS=() if [[ "${PROFILE:-}" == "1" ]]; then PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\"}" if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then - PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\",\"ignore_frontend\":true,\"delay_iterations\":1,\"max_iterations\":5,\"active_iterations\":5,\"torch_profiler_with_stack\":false}" + PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\",\"ignore_frontend\":true,\"delay_iterations\":1,\"max_iterations\":1,\"active_iterations\":1,\"torch_profiler_with_stack\":false}" fi PROFILE_ARGS=( --profiler-config @@ -73,9 +73,9 @@ BENCHMARK_NUM_PROMPTS=$((CONC * 10)) BENCHMARK_MAX_CONCURRENCY=$CONC if [[ "${PROFILE:-}" == "1" && "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then - BENCHMARK_OUTPUT_LEN=5 - BENCHMARK_NUM_PROMPTS=1 - BENCHMARK_MAX_CONCURRENCY=1 + BENCHMARK_OUTPUT_LEN=1 + BENCHMARK_NUM_PROMPTS=256 + BENCHMARK_MAX_CONCURRENCY=256 fi if [ "${EVAL_ONLY}" = "true" ]; then From 6a824fcf5f3e2e211ad511049c3256de60df2789 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Mon, 25 May 2026 22:27:37 -0700 Subject: [PATCH 25/37] fix(profile): warm up Flash decode batch before trace --- benchmarks/benchmark_lib.sh | 12 +++++++++++- benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh | 3 +++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 23e3e016f..e6980d0c1 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -178,6 +178,7 @@ wait_for_server_ready() { # --max-concurrency: Max concurrency # --result-filename: Result filename without extension # --result-dir: Result directory +# --num-warmups: Optional warmup request count before benchmark/profile # --use-chat-template: Optional flag to enable chat template # --dsv4: Optional flag to use the DeepSeek-V4 chat template # (encoding_dsv4.py) instead of the tokenizer's built-in jinja @@ -204,6 +205,7 @@ run_benchmark_serving() { local result_filename="" local result_dir="" local workspace_dir="" + local num_warmups="" local use_chat_template=false local dsv4=false local trust_remote_code=false @@ -259,6 +261,10 @@ run_benchmark_serving() { workspace_dir="$2" shift 2 ;; + --num-warmups) + num_warmups="$2" + shift 2 + ;; --use-chat-template) use_chat_template=true shift @@ -341,6 +347,10 @@ run_benchmark_serving() { num_prompts="$max_concurrency" fi + if [[ -z "$num_warmups" ]]; then + num_warmups="$((2 * max_concurrency))" + fi + # Build benchmark command local benchmark_cmd=( python3 "$workspace_dir/utils/bench_serving/benchmark_serving.py" @@ -357,7 +367,7 @@ run_benchmark_serving() { --ignore-eos "${profile_flag[@]}" --save-result - --num-warmups "$((2 * max_concurrency))" \ + --num-warmups "$num_warmups" \ --percentile-metrics 'ttft,tpot,itl,e2el' --result-dir "$result_dir" --result-filename "$result_filename.json" diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh index 780a8427d..451ece01f 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh @@ -71,11 +71,13 @@ BENCHMARK_MAX_MODEL_LEN=$MAX_MODEL_LEN BENCHMARK_OUTPUT_LEN=$OSL BENCHMARK_NUM_PROMPTS=$((CONC * 10)) BENCHMARK_MAX_CONCURRENCY=$CONC +BENCHMARK_NUM_WARMUPS=$((2 * BENCHMARK_MAX_CONCURRENCY)) if [[ "${PROFILE:-}" == "1" && "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then BENCHMARK_OUTPUT_LEN=1 BENCHMARK_NUM_PROMPTS=256 BENCHMARK_MAX_CONCURRENCY=256 + BENCHMARK_NUM_WARMUPS=4096 fi if [ "${EVAL_ONLY}" = "true" ]; then @@ -133,6 +135,7 @@ run_benchmark_serving \ --random-range-ratio "$RANDOM_RANGE_RATIO" \ --num-prompts "$BENCHMARK_NUM_PROMPTS" \ --max-concurrency "$BENCHMARK_MAX_CONCURRENCY" \ + --num-warmups "$BENCHMARK_NUM_WARMUPS" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ \ --trust-remote-code \ From 3b7d8a7e3843b5ba67af4ebb5bba7d92cfb93ac2 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 26 May 2026 08:56:46 -0700 Subject: [PATCH 26/37] fix(profile): target third Flash decode step --- benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh index 451ece01f..efda4024d 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh @@ -48,7 +48,7 @@ PROFILE_ARGS=() if [[ "${PROFILE:-}" == "1" ]]; then PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\"}" if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then - PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\",\"ignore_frontend\":true,\"delay_iterations\":1,\"max_iterations\":1,\"active_iterations\":1,\"torch_profiler_with_stack\":false}" + PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\",\"ignore_frontend\":true,\"delay_iterations\":3,\"max_iterations\":1,\"active_iterations\":1,\"torch_profiler_with_stack\":false}" fi PROFILE_ARGS=( --profiler-config @@ -74,7 +74,7 @@ BENCHMARK_MAX_CONCURRENCY=$CONC BENCHMARK_NUM_WARMUPS=$((2 * BENCHMARK_MAX_CONCURRENCY)) if [[ "${PROFILE:-}" == "1" && "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then - BENCHMARK_OUTPUT_LEN=1 + BENCHMARK_OUTPUT_LEN=3 BENCHMARK_NUM_PROMPTS=256 BENCHMARK_MAX_CONCURRENCY=256 BENCHMARK_NUM_WARMUPS=4096 From 524ca637bac017889af69b448d209c2a8e3ce218 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 26 May 2026 09:54:09 -0700 Subject: [PATCH 27/37] fix(profile): add GB200 DSV4 MTP3 profile --- .github/configs/nvidia-master.yaml | 31 ++++ .github/workflows/profile.yml | 118 ++++++++++++- ...sagg-gb200-profile-16gpu-conc256-mtp3.yaml | 156 ++++++++++++++++++ 3 files changed, 296 insertions(+), 9 deletions(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-profile-16gpu-conc256-mtp3.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 346c27531..f86fe9f47 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -8680,6 +8680,37 @@ dsv4-fp4-gb200-dynamo-vllm-mtp2: ep: 8 dp-attn: true +# Dedicated profile point for the DeepSeek-V4 guide's 16-chip / global batch +# 256 shape: 1 prefill DEP8 + 1 decode DEP8 on GB200, MTP3, conc=256. +dsv4-fp4-gb200-dynamo-vllm-mtp3-profile: + image: vllm/vllm-openai:v0.21.0-ubuntu2404 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: gb200 + precision: fp4 + framework: dynamo-vllm + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 8192 + osl: 1024 + search-space: + - conc-list: [256] + spec-decoding: mtp + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-profile-16gpu-conc256-mtp3.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + dsv4-fp4-b300-dynamo-vllm: image: vllm/vllm-openai:v0.20.1 model: deepseek-ai/DeepSeek-V4-Pro diff --git a/.github/workflows/profile.yml b/.github/workflows/profile.yml index b99f25d1f..5e904c9a3 100644 --- a/.github/workflows/profile.yml +++ b/.github/workflows/profile.yml @@ -113,6 +113,17 @@ jobs: EP_SIZE: ${{ matrix.config.ep }} DP_ATTENTION: ${{ matrix.config['dp-attn'] }} CONC: ${{ matrix.config.conc }} + CONC_JSON: ${{ toJson(matrix.config.conc) }} + PREFILL_NUM_WORKERS: ${{ matrix.config.prefill['num-worker'] }} + PREFILL_TP: ${{ matrix.config.prefill.tp }} + PREFILL_EP: ${{ matrix.config.prefill.ep }} + PREFILL_DP_ATTN: ${{ matrix.config.prefill['dp-attn'] }} + PREFILL_ADDITIONAL_SETTINGS_JSON: ${{ toJson(matrix.config.prefill['additional-settings']) }} + DECODE_NUM_WORKERS: ${{ matrix.config.decode['num-worker'] }} + DECODE_TP: ${{ matrix.config.decode.tp }} + DECODE_EP: ${{ matrix.config.decode.ep }} + DECODE_DP_ATTN: ${{ matrix.config.decode['dp-attn'] }} + DECODE_ADDITIONAL_SETTINGS_JSON: ${{ toJson(matrix.config.decode['additional-settings']) }} SPEC_DECODING: ${{ matrix.config.spec-decoding }} DISAGG: ${{ matrix.config.disagg }} MOE_DEBUG: '0' @@ -148,7 +159,7 @@ jobs: ref: ${{ inputs.ref || github.sha }} clean: false - - name: Launch + Profile (single-node sglang/vllm) + - name: Launch + Profile id: run env: RUNNER_NAME: ${{ runner.name }} @@ -159,19 +170,108 @@ jobs: shell: bash run: | set -euo pipefail - ep_val="${EP_SIZE:-1}" - res_name="${EXP_NAME}_${PRECISION}_${FRAMEWORK}_tp${TP}_ep${ep_val}_dpa_${DP_ATTENTION}_conc${CONC}_${RUNNER_NAME}" + + export_additional_settings() { + local settings_json="$1" + python3 - "$settings_json" <<'PY' + import json + import sys + + raw = sys.argv[1] + if not raw or raw == "null": + raise SystemExit(0) + for item in json.loads(raw) or []: + print(item) + PY + } + + normalize_conc() { + python3 - <<'PY' + import json + import os + + raw = os.environ.get("CONC_JSON") or os.environ.get("CONC") or "[]" + try: + value = json.loads(raw) + except json.JSONDecodeError: + value = raw + if isinstance(value, list): + print("x".join(str(v) for v in value)) + else: + print(str(value)) + PY + } + + if [ -n "${PREFILL_NUM_WORKERS:-}" ] && [ -n "${DECODE_NUM_WORKERS:-}" ]; then + conc_val="$(normalize_conc)" + res_name="${EXP_NAME}_${PRECISION}_${FRAMEWORK}_prefill-tp${PREFILL_TP}-ep${PREFILL_EP}-dp${PREFILL_DP_ATTN}-nw${PREFILL_NUM_WORKERS}_decode-tp${DECODE_TP}-ep${DECODE_EP}-dp${DECODE_DP_ATTN}-nw${DECODE_NUM_WORKERS}_disagg-${DISAGG}_spec-${SPEC_DECODING}_conc${conc_val}_${RUNNER_NAME}" + + echo "IS_MULTINODE=true" >> "$GITHUB_ENV" + echo "PREFILL_GPUS=$((PREFILL_NUM_WORKERS * PREFILL_TP))" >> "$GITHUB_ENV" + echo "DECODE_GPUS=$((DECODE_NUM_WORKERS * DECODE_TP))" >> "$GITHUB_ENV" + + while IFS= read -r setting; do + if [ -n "$setting" ]; then + export "$setting" + fi + done < <(export_additional_settings "${PREFILL_ADDITIONAL_SETTINGS_JSON:-null}") + while IFS= read -r setting; do + if [ -n "$setting" ]; then + export "$setting" + fi + done < <(export_additional_settings "${DECODE_ADDITIONAL_SETTINGS_JSON:-null}") + else + ep_val="${EP_SIZE:-1}" + res_name="${EXP_NAME}_${PRECISION}_${FRAMEWORK}_tp${TP}_ep${ep_val}_dpa_${DP_ATTENTION}_conc${CONC}_${RUNNER_NAME}" + fi + export RESULT_FILENAME="${res_name}" echo "RESULT_FILENAME=${res_name}" >> "$GITHUB_ENV" bash ./runners/launch_${RUNNER_NAME%%_*}.sh if [ ! -f "${res_name}.json" ]; then - echo "Run failed: Benchmark result ${res_name}.json not found." >&2 - exit 1 + result_candidate="$(find . -maxdepth 1 -type f -name "${res_name}_*.json" | sort | head -n1 || true)" + if [ -n "$result_candidate" ] && [ -f "$result_candidate" ]; then + cp "$result_candidate" "${res_name}.json" + else + echo "Run failed: Benchmark result ${res_name}.json not found." >&2 + exit 1 + fi fi trace_path="profile_${res_name}.trace.json.gz" + if [ ! -f "$trace_path" ] && [ -d LOGS/profiles ]; then + trace_candidate="$(python3 - <<'PY' + from pathlib import Path + + root = Path("LOGS/profiles") + candidates = [ + p for p in root.rglob("*") + if p.is_file() and ( + p.name.endswith(".trace.json") + or p.name.endswith(".trace.json.gz") + or p.name.endswith(".pt.trace.json") + or p.name.endswith(".json") + ) + ] + candidates = [ + p for p in candidates + if not p.name.startswith("results_") and "profile_export" not in p.name + ] + if candidates: + print(max(candidates, key=lambda p: p.stat().st_size)) + PY + )" + if [ -n "$trace_candidate" ] && [ -f "$trace_candidate" ]; then + if [[ "$trace_candidate" == *.gz ]]; then + cp "$trace_candidate" "$trace_path" + else + gzip -c "$trace_candidate" > "$trace_path" + fi + fi + fi + if [ -f "$trace_path" ]; then echo "trace=$trace_path" >> "$GITHUB_OUTPUT" if [ "${FRAMEWORK}" = "sglang" ]; then @@ -252,7 +352,7 @@ jobs: run: | set -euo pipefail - dest_dir="storage/profiles/${GITHUB_SHA}/${{ matrix.config.runner }}/${{ matrix.config.framework }}/${{ matrix.config['exp-name'] }}_${{ matrix.config.precision }}_tp${{ matrix.config.tp }}_ep${{ matrix.config.ep || 1 }}_conc${{ matrix.config.conc }}" + dest_dir="storage/profiles/${GITHUB_SHA}/${{ matrix.config.runner }}/${{ matrix.config.framework }}/${RESULT_FILENAME}" mkdir -p "$dest_dir" cp "$TRACE_LOCAL" "$dest_dir/trace.json.gz" @@ -260,13 +360,13 @@ jobs: git config user.name "github-actions" git config user.email "github-actions@github.com" git add -A - git commit -m "Add profile: ${GITHUB_SHA} ${{ matrix.config['exp-name'] }} tp${{ matrix.config.tp }} ep${{ matrix.config.ep || 1 }} conc${{ matrix.config.conc }}" || echo "Nothing to commit" + git commit -m "Add profile: ${GITHUB_SHA} ${RESULT_FILENAME}" || echo "Nothing to commit" git push STORAGE_SHA="$(git rev-parse HEAD)" popd >/dev/null - export RAW_URL="https://raw.githubusercontent.com/SemiAnalysisAI/InferenceX-trace-storage/${STORAGE_SHA}/profiles/${GITHUB_SHA}/${{ matrix.config.runner }}/${{ matrix.config.framework }}/${{ matrix.config['exp-name'] }}_${{ matrix.config.precision }}_tp${{ matrix.config.tp }}_ep${{ matrix.config.ep || 1 }}_conc${{ matrix.config.conc }}/trace.json.gz" - export TITLE="${{ matrix.config['exp-name'] }}_${{ matrix.config.precision }}_tp${{ matrix.config.tp }}_ep${{ matrix.config.ep || 1 }}_conc${{ matrix.config.conc }}" + export RAW_URL="https://raw.githubusercontent.com/SemiAnalysisAI/InferenceX-trace-storage/${STORAGE_SHA}/profiles/${GITHUB_SHA}/${{ matrix.config.runner }}/${{ matrix.config.framework }}/${RESULT_FILENAME}/trace.json.gz" + export TITLE="${RESULT_FILENAME}" enc_src="$(python3 -c 'import os,urllib.parse; print(urllib.parse.quote(os.environ["RAW_URL"], safe=""))')" enc_title="$(python3 -c 'import os,urllib.parse; print(urllib.parse.quote(os.environ["TITLE"], safe=""))')" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-profile-16gpu-conc256-mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-profile-16gpu-conc256-mtp3.yaml new file mode 100644 index 000000000..1842ebf30 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-profile-16gpu-conc256-mtp3.yaml @@ -0,0 +1,156 @@ +name: "svf-vllm-disagg-gb200-profile-16gpu-conc256-mtp3" + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.21.0-ubuntu2404" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +infra: + etcd_nats_dedicated_node: true + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + TORCH_SYMMMEM: "NVSHMEM" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + TORCH_SYMMMEM: "NVSHMEM" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-hybrid-lb: true + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enable-ep-weight-filter: true + moe-backend: deep_gemm_mega_moe + enforce-eager: true + speculative-config: '{"method":"mtp","num_speculative_tokens":3}' + attention-config: '{"use_fp4_indexer_cache":true}' + max-model-len: 9472 + max-num-seqs: 8 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.9 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-hybrid-lb: true + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enable-ep-weight-filter: true + moe-backend: deep_gemm_mega_moe + speculative-config: '{"method":"mtp","num_speculative_tokens":3}' + attention-config: '{"use_fp4_indexer_cache":true}' + max-model-len: 9472 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +profiling: + type: "torch" + prefill: + start_step: 100000 + stop_step: 100001 + decode: + start_step: 3 + stop_step: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256" + req_rate: "inf" + num_prompts_mult: 1 + num_warmup_mult: 1 + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + model: + repo: "deepseek-ai/DeepSeek-V4-Pro" + revision: "0366e4e064385807ea86b088a5c6c878ff23343b" + container: + image: "vllm/vllm-openai:v0.21.0-ubuntu2404" + frameworks: + dynamo: "1.2.0.dev20260426" From 58d423ed201ef4cd6fe8d5cfd1efcb088937ed7c Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 26 May 2026 09:55:45 -0700 Subject: [PATCH 28/37] fix(profile): stringify multinode concurrency env --- .github/workflows/profile.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/profile.yml b/.github/workflows/profile.yml index 5e904c9a3..2bc40984d 100644 --- a/.github/workflows/profile.yml +++ b/.github/workflows/profile.yml @@ -112,7 +112,7 @@ jobs: TP: ${{ matrix.config.tp }} EP_SIZE: ${{ matrix.config.ep }} DP_ATTENTION: ${{ matrix.config['dp-attn'] }} - CONC: ${{ matrix.config.conc }} + CONC: ${{ toJson(matrix.config.conc) }} CONC_JSON: ${{ toJson(matrix.config.conc) }} PREFILL_NUM_WORKERS: ${{ matrix.config.prefill['num-worker'] }} PREFILL_TP: ${{ matrix.config.prefill.tp }} From 2f300a36f254bafe5c409e1207d0e3421ffcf3c1 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 26 May 2026 10:01:42 -0700 Subject: [PATCH 29/37] fix(profile): use aggregate GB200 DSV4 profile --- .github/configs/nvidia-master.yaml | 18 ++--- ...agg-gb200-profile-16gpu-conc256-mtp3.yaml} | 71 +++---------------- 2 files changed, 19 insertions(+), 70 deletions(-) rename benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/{disagg-gb200-profile-16gpu-conc256-mtp3.yaml => agg-gb200-profile-16gpu-conc256-mtp3.yaml} (56%) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index f86fe9f47..8b555d5f0 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -8681,7 +8681,7 @@ dsv4-fp4-gb200-dynamo-vllm-mtp2: dp-attn: true # Dedicated profile point for the DeepSeek-V4 guide's 16-chip / global batch -# 256 shape: 1 prefill DEP8 + 1 decode DEP8 on GB200, MTP3, conc=256. +# 256 shape: aggregated DEP16 on GB200, MTP3, conc=256. dsv4-fp4-gb200-dynamo-vllm-mtp3-profile: image: vllm/vllm-openai:v0.21.0-ubuntu2404 model: deepseek-ai/DeepSeek-V4-Pro @@ -8690,7 +8690,7 @@ dsv4-fp4-gb200-dynamo-vllm-mtp3-profile: precision: fp4 framework: dynamo-vllm multinode: true - disagg: true + disagg: false scenarios: fixed-seq-len: - isl: 8192 @@ -8700,16 +8700,16 @@ dsv4-fp4-gb200-dynamo-vllm-mtp3-profile: spec-decoding: mtp prefill: num-worker: 1 - tp: 8 - ep: 8 + tp: 16 + ep: 16 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-profile-16gpu-conc256-mtp3.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml" decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true + num-worker: 0 + tp: 16 + ep: 1 + dp-attn: false dsv4-fp4-b300-dynamo-vllm: image: vllm/vllm-openai:v0.20.1 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-profile-16gpu-conc256-mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml similarity index 56% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-profile-16gpu-conc256-mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml index 1842ebf30..f2f0c6ece 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-profile-16gpu-conc256-mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml @@ -1,4 +1,4 @@ -name: "svf-vllm-disagg-gb200-profile-16gpu-conc256-mtp3" +name: "svf-vllm-agg-gb200-profile-16gpu-conc256-mtp3" model: path: "deepseek-v4-pro" @@ -21,15 +21,9 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 2 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 8 - -infra: - etcd_nats_dedicated_node: true + agg_nodes: 4 + agg_workers: 1 + gpus_per_agg: 16 frontend: type: dynamo @@ -38,7 +32,7 @@ frontend: backend: type: vllm connector: null - prefill_environment: + aggregated_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" VLLM_USE_NCCL_SYMM_MEM: "1" @@ -54,67 +48,25 @@ backend: UCX_TLS: "cuda_copy,cuda_ipc,tcp" UCX_CUDA_IPC_ENABLE_MNNVL: "y" NCCL_P2P_LEVEL: NVL - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - TORCH_SYMMMEM: "NVSHMEM" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_P2P_LEVEL: NVL vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + aggregated: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" kv-cache-dtype: "fp8" tensor-parallel-size: 1 pipeline-parallel-size: 1 data-parallel-hybrid-lb: true - data-parallel-size: 8 + data-parallel-size: 16 data-parallel-rpc-port: 13345 enable-expert-parallel: true enable-ep-weight-filter: true moe-backend: deep_gemm_mega_moe - enforce-eager: true speculative-config: '{"method":"mtp","num_speculative_tokens":3}' attention-config: '{"use_fp4_indexer_cache":true}' - max-model-len: 9472 - max-num-seqs: 8 - max-num-batched-tokens: 16384 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - no-async-scheduling: true - block-size: 256 - gpu-memory-utilization: 0.9 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - numa-bind: true tokenizer-mode: deepseek_v4 - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-hybrid-lb: true - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enable-ep-weight-filter: true - moe-backend: deep_gemm_mega_moe - speculative-config: '{"method":"mtp","num_speculative_tokens":3}' - attention-config: '{"use_fp4_indexer_cache":true}' max-model-len: 9472 max-num-seqs: 256 - max-cudagraph-capture-size: 256 max-num-batched-tokens: 256 + max-cudagraph-capture-size: 256 trust-remote-code: true no-enable-prefix-caching: true no-enable-flashinfer-autotune: true @@ -124,14 +76,11 @@ backend: stream-interval: 50 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true - tokenizer-mode: deepseek_v4 + all2all-backend: "flashinfer_nvlink_one_sided" profiling: type: "torch" - prefill: - start_step: 100000 - stop_step: 100001 - decode: + aggregated: start_step: 3 stop_step: 4 From eb885ff46107f516085c20991e8659b439a5a55e Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 26 May 2026 10:27:45 -0700 Subject: [PATCH 30/37] fix(profile): enable vllm trace output for GB200 --- .github/workflows/profile.yml | 11 +++++++++-- .../8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml | 3 +++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/.github/workflows/profile.yml b/.github/workflows/profile.yml index 2bc40984d..1ee135560 100644 --- a/.github/workflows/profile.yml +++ b/.github/workflows/profile.yml @@ -241,17 +241,19 @@ jobs: fi trace_path="profile_${res_name}.trace.json.gz" - if [ ! -f "$trace_path" ] && [ -d LOGS/profiles ]; then + if [ ! -f "$trace_path" ] && [ -d LOGS ]; then trace_candidate="$(python3 - <<'PY' from pathlib import Path - root = Path("LOGS/profiles") + root = Path("LOGS") candidates = [ p for p in root.rglob("*") if p.is_file() and ( p.name.endswith(".trace.json") or p.name.endswith(".trace.json.gz") or p.name.endswith(".pt.trace.json") + or p.name.endswith(".pt.trace.json.gz") + or p.name.endswith(".json.gz") or p.name.endswith(".json") ) ] @@ -293,6 +295,11 @@ jobs: fi else echo "Profile trace not found: $trace_path" >&2 + if [ -d LOGS ]; then + echo "LOGS profile candidates:" >&2 + find LOGS -maxdepth 8 -type f \( -path "*/profiles/*" -o -name "*trace*" -o -name "*profile*" \) -printf "%p %s bytes\n" 2>/dev/null | sort >&2 || true + fi + exit 1 fi - name: Process result (json -> agg) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml index f2f0c6ece..51c8774d3 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml @@ -34,6 +34,8 @@ backend: connector: null aggregated_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_RPC_TIMEOUT: "1800000" + VLLM_TORCH_PROFILER_DIR: "/logs/profiles/agg" TILELANG_CLEANUP_TEMP_FILES: "1" VLLM_USE_NCCL_SYMM_MEM: "1" TORCH_SYMMMEM: "NVSHMEM" @@ -67,6 +69,7 @@ backend: max-num-seqs: 256 max-num-batched-tokens: 256 max-cudagraph-capture-size: 256 + profiler-config: '{"profiler":"torch","torch_profiler_dir":"/logs/profiles/agg","ignore_frontend":true,"delay_iterations":3,"max_iterations":1,"active_iterations":1,"torch_profiler_with_stack":false}' trust-remote-code: true no-enable-prefix-caching: true no-enable-flashinfer-autotune: true From 27ddec5ba9d29052e5dd685a4d476c9b9c303027 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 26 May 2026 11:12:02 -0700 Subject: [PATCH 31/37] fix(profile): capture later GB200 decode step --- .github/configs/nvidia-master.yaml | 8 ++++---- .../8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 8b555d5f0..e69ee9c47 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -8694,20 +8694,20 @@ dsv4-fp4-gb200-dynamo-vllm-mtp3-profile: scenarios: fixed-seq-len: - isl: 8192 - osl: 1024 + osl: 2048 search-space: - conc-list: [256] spec-decoding: mtp prefill: - num-worker: 1 - tp: 16 + num-worker: 16 + tp: 1 ep: 16 dp-attn: true additional-settings: - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml" decode: num-worker: 0 - tp: 16 + tp: 1 ep: 1 dp-attn: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml index 51c8774d3..e75634c9b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml @@ -65,11 +65,11 @@ backend: speculative-config: '{"method":"mtp","num_speculative_tokens":3}' attention-config: '{"use_fp4_indexer_cache":true}' tokenizer-mode: deepseek_v4 - max-model-len: 9472 + max-model-len: 10496 max-num-seqs: 256 max-num-batched-tokens: 256 max-cudagraph-capture-size: 256 - profiler-config: '{"profiler":"torch","torch_profiler_dir":"/logs/profiles/agg","ignore_frontend":true,"delay_iterations":3,"max_iterations":1,"active_iterations":1,"torch_profiler_with_stack":false}' + profiler-config: '{"profiler":"torch","torch_profiler_dir":"/logs/profiles/agg","ignore_frontend":true,"delay_iterations":2304,"max_iterations":1,"active_iterations":1,"torch_profiler_with_stack":false}' trust-remote-code: true no-enable-prefix-caching: true no-enable-flashinfer-autotune: true @@ -84,17 +84,17 @@ backend: profiling: type: "torch" aggregated: - start_step: 3 - stop_step: 4 + start_step: 2304 + stop_step: 2305 benchmark: type: "sa-bench" isl: 8192 - osl: 1024 + osl: 2048 concurrencies: "256" req_rate: "inf" num_prompts_mult: 1 - num_warmup_mult: 1 + num_warmup_mult: 4 use_chat_template: true custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" From 0dc030042ffaef4524208225cdfe6e481811d59e Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 26 May 2026 12:22:40 -0700 Subject: [PATCH 32/37] fix(profile): switch GB200 profile to DSV4 Flash --- .github/configs/nvidia-master.yaml | 6 +++--- .github/workflows/profile.yml | 4 +++- ...b200-flash-profile-16gpu-conc256-mtp3.yaml} | 9 ++++----- runners/launch_gb200-nv.sh | 18 ++++++++++++++---- 4 files changed, 24 insertions(+), 13 deletions(-) rename benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/{agg-gb200-profile-16gpu-conc256-mtp3.yaml => agg-gb200-flash-profile-16gpu-conc256-mtp3.yaml} (92%) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index e69ee9c47..c74fb1ff9 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -8682,9 +8682,9 @@ dsv4-fp4-gb200-dynamo-vllm-mtp2: # Dedicated profile point for the DeepSeek-V4 guide's 16-chip / global batch # 256 shape: aggregated DEP16 on GB200, MTP3, conc=256. -dsv4-fp4-gb200-dynamo-vllm-mtp3-profile: +dsv4-flash-fp4-gb200-dynamo-vllm-mtp3-profile: image: vllm/vllm-openai:v0.21.0-ubuntu2404 - model: deepseek-ai/DeepSeek-V4-Pro + model: deepseek-ai/DeepSeek-V4-Flash model-prefix: dsv4 runner: gb200 precision: fp4 @@ -8704,7 +8704,7 @@ dsv4-fp4-gb200-dynamo-vllm-mtp3-profile: ep: 16 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-16gpu-conc256-mtp3.yaml" decode: num-worker: 0 tp: 1 diff --git a/.github/workflows/profile.yml b/.github/workflows/profile.yml index 1ee135560..b545be9f1 100644 --- a/.github/workflows/profile.yml +++ b/.github/workflows/profile.yml @@ -355,6 +355,7 @@ jobs: id: push env: TRACE_LOCAL: ${{ steps.run.outputs.trace }} + REPO_PAT: ${{ secrets.REPO_PAT }} shell: bash run: | set -euo pipefail @@ -366,9 +367,10 @@ jobs: pushd storage >/dev/null git config user.name "github-actions" git config user.email "github-actions@github.com" + git remote set-url origin "https://x-access-token:${REPO_PAT}@github.com/SemiAnalysisAI/InferenceX-trace-storage.git" git add -A git commit -m "Add profile: ${GITHUB_SHA} ${RESULT_FILENAME}" || echo "Nothing to commit" - git push + git push origin HEAD:master STORAGE_SHA="$(git rev-parse HEAD)" popd >/dev/null diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-16gpu-conc256-mtp3.yaml similarity index 92% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-16gpu-conc256-mtp3.yaml index e75634c9b..afbeaa659 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-16gpu-conc256-mtp3.yaml @@ -1,7 +1,7 @@ -name: "svf-vllm-agg-gb200-profile-16gpu-conc256-mtp3" +name: "svf-vllm-agg-gb200-flash-profile-16gpu-conc256-mtp3" model: - path: "deepseek-v4-pro" + path: "deepseek-v4-flash" container: "vllm/vllm-openai:v0.21.0-ubuntu2404" precision: "fp4" @@ -52,7 +52,7 @@ backend: NCCL_P2P_LEVEL: NVL vllm_config: aggregated: - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + served-model-name: "deepseek-ai/DeepSeek-V4-Flash" kv-cache-dtype: "fp8" tensor-parallel-size: 1 pipeline-parallel-size: 1 @@ -100,8 +100,7 @@ benchmark: identity: model: - repo: "deepseek-ai/DeepSeek-V4-Pro" - revision: "0366e4e064385807ea86b088a5c6c878ff23343b" + repo: "deepseek-ai/DeepSeek-V4-Flash" container: image: "vllm/vllm-openai:v0.21.0-ubuntu2404" frameworks: diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index dada98bd6..3ccb772f0 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -19,8 +19,13 @@ if [[ $FRAMEWORK == "dynamo-sglang" ]]; then # Same compute-node-local NVMe path as the dynamo-vllm dsv4 # branch — see that branch for rationale. SRT_SLURM_MODEL_PREFIX # matches the model.path alias in our DSV4 sglang recipes. - export MODEL_PATH="/mnt/numa1/models/deepseek-v4-pro/" - export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" + if [[ $MODEL == "deepseek-ai/DeepSeek-V4-Flash" ]]; then + export MODEL_PATH="/mnt/numa1/models/deepseek-v4-flash/" + export SRT_SLURM_MODEL_PREFIX="deepseek-v4-flash" + else + export MODEL_PATH="/mnt/numa1/models/deepseek-v4-pro/" + export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" + fi else export MODEL_PATH=$MODEL fi @@ -52,8 +57,13 @@ elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then # Weights live on compute-node local NVMe (/mnt/numa1) — no Lustre # contention, fast startup. SRT_SLURM_MODEL_PREFIX matches the # model.path alias in our DSV4 recipes. - export MODEL_PATH="/mnt/numa1/models/deepseek-v4-pro/" - export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" + if [[ $MODEL == "deepseek-ai/DeepSeek-V4-Flash" ]]; then + export MODEL_PATH="/mnt/numa1/models/deepseek-v4-flash/" + export SRT_SLURM_MODEL_PREFIX="deepseek-v4-flash" + else + export MODEL_PATH="/mnt/numa1/models/deepseek-v4-pro/" + export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" + fi else echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4" exit 1 From 16d77dc0f3d4321f75a2a902cf1e9af778419b30 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 26 May 2026 15:15:10 -0700 Subject: [PATCH 33/37] fix(profile): point GB200 Flash to Lustre weights --- runners/launch_gb200-nv.sh | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 3ccb772f0..e12767899 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -16,11 +16,10 @@ if [[ $FRAMEWORK == "dynamo-sglang" ]]; then export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528-fp4-v2/" export SRT_SLURM_MODEL_PREFIX="dsr1-fp4" elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then - # Same compute-node-local NVMe path as the dynamo-vllm dsv4 - # branch — see that branch for rationale. SRT_SLURM_MODEL_PREFIX - # matches the model.path alias in our DSV4 sglang recipes. + # SRT_SLURM_MODEL_PREFIX matches the model.path alias in our DSV4 + # sglang recipes. if [[ $MODEL == "deepseek-ai/DeepSeek-V4-Flash" ]]; then - export MODEL_PATH="/mnt/numa1/models/deepseek-v4-flash/" + export MODEL_PATH="/mnt/lustre01/users/sa-shared/DeepSeek-V4-Flash/" export SRT_SLURM_MODEL_PREFIX="deepseek-v4-flash" else export MODEL_PATH="/mnt/numa1/models/deepseek-v4-pro/" @@ -54,11 +53,10 @@ elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then export MODEL_PATH="/mnt/lustre01/models/kimi-k2.5-nvfp4" export SRT_SLURM_MODEL_PREFIX="kimi-k2.5-nvfp4" elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then - # Weights live on compute-node local NVMe (/mnt/numa1) — no Lustre - # contention, fast startup. SRT_SLURM_MODEL_PREFIX matches the - # model.path alias in our DSV4 recipes. + # SRT_SLURM_MODEL_PREFIX matches the model.path alias in our DSV4 + # recipes. if [[ $MODEL == "deepseek-ai/DeepSeek-V4-Flash" ]]; then - export MODEL_PATH="/mnt/numa1/models/deepseek-v4-flash/" + export MODEL_PATH="/mnt/lustre01/users/sa-shared/DeepSeek-V4-Flash/" export SRT_SLURM_MODEL_PREFIX="deepseek-v4-flash" else export MODEL_PATH="/mnt/numa1/models/deepseek-v4-pro/" From 980316a71c2d24291c91462302490a329ac220d3 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 26 May 2026 18:19:16 -0700 Subject: [PATCH 34/37] fix(profile): use 4-GPU GB200 Flash profile --- .github/configs/nvidia-master.yaml | 10 +++++----- ... => agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml} | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) rename benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/{agg-gb200-flash-profile-16gpu-conc256-mtp3.yaml => agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml} (95%) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index c74fb1ff9..cb56f9a07 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -8680,8 +8680,8 @@ dsv4-fp4-gb200-dynamo-vllm-mtp2: ep: 8 dp-attn: true -# Dedicated profile point for the DeepSeek-V4 guide's 16-chip / global batch -# 256 shape: aggregated DEP16 on GB200, MTP3, conc=256. +# Dedicated profile point for a single GB200 node / global batch 256 shape: +# aggregated DEP4 on GB200, MTP3, conc=256. dsv4-flash-fp4-gb200-dynamo-vllm-mtp3-profile: image: vllm/vllm-openai:v0.21.0-ubuntu2404 model: deepseek-ai/DeepSeek-V4-Flash @@ -8699,12 +8699,12 @@ dsv4-flash-fp4-gb200-dynamo-vllm-mtp3-profile: - conc-list: [256] spec-decoding: mtp prefill: - num-worker: 16 + num-worker: 4 tp: 1 - ep: 16 + ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-16gpu-conc256-mtp3.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml" decode: num-worker: 0 tp: 1 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-16gpu-conc256-mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml similarity index 95% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-16gpu-conc256-mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml index afbeaa659..e5c39f8af 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-16gpu-conc256-mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml @@ -1,4 +1,4 @@ -name: "svf-vllm-agg-gb200-flash-profile-16gpu-conc256-mtp3" +name: "svf-vllm-agg-gb200-flash-profile-4gpu-conc256-mtp3" model: path: "deepseek-v4-flash" @@ -21,9 +21,9 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - agg_nodes: 4 + agg_nodes: 1 agg_workers: 1 - gpus_per_agg: 16 + gpus_per_agg: 4 frontend: type: dynamo @@ -57,7 +57,7 @@ backend: tensor-parallel-size: 1 pipeline-parallel-size: 1 data-parallel-hybrid-lb: true - data-parallel-size: 16 + data-parallel-size: 4 data-parallel-rpc-port: 13345 enable-expert-parallel: true enable-ep-weight-filter: true From 47d860d427dc918bc3a970b179c3cfe6dc048337 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 26 May 2026 20:17:58 -0700 Subject: [PATCH 35/37] fix: avoid stale profile trace reuse --- .github/workflows/profile.yml | 74 +++++++++++++++++++++++++++-------- runners/launch_gb200-nv.sh | 1 + 2 files changed, 59 insertions(+), 16 deletions(-) diff --git a/.github/workflows/profile.yml b/.github/workflows/profile.yml index b545be9f1..e2d08430f 100644 --- a/.github/workflows/profile.yml +++ b/.github/workflows/profile.yml @@ -228,6 +228,10 @@ jobs: export RESULT_FILENAME="${res_name}" echo "RESULT_FILENAME=${res_name}" >> "$GITHUB_ENV" + echo "Removing stale profile artifacts from previous runs" + rm -rf LOGS + rm -f profile_*.trace.json.gz multinode_server_logs.tar.gz + bash ./runners/launch_${RUNNER_NAME%%_*}.sh if [ ! -f "${res_name}.json" ]; then @@ -246,26 +250,22 @@ jobs: from pathlib import Path root = Path("LOGS") - candidates = [ - p for p in root.rglob("*") - if p.is_file() and ( - p.name.endswith(".trace.json") - or p.name.endswith(".trace.json.gz") - or p.name.endswith(".pt.trace.json") - or p.name.endswith(".pt.trace.json.gz") - or p.name.endswith(".json.gz") - or p.name.endswith(".json") - ) - ] - candidates = [ - p for p in candidates - if not p.name.startswith("results_") and "profile_export" not in p.name - ] + + def is_trace_candidate(path: Path) -> bool: + name = path.name + if name.startswith("results_") or "profile_export" in name: + return False + if name.endswith((".trace.json", ".trace.json.gz", ".pt.trace.json", ".pt.trace.json.gz")): + return True + return "trace" in name and name.endswith((".json", ".json.gz")) + + candidates = [p for p in root.rglob("*") if p.is_file() and is_trace_candidate(p)] if candidates: - print(max(candidates, key=lambda p: p.stat().st_size)) + print(max(candidates, key=lambda p: (p.stat().st_mtime_ns, p.stat().st_size))) PY )" if [ -n "$trace_candidate" ] && [ -f "$trace_candidate" ]; then + echo "Selected profile trace candidate: $trace_candidate" if [[ "$trace_candidate" == *.gz ]]; then cp "$trace_candidate" "$trace_path" else @@ -275,6 +275,48 @@ jobs: fi if [ -f "$trace_path" ]; then + echo "Profile trace prepared: $trace_path" + ls -lh "$trace_path" + sha256sum "$trace_path" + python3 - "$trace_path" <<'PY' + import gzip + import os + import re + import sys + + trace_path = sys.argv[1] + expected = set() + worker_gpus = [] + for workers_key, tp_key in ( + ("PREFILL_NUM_WORKERS", "PREFILL_TP"), + ("DECODE_NUM_WORKERS", "DECODE_TP"), + ): + workers = os.environ.get(workers_key) + tp = os.environ.get(tp_key) + if workers and workers.isdigit() and tp and tp.isdigit(): + gpus = int(workers) * int(tp) + if gpus: + expected.add(gpus) + worker_gpus.append(gpus) + if len(worker_gpus) > 1: + expected.add(sum(worker_gpus)) + + opener = gzip.open if trace_path.endswith(".gz") else open + with opener(trace_path, "rt", errors="replace") as f: + prefix = f.read(1024 * 1024) + + if '"traceEvents"' not in prefix: + raise SystemExit(f"{trace_path} does not look like a Perfetto trace: traceEvents key not found near start") + + match = re.search(r'"world_size"\s*:\s*(\d+)', prefix) + if expected and match: + world_size = int(match.group(1)) + if world_size not in expected: + allowed = ", ".join(str(v) for v in sorted(expected)) + raise SystemExit( + f"{trace_path} has distributed world_size={world_size}, expected one of: {allowed}" + ) + PY echo "trace=$trace_path" >> "$GITHUB_OUTPUT" if [ "${FRAMEWORK}" = "sglang" ]; then # Try to locate corresponding TP-0 traces produced by SGLang profiler diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index e12767899..ed4824ef5 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -304,6 +304,7 @@ echo "Collecting results..." if [ -d "$LOGS_DIR" ]; then echo "Found logs directory: $LOGS_DIR" + rm -rf "$GITHUB_WORKSPACE/LOGS" cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . else From f44c2242809393717a4ca49020dc76af71626f35 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 26 May 2026 21:36:30 -0700 Subject: [PATCH 36/37] fix: prioritize decode in gb200 flash profile --- .github/configs/nvidia-master.yaml | 2 +- .../agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index cb56f9a07..72a8ca70f 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -8694,7 +8694,7 @@ dsv4-flash-fp4-gb200-dynamo-vllm-mtp3-profile: scenarios: fixed-seq-len: - isl: 8192 - osl: 2048 + osl: 256 search-space: - conc-list: [256] spec-decoding: mtp diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml index e5c39f8af..a3acb5407 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml @@ -65,11 +65,11 @@ backend: speculative-config: '{"method":"mtp","num_speculative_tokens":3}' attention-config: '{"use_fp4_indexer_cache":true}' tokenizer-mode: deepseek_v4 - max-model-len: 10496 + max-model-len: 8704 max-num-seqs: 256 - max-num-batched-tokens: 256 + max-num-batched-tokens: 2048 max-cudagraph-capture-size: 256 - profiler-config: '{"profiler":"torch","torch_profiler_dir":"/logs/profiles/agg","ignore_frontend":true,"delay_iterations":2304,"max_iterations":1,"active_iterations":1,"torch_profiler_with_stack":false}' + profiler-config: '{"profiler":"torch","torch_profiler_dir":"/logs/profiles/agg","ignore_frontend":true,"delay_iterations":1152,"max_iterations":1,"active_iterations":1,"torch_profiler_with_stack":false}' trust-remote-code: true no-enable-prefix-caching: true no-enable-flashinfer-autotune: true @@ -84,13 +84,13 @@ backend: profiling: type: "torch" aggregated: - start_step: 2304 - stop_step: 2305 + start_step: 1152 + stop_step: 1153 benchmark: type: "sa-bench" isl: 8192 - osl: 2048 + osl: 256 concurrencies: "256" req_rate: "inf" num_prompts_mult: 1 From 4ae1fe68bb4e008576346bd71f86be8de07ad63d Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 26 May 2026 22:57:50 -0700 Subject: [PATCH 37/37] fix: profile final gb200 flash decode step --- .../8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml index a3acb5407..00cf06b78 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml @@ -69,7 +69,7 @@ backend: max-num-seqs: 256 max-num-batched-tokens: 2048 max-cudagraph-capture-size: 256 - profiler-config: '{"profiler":"torch","torch_profiler_dir":"/logs/profiles/agg","ignore_frontend":true,"delay_iterations":1152,"max_iterations":1,"active_iterations":1,"torch_profiler_with_stack":false}' + profiler-config: '{"profiler":"torch","torch_profiler_dir":"/logs/profiles/agg","ignore_frontend":true,"delay_iterations":1296,"max_iterations":1,"active_iterations":1,"torch_profiler_with_stack":false}' trust-remote-code: true no-enable-prefix-caching: true no-enable-flashinfer-autotune: true @@ -84,8 +84,8 @@ backend: profiling: type: "torch" aggregated: - start_step: 1152 - stop_step: 1153 + start_step: 1296 + stop_step: 1297 benchmark: type: "sa-bench"