From dd551f3b7144673127076fab687f91c7a1af7086 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Wed, 20 May 2026 23:00:30 -0700
Subject: [PATCH 01/37] fix(profile): upload staged trace path

---
 .github/workflows/profile.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/profile.yml b/.github/workflows/profile.yml
index 8152d47a5..cd2a1d24d 100644
--- a/.github/workflows/profile.yml
+++ b/.github/workflows/profile.yml
@@ -171,7 +171,7 @@ jobs:
             exit 1
           fi
 
-          trace_path="profile_${res_name}.trace.json.gz"
+          trace_path="/workspace/profile_${res_name}.trace.json.gz"
           if [ -f "$trace_path" ]; then
             echo "trace=$trace_path" >> "$GITHUB_OUTPUT"
             if [ "${FRAMEWORK}" = "sglang" ]; then
@@ -206,7 +206,7 @@ jobs:
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
           name: profile_${{ env.RESULT_FILENAME }}
-          path: profile_${{ env.RESULT_FILENAME }}.trace.json.gz
+          path: ${{ steps.run.outputs.trace }}
           if-no-files-found: ignore
 
       - name: Upload TP-0-DECODE trace as artifact

From 7c0cb092f81e7f8d248b737fb34ba3351721bcca Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Wed, 20 May 2026 23:01:33 -0700
Subject: [PATCH 02/37] feat(profile): add B300 DeepSeek V4 Flash config

---
 .github/configs/nvidia-master.yaml | 18 ++++++++++++++++++
 runners/launch_b300-nv.sh          |  2 +-
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 4a683d372..904fa19e3 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2038,6 +2038,24 @@ dsv4-fp4-b300-sglang:
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 }
 
+# Targeted single-point Flash config for profile.yml. Keep the existing Pro
+# sweep entry above unchanged; this profile-only key reuses the same B300
+# SGLang launch path at the 1k1k, conc=64 point.
+dsv4-flash-fp4-b300-sglang:
+  image: lmsysorg/sglang:deepseek-v4-b300@sha256:2fec8d7958bb0d53b50d7bf04d6ae6a7de8a35503775826e0550a45dd8c3ee15
+  model: deepseek-ai/DeepSeek-V4-Flash
+  model-prefix: dsv4
+  runner: b300
+  precision: fp4
+  framework: sglang
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 4, ep: 1, conc-start: 64, conc-end: 64 }
+
   # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is
   # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by
   # DP_ATTENTION:
diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
index cca8b4ab0..fcc630db9 100644
--- a/runners/launch_b300-nv.sh
+++ b/runners/launch_b300-nv.sh
@@ -287,7 +287,7 @@ else
     HF_HUB_CACHE_MOUNT="/data/models"
     if [[ "$MODEL" == "Qwen/Qwen3.5-397B-A17B-FP8" ]]; then
         export MODEL="$HF_HUB_CACHE_MOUNT/${MODEL#*/}"
-    elif [[ "$MODEL_PREFIX" == "dsv4" ]]; then
+    elif [[ "$MODEL_PREFIX" == "dsv4" && "$MODEL" == "deepseek-ai/DeepSeek-V4-Pro" ]]; then
         export MODEL="$HF_HUB_CACHE_MOUNT/dsv4-pro"
     fi
     SQUASH_FILE="/data/home/sa-shared/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"

From e2639e50340d871e40fdca015549862892fc5dc6 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Wed, 20 May 2026 23:28:25 -0700
Subject: [PATCH 03/37] fix(profile): stage relay trace in checkout

---
 .github/workflows/profile.yml | 2 +-
 benchmarks/benchmark_lib.sh   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/profile.yml b/.github/workflows/profile.yml
index cd2a1d24d..ac6d66bd9 100644
--- a/.github/workflows/profile.yml
+++ b/.github/workflows/profile.yml
@@ -171,7 +171,7 @@ jobs:
             exit 1
           fi
 
-          trace_path="/workspace/profile_${res_name}.trace.json.gz"
+          trace_path="profile_${res_name}.trace.json.gz"
           if [ -f "$trace_path" ]; then
             echo "trace=$trace_path" >> "$GITHUB_OUTPUT"
             if [ "${FRAMEWORK}" = "sglang" ]; then
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index cfd30cd04..23e3e016f 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -508,7 +508,7 @@ move_profile_trace_for_relay() {
         return 0
     fi
 
-    local dest_trace="/workspace/profile_${RESULT_FILENAME}.trace.json.gz"
+    local dest_trace="$PWD/profile_${RESULT_FILENAME}.trace.json.gz"
     if [[ "$trace_file" == *.gz ]]; then
         cp -f "$trace_file" "$dest_trace"
     else

From 5e87c8c1e9d66998a92a23fcb52e27da681d9305 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Wed, 20 May 2026 23:42:33 -0700
Subject: [PATCH 04/37] feat(profile): add Flash DEP MTP3 profile

---
 .github/configs/nvidia-master.yaml             | 18 ++++++++++++++++++
 .../single_node/dsv4_fp4_b300_sglang_mtp.sh    | 10 ++++++++--
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 904fa19e3..bee175be0 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2056,6 +2056,24 @@ dsv4-flash-fp4-b300-sglang:
       search-space:
       - { tp: 4, ep: 1, conc-start: 64, conc-end: 64 }
 
+# Targeted Flash MTP profile: DEP4 at the same 1k1k conc=64 point as the
+# non-MTP Flash profile above. The shared SGLang MTP launcher selects the
+# Flash-only (steps=2, draft-tokens=3) speculative settings for this model.
+dsv4-flash-fp4-b300-sglang-mtp:
+  image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3
+  model: deepseek-ai/DeepSeek-V4-Flash
+  model-prefix: dsv4
+  runner: b300
+  precision: fp4
+  framework: sglang
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 64, spec-decoding: mtp }
+
   # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is
   # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by
   # DP_ATTENTION:
diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
index 03102778d..b7aad47f7 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
@@ -77,11 +77,17 @@ if [ "${DP_ATTENTION}" = "true" ]; then
     export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
     export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
     export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
+    SPECULATIVE_NUM_STEPS=1
+    SPECULATIVE_NUM_DRAFT_TOKENS=2
+    if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
+        SPECULATIVE_NUM_STEPS=2
+        SPECULATIVE_NUM_DRAFT_TOKENS=3
+    fi
     SPEC_FLAGS=(
         --speculative-algorithm EAGLE
-        --speculative-num-steps 1
+        --speculative-num-steps "$SPECULATIVE_NUM_STEPS"
         --speculative-eagle-topk 1
-        --speculative-num-draft-tokens 2
+        --speculative-num-draft-tokens "$SPECULATIVE_NUM_DRAFT_TOKENS"
     )
     PARALLEL_ARGS=(
         --dp-size "$TP"

From b00f85508bccea78beabf6c2a910fb97f39f7b13 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Wed, 20 May 2026 23:45:26 -0700
Subject: [PATCH 05/37] fix(profile): push traces with repo token

---
 .github/workflows/profile.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/profile.yml b/.github/workflows/profile.yml
index ac6d66bd9..b99f25d1f 100644
--- a/.github/workflows/profile.yml
+++ b/.github/workflows/profile.yml
@@ -240,7 +240,7 @@ jobs:
           repository: SemiAnalysisAI/InferenceX-trace-storage
           path: storage
           ref: master
-          ssh-key: ${{ secrets.PROFILER_STORAGE_DEPLOY_KEY }}
+          token: ${{ secrets.REPO_PAT }}
           fetch-depth: 0
 
       - name: Push profile to storage repo

From a8df66c8cf97f25eed16cb671f1b23cf11eedc61 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Thu, 21 May 2026 00:09:33 -0700
Subject: [PATCH 06/37] fix(profile): align Flash MTP profiling steps

---
 benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
index b7aad47f7..04cbb6fd1 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
@@ -80,7 +80,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then
     SPECULATIVE_NUM_STEPS=1
     SPECULATIVE_NUM_DRAFT_TOKENS=2
     if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
-        SPECULATIVE_NUM_STEPS=2
+        SPECULATIVE_NUM_STEPS=3
         SPECULATIVE_NUM_DRAFT_TOKENS=3
     fi
     SPEC_FLAGS=(
@@ -117,6 +117,11 @@ else
     MAX_RUNNING_REQUESTS="$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))"
 fi
 
+PROFILE_ARGS=()
+if [[ "${PROFILE:-}" == "1" && "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
+    PROFILE_ARGS+=(--num-continuous-decode-steps 2)
+fi
+
 # Print all SGLANG_* env vars to both the CI step log and server.log so the
 # launch config is auditable from the result artifact alone.
 {
@@ -138,7 +143,8 @@ PYTHONNOUSERSITE=1 sglang serve \
     --mem-fraction-static "$MEM_FRACTION_STATIC" \
     --swa-full-tokens-ratio 0.1 \
     "${SPEC_FLAGS[@]}" \
-    "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 &
+    "${PARALLEL_ARGS[@]}" \
+    "${PROFILE_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 

From e78383e8422b8d7b71401fca34148ca789d0394b Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Thu, 21 May 2026 00:11:44 -0700
Subject: [PATCH 07/37] fix(profile): capture two SGL profiling steps

---
 benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh | 8 +-------
 utils/bench_serving/benchmark_serving.py           | 2 +-
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
index 04cbb6fd1..0ce1f016f 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
@@ -117,11 +117,6 @@ else
     MAX_RUNNING_REQUESTS="$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))"
 fi
 
-PROFILE_ARGS=()
-if [[ "${PROFILE:-}" == "1" && "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
-    PROFILE_ARGS+=(--num-continuous-decode-steps 2)
-fi
-
 # Print all SGLANG_* env vars to both the CI step log and server.log so the
 # launch config is auditable from the result artifact alone.
 {
@@ -143,8 +138,7 @@ PYTHONNOUSERSITE=1 sglang serve \
     --mem-fraction-static "$MEM_FRACTION_STATIC" \
     --swa-full-tokens-ratio 0.1 \
     "${SPEC_FLAGS[@]}" \
-    "${PARALLEL_ARGS[@]}" \
-    "${PROFILE_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 &
+    "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 
diff --git a/utils/bench_serving/benchmark_serving.py b/utils/bench_serving/benchmark_serving.py
index 011b413ac..0a79033fe 100644
--- a/utils/bench_serving/benchmark_serving.py
+++ b/utils/bench_serving/benchmark_serving.py
@@ -538,7 +538,7 @@ async def warmup_limited_req_fn():
                                          api_url=base_url + "/start_profile",
                                          prompt_len=test_prompt_len,
                                          output_len=test_output_len,
-                                         extra_body={"num_steps": 1, "merge_profiles": True, "profile_by_stage": True},
+                                         extra_body={"num_steps": 2, "merge_profiles": True, "profile_by_stage": True},
                                          logprobs=logprobs,
                                          best_of=best_of,
                                          multi_modal_content=test_mm_content,

From e3393afe4b3332d5b9c797a186884412570cccff Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Thu, 21 May 2026 10:41:02 -0700
Subject: [PATCH 08/37] fix(profile): enable B300 Flash vLLM traces

---
 .github/configs/nvidia-master.yaml            | 19 ++++++++++++++++++-
 benchmarks/single_node/dsv4_fp4_b300_vllm.sh  |  9 +++++++++
 .../single_node/dsv4_fp4_b300_vllm_mtp.sh     |  9 +++++++++
 3 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index bee175be0..5d98e5cf5 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2056,9 +2056,26 @@ dsv4-flash-fp4-b300-sglang:
       search-space:
       - { tp: 4, ep: 1, conc-start: 64, conc-end: 64 }
 
+# Targeted single-point Flash vLLM profile matching the SGLang profile point
+# above. Keep this narrow so profile.yml dispatches only the intended 1k1k run.
+dsv4-flash-fp4-b300-vllm:
+  image: vllm/vllm-openai:v0.21.0
+  model: deepseek-ai/DeepSeek-V4-Flash
+  model-prefix: dsv4
+  runner: b300
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 4, ep: 1, conc-start: 64, conc-end: 64 }
+
 # Targeted Flash MTP profile: DEP4 at the same 1k1k conc=64 point as the
 # non-MTP Flash profile above. The shared SGLang MTP launcher selects the
-# Flash-only (steps=2, draft-tokens=3) speculative settings for this model.
+# Flash-only (steps=3, draft-tokens=3) speculative settings for this model.
 dsv4-flash-fp4-b300-sglang-mtp:
   image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3
   model: deepseek-ai/DeepSeek-V4-Flash
diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
index 92d4bf4ad..8bf458ae4 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
@@ -47,6 +47,14 @@ if [ "${DP_ATTENTION}" = "true" ]; then
     MOE_ARGS=(--moe-backend deep_gemm_mega_moe)
 fi
 
+PROFILE_ARGS=()
+if [[ "${PROFILE:-}" == "1" ]]; then
+    PROFILE_ARGS=(
+        --profiler-config
+        "{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\"}"
+    )
+fi
+
 if [ "${DP_ATTENTION}" = "true" ]; then
     MAX_NUM_BATCHED_TOKENS=2048
 else
@@ -76,6 +84,7 @@ vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \
     --no-enable-prefix-caching \
     "${EP_ARGS[@]}" \
     "${MOE_ARGS[@]}" \
+    "${PROFILE_ARGS[@]}" \
     --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' \
     --attention_config.use_fp4_indexer_cache True \
     --tokenizer-mode deepseek_v4 \
diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
index cb41a9eb1..f91a62e12 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
@@ -44,6 +44,14 @@ else
     MAX_NUM_BATCHED_TOKENS=$(( ISL * 2 ))
 fi
 
+PROFILE_ARGS=()
+if [[ "${PROFILE:-}" == "1" ]]; then
+    PROFILE_ARGS=(
+        --profiler-config
+        "{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\"}"
+    )
+fi
+
 BENCHMARK_MAX_MODEL_LEN=$MAX_MODEL_LEN
 
 if [ "${EVAL_ONLY}" = "true" ]; then
@@ -69,6 +77,7 @@ vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \
     --no-enable-prefix-caching \
     "${EP_ARGS[@]}" \
     "${MOE_ARGS[@]}" \
+    "${PROFILE_ARGS[@]}" \
     --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' \
     --attention_config.use_fp4_indexer_cache True \
     --tokenizer-mode deepseek_v4 \

From 39f914bf06ed2066afb614b612d2ac6a55d9f93e Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Thu, 21 May 2026 13:37:39 -0700
Subject: [PATCH 09/37] feat(profile): add Flash vLLM MTP3 run

---
 .github/configs/nvidia-master.yaml              | 17 +++++++++++++++++
 .../single_node/dsv4_fp4_b300_vllm_mtp.sh       |  6 +++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 5d98e5cf5..0df7372bd 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2073,6 +2073,23 @@ dsv4-flash-fp4-b300-vllm:
       search-space:
       - { tp: 4, ep: 1, conc-start: 64, conc-end: 64 }
 
+# Targeted Flash vLLM MTP profile at the same single-point profile location.
+# The shared vLLM MTP launcher selects 3 speculative tokens for this model.
+dsv4-flash-fp4-b300-vllm-mtp:
+  image: vllm/vllm-openai:v0.21.0
+  model: deepseek-ai/DeepSeek-V4-Flash
+  model-prefix: dsv4
+  runner: b300
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 4, ep: 1, conc-start: 64, conc-end: 64, spec-decoding: mtp }
+
 # Targeted Flash MTP profile: DEP4 at the same 1k1k conc=64 point as the
 # non-MTP Flash profile above. The shared SGLang MTP launcher selects the
 # Flash-only (steps=3, draft-tokens=3) speculative settings for this model.
diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
index f91a62e12..44fe207d7 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
@@ -62,8 +62,12 @@ else
     SERVE_MAX_MODEL_LEN="$BENCHMARK_MAX_MODEL_LEN"
 fi
 
-# use 2 speculative tokens for all configs for now
+# Keep the existing Pro MTP profile at 2 speculative tokens; Flash uses the
+# requested 3-token MTP profile.
 NUM_SPEC_TOKENS=2
+if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
+    NUM_SPEC_TOKENS=3
+fi
 
 start_gpu_monitor
 

From f9d6523bb456324a1adf8912150024f64a718c39 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Thu, 21 May 2026 13:38:00 -0700
Subject: [PATCH 10/37] fix(profile): capture one profiling step

---
 utils/bench_serving/benchmark_serving.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/bench_serving/benchmark_serving.py b/utils/bench_serving/benchmark_serving.py
index 0a79033fe..011b413ac 100644
--- a/utils/bench_serving/benchmark_serving.py
+++ b/utils/bench_serving/benchmark_serving.py
@@ -538,7 +538,7 @@ async def warmup_limited_req_fn():
                                          api_url=base_url + "/start_profile",
                                          prompt_len=test_prompt_len,
                                          output_len=test_output_len,
-                                         extra_body={"num_steps": 2, "merge_profiles": True, "profile_by_stage": True},
+                                         extra_body={"num_steps": 1, "merge_profiles": True, "profile_by_stage": True},
                                          logprobs=logprobs,
                                          best_of=best_of,
                                          multi_modal_content=test_mm_content,

From 2e2f87623c422325c9a3ef0b0a8b3f864653c135 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Thu, 21 May 2026 13:45:06 -0700
Subject: [PATCH 11/37] fix(profile): switch Flash vLLM MTP to DEP8

---
 .github/configs/nvidia-master.yaml | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 0df7372bd..6605dd90b 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2073,8 +2073,9 @@ dsv4-flash-fp4-b300-vllm:
       search-space:
       - { tp: 4, ep: 1, conc-start: 64, conc-end: 64 }
 
-# Targeted Flash vLLM MTP profile at the same single-point profile location.
-# The shared vLLM MTP launcher selects 3 speculative tokens for this model.
+# Targeted Flash vLLM MTP DEP8 profile at the same single-point profile
+# location. The shared launcher maps dp-attn=true to DP without TP, and selects
+# 3 speculative tokens for this model.
 dsv4-flash-fp4-b300-vllm-mtp:
   image: vllm/vllm-openai:v0.21.0
   model: deepseek-ai/DeepSeek-V4-Flash
@@ -2088,7 +2089,7 @@ dsv4-flash-fp4-b300-vllm-mtp:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 4, ep: 1, conc-start: 64, conc-end: 64, spec-decoding: mtp }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64, spec-decoding: mtp }
 
 # Targeted Flash MTP profile: DEP4 at the same 1k1k conc=64 point as the
 # non-MTP Flash profile above. The shared SGLang MTP launcher selects the

From cd160ee5b3cf1cd7570737835561ea35a8668109 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Thu, 21 May 2026 14:53:48 -0700
Subject: [PATCH 12/37] fix(profile): rerun Flash vLLM MTP at conc8

---
 .github/configs/nvidia-master.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 6605dd90b..346c27531 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2089,7 +2089,7 @@ dsv4-flash-fp4-b300-vllm-mtp:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64, spec-decoding: mtp }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 8, conc-end: 8, spec-decoding: mtp }
 
 # Targeted Flash MTP profile: DEP4 at the same 1k1k conc=64 point as the
 # non-MTP Flash profile above. The shared SGLang MTP launcher selects the

From 9b534f7b1311ba009d427345884a42ca9a13cda4 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Thu, 21 May 2026 16:07:01 -0700
Subject: [PATCH 13/37] fix(profile): disable Flash vLLM MTP cudagraphs

---
 benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
index 44fe207d7..25a7f8c15 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
@@ -52,6 +52,14 @@ if [[ "${PROFILE:-}" == "1" ]]; then
     )
 fi
 
+COMPILATION_ARGS=(
+    --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}'
+    --max-cudagraph-capture-size 2048
+)
+if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
+    COMPILATION_ARGS=(--compilation-config '{"cudagraph_mode":"NONE","custom_ops":["all"]}')
+fi
+
 BENCHMARK_MAX_MODEL_LEN=$MAX_MODEL_LEN
 
 if [ "${EVAL_ONLY}" = "true" ]; then
@@ -82,13 +90,12 @@ vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \
     "${EP_ARGS[@]}" \
     "${MOE_ARGS[@]}" \
     "${PROFILE_ARGS[@]}" \
-    --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' \
+    "${COMPILATION_ARGS[@]}" \
     --attention_config.use_fp4_indexer_cache True \
     --tokenizer-mode deepseek_v4 \
     --tool-call-parser deepseek_v4 \
     --enable-auto-tool-choice \
     --reasoning-parser deepseek_v4 \
-    --max-cudagraph-capture-size 2048 \
     --speculative-config "{\"method\": \"mtp\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS}" \
     --max-model-len "$SERVE_MAX_MODEL_LEN" \
     --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" > "$SERVER_LOG" 2>&1 &

From 4f1f0fa8e0ead6ff4cecf831e64f1549161de2cc Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Thu, 21 May 2026 19:05:08 -0700
Subject: [PATCH 14/37] fix(profile): limit Flash vLLM trace to decode steps

---
 benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
index 25a7f8c15..0beb9e8c9 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
@@ -46,9 +46,13 @@ fi
 
 PROFILE_ARGS=()
 if [[ "${PROFILE:-}" == "1" ]]; then
+    PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\"}"
+    if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
+        PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\",\"ignore_frontend\":true,\"delay_iterations\":1,\"max_iterations\":2,\"active_iterations\":2,\"torch_profiler_with_stack\":false}"
+    fi
     PROFILE_ARGS=(
         --profiler-config
-        "{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\"}"
+        "$PROFILER_CONFIG"
     )
 fi
 

From fc21e40e15cd092b249902a39fdc42b628c9a144 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Thu, 21 May 2026 20:48:26 -0700
Subject: [PATCH 15/37] fix(profile): disable Flash vLLM torch compile

---
 benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
index 0beb9e8c9..d334dc222 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
@@ -61,7 +61,7 @@ COMPILATION_ARGS=(
     --max-cudagraph-capture-size 2048
 )
 if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
-    COMPILATION_ARGS=(--compilation-config '{"cudagraph_mode":"NONE","custom_ops":["all"]}')
+    COMPILATION_ARGS=(--compilation-config '{"mode":0,"cudagraph_mode":"NONE","custom_ops":["all"]}')
 fi
 
 BENCHMARK_MAX_MODEL_LEN=$MAX_MODEL_LEN

From 5eb4b6535ca5231e5ba607f05ae7aa76bb49d9d1 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Thu, 21 May 2026 21:06:26 -0700
Subject: [PATCH 16/37] fix(profile): capture three Flash vLLM decode steps

---
 benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
index d334dc222..831d78e77 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
@@ -48,7 +48,7 @@ PROFILE_ARGS=()
 if [[ "${PROFILE:-}" == "1" ]]; then
     PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\"}"
     if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
-        PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\",\"ignore_frontend\":true,\"delay_iterations\":1,\"max_iterations\":2,\"active_iterations\":2,\"torch_profiler_with_stack\":false}"
+        PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\",\"ignore_frontend\":true,\"delay_iterations\":1,\"max_iterations\":3,\"active_iterations\":3,\"torch_profiler_with_stack\":false}"
     fi
     PROFILE_ARGS=(
         --profiler-config

From 39f3b7c6354b1dab5719b97fa82820a303169f06 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Thu, 21 May 2026 21:18:16 -0700
Subject: [PATCH 17/37] fix(profile): enable Flash vLLM cudagraphs

---
 benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
index 831d78e77..bd5cdc4ba 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
@@ -61,7 +61,10 @@ COMPILATION_ARGS=(
     --max-cudagraph-capture-size 2048
 )
 if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
-    COMPILATION_ARGS=(--compilation-config '{"mode":0,"cudagraph_mode":"NONE","custom_ops":["all"]}')
+    COMPILATION_ARGS=(
+        --compilation-config '{"mode":0,"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}'
+        --max-cudagraph-capture-size 2048
+    )
 fi
 
 BENCHMARK_MAX_MODEL_LEN=$MAX_MODEL_LEN

From ef11755fb9978a0e16ae1f129296f274f81db015 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Thu, 21 May 2026 21:48:59 -0700
Subject: [PATCH 18/37] fix(profile): capture eight Flash vLLM decode steps

---
 benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
index bd5cdc4ba..e7dc28348 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
@@ -48,7 +48,7 @@ PROFILE_ARGS=()
 if [[ "${PROFILE:-}" == "1" ]]; then
     PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\"}"
     if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
-        PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\",\"ignore_frontend\":true,\"delay_iterations\":1,\"max_iterations\":3,\"active_iterations\":3,\"torch_profiler_with_stack\":false}"
+        PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\",\"ignore_frontend\":true,\"delay_iterations\":1,\"max_iterations\":8,\"active_iterations\":8,\"torch_profiler_with_stack\":false}"
     fi
     PROFILE_ARGS=(
         --profiler-config

From 22db6e26906d802111cd9356cb268c04b8be7bb3 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Thu, 21 May 2026 22:42:54 -0700
Subject: [PATCH 19/37] fix(profile): use compatible Flash cudagraph config

---
 benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
index e7dc28348..250e62c01 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
@@ -62,7 +62,7 @@ COMPILATION_ARGS=(
 )
 if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
     COMPILATION_ARGS=(
-        --compilation-config '{"mode":0,"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}'
+        --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}'
         --max-cudagraph-capture-size 2048
     )
 fi

From 81f5a8aaff10c6a09e0025250f9856df98849431 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Thu, 21 May 2026 23:11:07 -0700
Subject: [PATCH 20/37] fix(profile): run five Flash vLLM decode steps

---
 benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
index 250e62c01..9fed4a3b5 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
@@ -67,6 +67,11 @@ if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
     )
 fi
 
+SCHEDULER_ARGS=()
+if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
+    SCHEDULER_ARGS=(--num-scheduler-steps 5)
+fi
+
 BENCHMARK_MAX_MODEL_LEN=$MAX_MODEL_LEN
 
 if [ "${EVAL_ONLY}" = "true" ]; then
@@ -98,6 +103,7 @@ vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \
     "${MOE_ARGS[@]}" \
     "${PROFILE_ARGS[@]}" \
     "${COMPILATION_ARGS[@]}" \
+    "${SCHEDULER_ARGS[@]}" \
     --attention_config.use_fp4_indexer_cache True \
     --tokenizer-mode deepseek_v4 \
     --tool-call-parser deepseek_v4 \

From a37cb7340b0a0138be7b6c4c56e6e28435418568 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Fri, 22 May 2026 08:38:10 -0700
Subject: [PATCH 21/37] fix(profile): use vLLM profiler window for Flash steps

---
 benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
index 9fed4a3b5..2a8001eb0 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
@@ -48,7 +48,7 @@ PROFILE_ARGS=()
 if [[ "${PROFILE:-}" == "1" ]]; then
     PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\"}"
     if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
-        PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\",\"ignore_frontend\":true,\"delay_iterations\":1,\"max_iterations\":8,\"active_iterations\":8,\"torch_profiler_with_stack\":false}"
+        PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\",\"ignore_frontend\":true,\"delay_iterations\":1,\"max_iterations\":5,\"active_iterations\":5,\"torch_profiler_with_stack\":false}"
     fi
     PROFILE_ARGS=(
         --profiler-config
@@ -67,11 +67,6 @@ if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
     )
 fi
 
-SCHEDULER_ARGS=()
-if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
-    SCHEDULER_ARGS=(--num-scheduler-steps 5)
-fi
-
 BENCHMARK_MAX_MODEL_LEN=$MAX_MODEL_LEN
 
 if [ "${EVAL_ONLY}" = "true" ]; then
@@ -103,7 +98,6 @@ vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \
     "${MOE_ARGS[@]}" \
     "${PROFILE_ARGS[@]}" \
     "${COMPILATION_ARGS[@]}" \
-    "${SCHEDULER_ARGS[@]}" \
     --attention_config.use_fp4_indexer_cache True \
     --tokenizer-mode deepseek_v4 \
     --tool-call-parser deepseek_v4 \

From 6f89c90ec406c6ffc4e1084eefdc89be088ad59f Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Fri, 22 May 2026 08:59:06 -0700
Subject: [PATCH 22/37] fix(profile): limit Flash vLLM request length

---
 benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
index 2a8001eb0..22de88435 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
@@ -68,6 +68,15 @@ if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
 fi
 
 BENCHMARK_MAX_MODEL_LEN=$MAX_MODEL_LEN
+BENCHMARK_OUTPUT_LEN=$OSL
+BENCHMARK_NUM_PROMPTS=$((CONC * 10))
+BENCHMARK_MAX_CONCURRENCY=$CONC
+
+if [[ "${PROFILE:-}" == "1" && "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
+    BENCHMARK_OUTPUT_LEN=5
+    BENCHMARK_NUM_PROMPTS=1
+    BENCHMARK_MAX_CONCURRENCY=1
+fi
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     EVAL_MAX_MODEL_LEN=$(compute_eval_context_length "$MODEL" "$BENCHMARK_MAX_MODEL_LEN")
@@ -120,10 +129,10 @@ run_benchmark_serving \
     --port "$PORT" \
     --backend vllm \
     --input-len "$ISL" \
-    --output-len "$OSL" \
+    --output-len "$BENCHMARK_OUTPUT_LEN" \
     --random-range-ratio "$RANDOM_RANGE_RATIO" \
-    --num-prompts "$((CONC * 10))" \
-    --max-concurrency "$CONC" \
+    --num-prompts "$BENCHMARK_NUM_PROMPTS" \
+    --max-concurrency "$BENCHMARK_MAX_CONCURRENCY" \
     --result-filename "$RESULT_FILENAME" \
     --result-dir /workspace/ \
     --trust-remote-code \

From e9bfbf9838687d4a3aa4b578590507e75fa24b41 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Fri, 22 May 2026 09:45:44 -0700
Subject: [PATCH 23/37] fix(results): guard zero tpot interactivity

---
 utils/process_result.py      |  4 +++-
 utils/test_process_result.py | 16 ++++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/utils/process_result.py b/utils/process_result.py
index 4603287bc..2010c09ff 100644
--- a/utils/process_result.py
+++ b/utils/process_result.py
@@ -1,5 +1,6 @@
 import sys
 import json
+import math
 import os
 from pathlib import Path
 
@@ -128,8 +129,9 @@ def get_required_env_vars(required_vars):
     if key.endswith('ms'):
         data[key.replace('_ms', '')] = float(value) / 1000.0
     if 'tpot' in key:
+        tpot_ms = float(value)
         data[key.replace('_ms', '').replace(
-            'tpot', 'intvty')] = 1000.0 / float(value)
+            'tpot', 'intvty')] = 1000.0 / tpot_ms if math.isfinite(tpot_ms) and tpot_ms > 0 else 0.0
 
 print(json.dumps(data, indent=2))
 
diff --git a/utils/test_process_result.py b/utils/test_process_result.py
index e3903c6e6..edeba20be 100644
--- a/utils/test_process_result.py
+++ b/utils/test_process_result.py
@@ -342,6 +342,22 @@ def test_tpot_to_interactivity_conversion(self, tmp_path, single_node_env_vars):
         assert output_data["intvty_p50"] == pytest.approx(50.0)
         assert output_data["intvty_p99"] == pytest.approx(20.0)
 
+    def test_zero_tpot_interactivity_is_guarded(self, tmp_path, single_node_env_vars):
+        """Test that zero TPOT fields do not crash interactivity conversion."""
+        benchmark_result = {
+            "model_id": "test-model",
+            "max_concurrency": 1,
+            "total_token_throughput": 1000.0,
+            "output_throughput": 800.0,
+            "mean_tpot_ms": 0.0,
+        }
+
+        result = run_script(tmp_path, single_node_env_vars, benchmark_result)
+        assert result.returncode == 0, f"Script failed: {result.stderr}"
+
+        output_data = json.loads(result.stdout)
+        assert output_data["mean_intvty"] == pytest.approx(0.0)
+
     def test_throughput_per_gpu_single_node(self, tmp_path, single_node_env_vars):
         """Test throughput per GPU calculation for single node."""
         benchmark_result = {

From 64cbdc34b0cf097a69d1f5c1f827b4aec5154d71 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Mon, 25 May 2026 22:23:11 -0700
Subject: [PATCH 24/37] fix(profile): simulate Flash offline decode batch

---
 benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
index 22de88435..780a8427d 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
@@ -48,7 +48,7 @@ PROFILE_ARGS=()
 if [[ "${PROFILE:-}" == "1" ]]; then
     PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\"}"
     if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
-        PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\",\"ignore_frontend\":true,\"delay_iterations\":1,\"max_iterations\":5,\"active_iterations\":5,\"torch_profiler_with_stack\":false}"
+        PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\",\"ignore_frontend\":true,\"delay_iterations\":1,\"max_iterations\":1,\"active_iterations\":1,\"torch_profiler_with_stack\":false}"
     fi
     PROFILE_ARGS=(
         --profiler-config
@@ -73,9 +73,9 @@ BENCHMARK_NUM_PROMPTS=$((CONC * 10))
 BENCHMARK_MAX_CONCURRENCY=$CONC
 
 if [[ "${PROFILE:-}" == "1" && "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
-    BENCHMARK_OUTPUT_LEN=5
-    BENCHMARK_NUM_PROMPTS=1
-    BENCHMARK_MAX_CONCURRENCY=1
+    BENCHMARK_OUTPUT_LEN=1
+    BENCHMARK_NUM_PROMPTS=256
+    BENCHMARK_MAX_CONCURRENCY=256
 fi
 
 if [ "${EVAL_ONLY}" = "true" ]; then

From 6a824fcf5f3e2e211ad511049c3256de60df2789 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Mon, 25 May 2026 22:27:37 -0700
Subject: [PATCH 25/37] fix(profile): warm up Flash decode batch before trace

---
 benchmarks/benchmark_lib.sh                      | 12 +++++++++++-
 benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh |  3 +++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 23e3e016f..e6980d0c1 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -178,6 +178,7 @@ wait_for_server_ready() {
 #   --max-concurrency: Max concurrency
 #   --result-filename: Result filename without extension
 #   --result-dir: Result directory
+#   --num-warmups: Optional warmup request count before benchmark/profile
 #   --use-chat-template: Optional flag to enable chat template
 #   --dsv4: Optional flag to use the DeepSeek-V4 chat template
 #           (encoding_dsv4.py) instead of the tokenizer's built-in jinja
@@ -204,6 +205,7 @@ run_benchmark_serving() {
     local result_filename=""
     local result_dir=""
     local workspace_dir=""
+    local num_warmups=""
     local use_chat_template=false
     local dsv4=false
     local trust_remote_code=false
@@ -259,6 +261,10 @@ run_benchmark_serving() {
                 workspace_dir="$2"
                 shift 2
                 ;;
+            --num-warmups)
+                num_warmups="$2"
+                shift 2
+                ;;
             --use-chat-template)
                 use_chat_template=true
                 shift
@@ -341,6 +347,10 @@ run_benchmark_serving() {
         num_prompts="$max_concurrency"
     fi
 
+    if [[ -z "$num_warmups" ]]; then
+        num_warmups="$((2 * max_concurrency))"
+    fi
+
     # Build benchmark command
     local benchmark_cmd=(
         python3 "$workspace_dir/utils/bench_serving/benchmark_serving.py"
@@ -357,7 +367,7 @@ run_benchmark_serving() {
         --ignore-eos
         "${profile_flag[@]}"
         --save-result
-        --num-warmups "$((2 * max_concurrency))" \
+        --num-warmups "$num_warmups" \
         --percentile-metrics 'ttft,tpot,itl,e2el'
         --result-dir "$result_dir"
         --result-filename "$result_filename.json"
diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
index 780a8427d..451ece01f 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
@@ -71,11 +71,13 @@ BENCHMARK_MAX_MODEL_LEN=$MAX_MODEL_LEN
 BENCHMARK_OUTPUT_LEN=$OSL
 BENCHMARK_NUM_PROMPTS=$((CONC * 10))
 BENCHMARK_MAX_CONCURRENCY=$CONC
+BENCHMARK_NUM_WARMUPS=$((2 * BENCHMARK_MAX_CONCURRENCY))
 
 if [[ "${PROFILE:-}" == "1" && "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
     BENCHMARK_OUTPUT_LEN=1
     BENCHMARK_NUM_PROMPTS=256
     BENCHMARK_MAX_CONCURRENCY=256
+    BENCHMARK_NUM_WARMUPS=4096
 fi
 
 if [ "${EVAL_ONLY}" = "true" ]; then
@@ -133,6 +135,7 @@ run_benchmark_serving \
     --random-range-ratio "$RANDOM_RANGE_RATIO" \
     --num-prompts "$BENCHMARK_NUM_PROMPTS" \
     --max-concurrency "$BENCHMARK_MAX_CONCURRENCY" \
+    --num-warmups "$BENCHMARK_NUM_WARMUPS" \
     --result-filename "$RESULT_FILENAME" \
     --result-dir /workspace/ \
     --trust-remote-code \

From 3b7d8a7e3843b5ba67af4ebb5bba7d92cfb93ac2 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 26 May 2026 08:56:46 -0700
Subject: [PATCH 26/37] fix(profile): target third Flash decode step

---
 benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
index 451ece01f..efda4024d 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
@@ -48,7 +48,7 @@ PROFILE_ARGS=()
 if [[ "${PROFILE:-}" == "1" ]]; then
     PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\"}"
     if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
-        PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\",\"ignore_frontend\":true,\"delay_iterations\":1,\"max_iterations\":1,\"active_iterations\":1,\"torch_profiler_with_stack\":false}"
+        PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\",\"ignore_frontend\":true,\"delay_iterations\":3,\"max_iterations\":1,\"active_iterations\":1,\"torch_profiler_with_stack\":false}"
     fi
     PROFILE_ARGS=(
         --profiler-config
@@ -74,7 +74,7 @@ BENCHMARK_MAX_CONCURRENCY=$CONC
 BENCHMARK_NUM_WARMUPS=$((2 * BENCHMARK_MAX_CONCURRENCY))
 
 if [[ "${PROFILE:-}" == "1" && "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
-    BENCHMARK_OUTPUT_LEN=1
+    BENCHMARK_OUTPUT_LEN=3
     BENCHMARK_NUM_PROMPTS=256
     BENCHMARK_MAX_CONCURRENCY=256
     BENCHMARK_NUM_WARMUPS=4096

From 524ca637bac017889af69b448d209c2a8e3ce218 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 26 May 2026 09:54:09 -0700
Subject: [PATCH 27/37] feat(profile): add GB200 DSV4 MTP3 profile

---
 .github/configs/nvidia-master.yaml            |  31 ++++
 .github/workflows/profile.yml                 | 118 ++++++++++++-
 ...sagg-gb200-profile-16gpu-conc256-mtp3.yaml | 156 ++++++++++++++++++
 3 files changed, 296 insertions(+), 9 deletions(-)
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-profile-16gpu-conc256-mtp3.yaml

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 346c27531..f86fe9f47 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -8680,6 +8680,37 @@ dsv4-fp4-gb200-dynamo-vllm-mtp2:
           ep: 8
           dp-attn: true
 
+# Dedicated profile point for the DeepSeek-V4 guide's 16-chip / global batch
+# 256 shape: 1 prefill DEP8 + 1 decode DEP8 on GB200, MTP3, conc=256.
+dsv4-fp4-gb200-dynamo-vllm-mtp3-profile:
+  image: vllm/vllm-openai:v0.21.0-ubuntu2404
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: gb200
+  precision: fp4
+  framework: dynamo-vllm
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - conc-list: [256]
+        spec-decoding: mtp
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-profile-16gpu-conc256-mtp3.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+
 dsv4-fp4-b300-dynamo-vllm:
   image: vllm/vllm-openai:v0.20.1
   model: deepseek-ai/DeepSeek-V4-Pro
diff --git a/.github/workflows/profile.yml b/.github/workflows/profile.yml
index b99f25d1f..5e904c9a3 100644
--- a/.github/workflows/profile.yml
+++ b/.github/workflows/profile.yml
@@ -113,6 +113,17 @@ jobs:
       EP_SIZE: ${{ matrix.config.ep }}
       DP_ATTENTION: ${{ matrix.config['dp-attn'] }}
       CONC: ${{ matrix.config.conc }}
+      CONC_JSON: ${{ toJson(matrix.config.conc) }}
+      PREFILL_NUM_WORKERS: ${{ matrix.config.prefill['num-worker'] }}
+      PREFILL_TP: ${{ matrix.config.prefill.tp }}
+      PREFILL_EP: ${{ matrix.config.prefill.ep }}
+      PREFILL_DP_ATTN: ${{ matrix.config.prefill['dp-attn'] }}
+      PREFILL_ADDITIONAL_SETTINGS_JSON: ${{ toJson(matrix.config.prefill['additional-settings']) }}
+      DECODE_NUM_WORKERS: ${{ matrix.config.decode['num-worker'] }}
+      DECODE_TP: ${{ matrix.config.decode.tp }}
+      DECODE_EP: ${{ matrix.config.decode.ep }}
+      DECODE_DP_ATTN: ${{ matrix.config.decode['dp-attn'] }}
+      DECODE_ADDITIONAL_SETTINGS_JSON: ${{ toJson(matrix.config.decode['additional-settings']) }}
       SPEC_DECODING: ${{ matrix.config.spec-decoding }}
       DISAGG: ${{ matrix.config.disagg }}
       MOE_DEBUG: '0'
@@ -148,7 +159,7 @@ jobs:
           ref: ${{ inputs.ref || github.sha }}
           clean: false
 
-      - name: Launch + Profile (single-node sglang/vllm)
+      - name: Launch + Profile
         id: run
         env:
           RUNNER_NAME: ${{ runner.name }}
@@ -159,19 +170,108 @@ jobs:
         shell: bash
         run: |
           set -euo pipefail
-          ep_val="${EP_SIZE:-1}"
-          res_name="${EXP_NAME}_${PRECISION}_${FRAMEWORK}_tp${TP}_ep${ep_val}_dpa_${DP_ATTENTION}_conc${CONC}_${RUNNER_NAME}"
+
+          export_additional_settings() {
+            local settings_json="$1"
+            python3 - "$settings_json" <<'PY'
+          import json
+          import sys
+
+          raw = sys.argv[1]
+          if not raw or raw == "null":
+              raise SystemExit(0)
+          for item in json.loads(raw) or []:
+              print(item)
+          PY
+          }
+
+          normalize_conc() {
+            python3 - <<'PY'
+          import json
+          import os
+
+          raw = os.environ.get("CONC_JSON") or os.environ.get("CONC") or "[]"
+          try:
+              value = json.loads(raw)
+          except json.JSONDecodeError:
+              value = raw
+          if isinstance(value, list):
+              print("x".join(str(v) for v in value))
+          else:
+              print(str(value))
+          PY
+          }
+
+          if [ -n "${PREFILL_NUM_WORKERS:-}" ] && [ -n "${DECODE_NUM_WORKERS:-}" ]; then
+            conc_val="$(normalize_conc)"
+            res_name="${EXP_NAME}_${PRECISION}_${FRAMEWORK}_prefill-tp${PREFILL_TP}-ep${PREFILL_EP}-dp${PREFILL_DP_ATTN}-nw${PREFILL_NUM_WORKERS}_decode-tp${DECODE_TP}-ep${DECODE_EP}-dp${DECODE_DP_ATTN}-nw${DECODE_NUM_WORKERS}_disagg-${DISAGG}_spec-${SPEC_DECODING}_conc${conc_val}_${RUNNER_NAME}"
+
+            echo "IS_MULTINODE=true" >> "$GITHUB_ENV"
+            echo "PREFILL_GPUS=$((PREFILL_NUM_WORKERS * PREFILL_TP))" >> "$GITHUB_ENV"
+            echo "DECODE_GPUS=$((DECODE_NUM_WORKERS * DECODE_TP))" >> "$GITHUB_ENV"
+
+            while IFS= read -r setting; do
+              if [ -n "$setting" ]; then
+                export "$setting"
+              fi
+            done < <(export_additional_settings "${PREFILL_ADDITIONAL_SETTINGS_JSON:-null}")
+            while IFS= read -r setting; do
+              if [ -n "$setting" ]; then
+                export "$setting"
+              fi
+            done < <(export_additional_settings "${DECODE_ADDITIONAL_SETTINGS_JSON:-null}")
+          else
+            ep_val="${EP_SIZE:-1}"
+            res_name="${EXP_NAME}_${PRECISION}_${FRAMEWORK}_tp${TP}_ep${ep_val}_dpa_${DP_ATTENTION}_conc${CONC}_${RUNNER_NAME}"
+          fi
+
           export RESULT_FILENAME="${res_name}"
           echo "RESULT_FILENAME=${res_name}" >> "$GITHUB_ENV"
 
           bash ./runners/launch_${RUNNER_NAME%%_*}.sh
 
           if [ ! -f "${res_name}.json" ]; then
-            echo "Run failed: Benchmark result ${res_name}.json not found." >&2
-            exit 1
+            result_candidate="$(find . -maxdepth 1 -type f -name "${res_name}_*.json" | sort | head -n1 || true)"
+            if [ -n "$result_candidate" ] && [ -f "$result_candidate" ]; then
+              cp "$result_candidate" "${res_name}.json"
+            else
+              echo "Run failed: Benchmark result ${res_name}.json not found." >&2
+              exit 1
+            fi
           fi
 
           trace_path="profile_${res_name}.trace.json.gz"
+          if [ ! -f "$trace_path" ] && [ -d LOGS/profiles ]; then
+            trace_candidate="$(python3 - <<'PY'
+          from pathlib import Path
+
+          root = Path("LOGS/profiles")
+          candidates = [
+              p for p in root.rglob("*")
+              if p.is_file() and (
+                  p.name.endswith(".trace.json")
+                  or p.name.endswith(".trace.json.gz")
+                  or p.name.endswith(".pt.trace.json")
+                  or p.name.endswith(".json")
+              )
+          ]
+          candidates = [
+              p for p in candidates
+              if not p.name.startswith("results_") and "profile_export" not in p.name
+          ]
+          if candidates:
+              print(max(candidates, key=lambda p: p.stat().st_size))
+          PY
+          )"
+            if [ -n "$trace_candidate" ] && [ -f "$trace_candidate" ]; then
+              if [[ "$trace_candidate" == *.gz ]]; then
+                cp "$trace_candidate" "$trace_path"
+              else
+                gzip -c "$trace_candidate" > "$trace_path"
+              fi
+            fi
+          fi
+
           if [ -f "$trace_path" ]; then
             echo "trace=$trace_path" >> "$GITHUB_OUTPUT"
             if [ "${FRAMEWORK}" = "sglang" ]; then
@@ -252,7 +352,7 @@ jobs:
         run: |
           set -euo pipefail
 
-          dest_dir="storage/profiles/${GITHUB_SHA}/${{ matrix.config.runner }}/${{ matrix.config.framework }}/${{ matrix.config['exp-name'] }}_${{ matrix.config.precision }}_tp${{ matrix.config.tp }}_ep${{ matrix.config.ep || 1 }}_conc${{ matrix.config.conc }}"
+          dest_dir="storage/profiles/${GITHUB_SHA}/${{ matrix.config.runner }}/${{ matrix.config.framework }}/${RESULT_FILENAME}"
           mkdir -p "$dest_dir"
           cp "$TRACE_LOCAL" "$dest_dir/trace.json.gz"
 
@@ -260,13 +360,13 @@ jobs:
           git config user.name "github-actions"
           git config user.email "github-actions@github.com"
           git add -A
-          git commit -m "Add profile: ${GITHUB_SHA} ${{ matrix.config['exp-name'] }} tp${{ matrix.config.tp }} ep${{ matrix.config.ep || 1 }} conc${{ matrix.config.conc }}" || echo "Nothing to commit"
+          git commit -m "Add profile: ${GITHUB_SHA} ${RESULT_FILENAME}" || echo "Nothing to commit"
           git push
           STORAGE_SHA="$(git rev-parse HEAD)"
           popd >/dev/null
 
-          export RAW_URL="https://raw.githubusercontent.com/SemiAnalysisAI/InferenceX-trace-storage/${STORAGE_SHA}/profiles/${GITHUB_SHA}/${{ matrix.config.runner }}/${{ matrix.config.framework }}/${{ matrix.config['exp-name'] }}_${{ matrix.config.precision }}_tp${{ matrix.config.tp }}_ep${{ matrix.config.ep || 1 }}_conc${{ matrix.config.conc }}/trace.json.gz"
-          export TITLE="${{ matrix.config['exp-name'] }}_${{ matrix.config.precision }}_tp${{ matrix.config.tp }}_ep${{ matrix.config.ep || 1 }}_conc${{ matrix.config.conc }}"
+          export RAW_URL="https://raw.githubusercontent.com/SemiAnalysisAI/InferenceX-trace-storage/${STORAGE_SHA}/profiles/${GITHUB_SHA}/${{ matrix.config.runner }}/${{ matrix.config.framework }}/${RESULT_FILENAME}/trace.json.gz"
+          export TITLE="${RESULT_FILENAME}"
 
           enc_src="$(python3 -c 'import os,urllib.parse; print(urllib.parse.quote(os.environ["RAW_URL"], safe=""))')"
           enc_title="$(python3 -c 'import os,urllib.parse; print(urllib.parse.quote(os.environ["TITLE"], safe=""))')"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-profile-16gpu-conc256-mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-profile-16gpu-conc256-mtp3.yaml
new file mode 100644
index 000000000..1842ebf30
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-profile-16gpu-conc256-mtp3.yaml
@@ -0,0 +1,156 @@
+name: "svf-vllm-disagg-gb200-profile-16gpu-conc256-mtp3"
+
+model:
+  path: "deepseek-v4-pro"
+  container: "vllm/vllm-openai:v0.21.0-ubuntu2404"
+  precision: "fp4"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260426"
+
+setup_script: vllm-container-deps.sh
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 2
+  decode_nodes: 2
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 8
+
+infra:
+  etcd_nats_dedicated_node: true
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    TORCH_SYMMMEM: "NVSHMEM"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    VLLM_SERVER_DEV_MODE: "1"
+    VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024"
+    VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_P2P_LEVEL: NVL
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    TORCH_SYMMMEM: "NVSHMEM"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    VLLM_SERVER_DEV_MODE: "1"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_P2P_LEVEL: NVL
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-hybrid-lb: true
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      enable-ep-weight-filter: true
+      moe-backend: deep_gemm_mega_moe
+      enforce-eager: true
+      speculative-config: '{"method":"mtp","num_speculative_tokens":3}'
+      attention-config: '{"use_fp4_indexer_cache":true}'
+      max-model-len: 9472
+      max-num-seqs: 8
+      max-num-batched-tokens: 16384
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-flashinfer-autotune: true
+      no-async-scheduling: true
+      block-size: 256
+      gpu-memory-utilization: 0.9
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+      numa-bind: true
+      tokenizer-mode: deepseek_v4
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-hybrid-lb: true
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      enable-ep-weight-filter: true
+      moe-backend: deep_gemm_mega_moe
+      speculative-config: '{"method":"mtp","num_speculative_tokens":3}'
+      attention-config: '{"use_fp4_indexer_cache":true}'
+      max-model-len: 9472
+      max-num-seqs: 256
+      max-cudagraph-capture-size: 256
+      max-num-batched-tokens: 256
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-flashinfer-autotune: true
+      block-size: 256
+      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+      tokenizer-mode: deepseek_v4
+
+profiling:
+  type: "torch"
+  prefill:
+    start_step: 100000
+    stop_step: 100001
+  decode:
+    start_step: 3
+    stop_step: 4
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "256"
+  req_rate: "inf"
+  num_prompts_mult: 1
+  num_warmup_mult: 1
+  use_chat_template: true
+  custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer"
+
+identity:
+  model:
+    repo: "deepseek-ai/DeepSeek-V4-Pro"
+    revision: "0366e4e064385807ea86b088a5c6c878ff23343b"
+  container:
+    image: "vllm/vllm-openai:v0.21.0-ubuntu2404"
+  frameworks:
+    dynamo: "1.2.0.dev20260426"

From 58d423ed201ef4cd6fe8d5cfd1efcb088937ed7c Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 26 May 2026 09:55:45 -0700
Subject: [PATCH 28/37] fix(profile): stringify multinode concurrency env

---
 .github/workflows/profile.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/profile.yml b/.github/workflows/profile.yml
index 5e904c9a3..2bc40984d 100644
--- a/.github/workflows/profile.yml
+++ b/.github/workflows/profile.yml
@@ -112,7 +112,7 @@ jobs:
       TP: ${{ matrix.config.tp }}
       EP_SIZE: ${{ matrix.config.ep }}
       DP_ATTENTION: ${{ matrix.config['dp-attn'] }}
-      CONC: ${{ matrix.config.conc }}
+      CONC: ${{ toJson(matrix.config.conc) }}
       CONC_JSON: ${{ toJson(matrix.config.conc) }}
       PREFILL_NUM_WORKERS: ${{ matrix.config.prefill['num-worker'] }}
       PREFILL_TP: ${{ matrix.config.prefill.tp }}

From 2f300a36f254bafe5c409e1207d0e3421ffcf3c1 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 26 May 2026 10:01:42 -0700
Subject: [PATCH 29/37] fix(profile): use aggregate GB200 DSV4 profile

---
 .github/configs/nvidia-master.yaml            | 18 ++---
 ...agg-gb200-profile-16gpu-conc256-mtp3.yaml} | 71 +++----------------
 2 files changed, 19 insertions(+), 70 deletions(-)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/{disagg-gb200-profile-16gpu-conc256-mtp3.yaml => agg-gb200-profile-16gpu-conc256-mtp3.yaml} (56%)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index f86fe9f47..8b555d5f0 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -8681,7 +8681,7 @@ dsv4-fp4-gb200-dynamo-vllm-mtp2:
           dp-attn: true
 
 # Dedicated profile point for the DeepSeek-V4 guide's 16-chip / global batch
-# 256 shape: 1 prefill DEP8 + 1 decode DEP8 on GB200, MTP3, conc=256.
+# 256 shape: aggregated DEP16 on GB200, MTP3, conc=256.
 dsv4-fp4-gb200-dynamo-vllm-mtp3-profile:
   image: vllm/vllm-openai:v0.21.0-ubuntu2404
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -8690,7 +8690,7 @@ dsv4-fp4-gb200-dynamo-vllm-mtp3-profile:
   precision: fp4
   framework: dynamo-vllm
   multinode: true
-  disagg: true
+  disagg: false
   scenarios:
     fixed-seq-len:
     - isl: 8192
@@ -8700,16 +8700,16 @@ dsv4-fp4-gb200-dynamo-vllm-mtp3-profile:
         spec-decoding: mtp
         prefill:
           num-worker: 1
-          tp: 8
-          ep: 8
+          tp: 16
+          ep: 16
           dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-profile-16gpu-conc256-mtp3.yaml"
+          - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml"
         decode:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
+          num-worker: 0
+          tp: 16
+          ep: 1
+          dp-attn: false
 
 dsv4-fp4-b300-dynamo-vllm:
   image: vllm/vllm-openai:v0.20.1
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-profile-16gpu-conc256-mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml
similarity index 56%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-profile-16gpu-conc256-mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml
index 1842ebf30..f2f0c6ece 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-profile-16gpu-conc256-mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml
@@ -1,4 +1,4 @@
-name: "svf-vllm-disagg-gb200-profile-16gpu-conc256-mtp3"
+name: "svf-vllm-agg-gb200-profile-16gpu-conc256-mtp3"
 
 model:
   path: "deepseek-v4-pro"
@@ -21,15 +21,9 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 2
-  decode_nodes: 2
-  prefill_workers: 1
-  decode_workers: 1
-  gpus_per_prefill: 8
-  gpus_per_decode: 8
-
-infra:
-  etcd_nats_dedicated_node: true
+  agg_nodes: 4
+  agg_workers: 1
+  gpus_per_agg: 16
 
 frontend:
   type: dynamo
@@ -38,7 +32,7 @@ frontend:
 backend:
   type: vllm
   connector: null
-  prefill_environment:
+  aggregated_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     TILELANG_CLEANUP_TEMP_FILES: "1"
     VLLM_USE_NCCL_SYMM_MEM: "1"
@@ -54,67 +48,25 @@ backend:
     UCX_TLS: "cuda_copy,cuda_ipc,tcp"
     UCX_CUDA_IPC_ENABLE_MNNVL: "y"
     NCCL_P2P_LEVEL: NVL
-  decode_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    TILELANG_CLEANUP_TEMP_FILES: "1"
-    VLLM_USE_NCCL_SYMM_MEM: "1"
-    TORCH_SYMMMEM: "NVSHMEM"
-    NCCL_CUMEM_ENABLE: "1"
-    NCCL_MNNVL_ENABLE: "1"
-    NCCL_NVLS_ENABLE: "1"
-    VLLM_SERVER_DEV_MODE: "1"
-    UCX_MEMTYPE_CACHE: "n"
-    UCX_MEMTYPE_REG_WHOLE: "n"
-    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-    NCCL_P2P_LEVEL: NVL
   vllm_config:
-    prefill:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+    aggregated:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       kv-cache-dtype: "fp8"
       tensor-parallel-size: 1
       pipeline-parallel-size: 1
       data-parallel-hybrid-lb: true
-      data-parallel-size: 8
+      data-parallel-size: 16
       data-parallel-rpc-port: 13345
       enable-expert-parallel: true
       enable-ep-weight-filter: true
       moe-backend: deep_gemm_mega_moe
-      enforce-eager: true
       speculative-config: '{"method":"mtp","num_speculative_tokens":3}'
       attention-config: '{"use_fp4_indexer_cache":true}'
-      max-model-len: 9472
-      max-num-seqs: 8
-      max-num-batched-tokens: 16384
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      no-enable-flashinfer-autotune: true
-      no-async-scheduling: true
-      block-size: 256
-      gpu-memory-utilization: 0.9
-      no-disable-hybrid-kv-cache-manager: true
-      enable-sleep-mode: true
-      numa-bind: true
       tokenizer-mode: deepseek_v4
-    decode:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
-      kv-cache-dtype: "fp8"
-      tensor-parallel-size: 1
-      pipeline-parallel-size: 1
-      data-parallel-hybrid-lb: true
-      data-parallel-size: 8
-      data-parallel-rpc-port: 13345
-      enable-expert-parallel: true
-      enable-ep-weight-filter: true
-      moe-backend: deep_gemm_mega_moe
-      speculative-config: '{"method":"mtp","num_speculative_tokens":3}'
-      attention-config: '{"use_fp4_indexer_cache":true}'
       max-model-len: 9472
       max-num-seqs: 256
-      max-cudagraph-capture-size: 256
       max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 256
       trust-remote-code: true
       no-enable-prefix-caching: true
       no-enable-flashinfer-autotune: true
@@ -124,14 +76,11 @@ backend:
       stream-interval: 50
       no-disable-hybrid-kv-cache-manager: true
       enable-sleep-mode: true
-      tokenizer-mode: deepseek_v4
+      all2all-backend: "flashinfer_nvlink_one_sided"
 
 profiling:
   type: "torch"
-  prefill:
-    start_step: 100000
-    stop_step: 100001
-  decode:
+  aggregated:
     start_step: 3
     stop_step: 4
 

From eb885ff46107f516085c20991e8659b439a5a55e Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 26 May 2026 10:27:45 -0700
Subject: [PATCH 30/37] fix(profile): enable vllm trace output for GB200

---
 .github/workflows/profile.yml                         | 11 +++++++++--
 .../8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml    |  3 +++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/profile.yml b/.github/workflows/profile.yml
index 2bc40984d..1ee135560 100644
--- a/.github/workflows/profile.yml
+++ b/.github/workflows/profile.yml
@@ -241,17 +241,19 @@ jobs:
           fi
 
           trace_path="profile_${res_name}.trace.json.gz"
-          if [ ! -f "$trace_path" ] && [ -d LOGS/profiles ]; then
+          if [ ! -f "$trace_path" ] && [ -d LOGS ]; then
             trace_candidate="$(python3 - <<'PY'
           from pathlib import Path
 
-          root = Path("LOGS/profiles")
+          root = Path("LOGS")
           candidates = [
               p for p in root.rglob("*")
               if p.is_file() and (
                   p.name.endswith(".trace.json")
                   or p.name.endswith(".trace.json.gz")
                   or p.name.endswith(".pt.trace.json")
+                  or p.name.endswith(".pt.trace.json.gz")
+                  or p.name.endswith(".json.gz")
                   or p.name.endswith(".json")
               )
           ]
@@ -293,6 +295,11 @@ jobs:
             fi
           else
             echo "Profile trace not found: $trace_path" >&2
+            if [ -d LOGS ]; then
+              echo "LOGS profile candidates:" >&2
+              find LOGS -maxdepth 8 -type f \( -path "*/profiles/*" -o -name "*trace*" -o -name "*profile*" \) -printf "%p %s bytes\n" 2>/dev/null | sort >&2 || true
+            fi
+            exit 1
           fi
 
       - name: Process result (json -> agg)
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml
index f2f0c6ece..51c8774d3 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml
@@ -34,6 +34,8 @@ backend:
   connector: null
   aggregated_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_RPC_TIMEOUT: "1800000"
+    VLLM_TORCH_PROFILER_DIR: "/logs/profiles/agg"
     TILELANG_CLEANUP_TEMP_FILES: "1"
     VLLM_USE_NCCL_SYMM_MEM: "1"
     TORCH_SYMMMEM: "NVSHMEM"
@@ -67,6 +69,7 @@ backend:
       max-num-seqs: 256
       max-num-batched-tokens: 256
       max-cudagraph-capture-size: 256
+      profiler-config: '{"profiler":"torch","torch_profiler_dir":"/logs/profiles/agg","ignore_frontend":true,"delay_iterations":3,"max_iterations":1,"active_iterations":1,"torch_profiler_with_stack":false}'
       trust-remote-code: true
       no-enable-prefix-caching: true
       no-enable-flashinfer-autotune: true

From 27ddec5ba9d29052e5dd685a4d476c9b9c303027 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 26 May 2026 11:12:02 -0700
Subject: [PATCH 31/37] fix(profile): capture later GB200 decode step

---
 .github/configs/nvidia-master.yaml                   |  8 ++++----
 .../8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml   | 12 ++++++------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 8b555d5f0..e69ee9c47 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -8694,20 +8694,20 @@ dsv4-fp4-gb200-dynamo-vllm-mtp3-profile:
   scenarios:
     fixed-seq-len:
     - isl: 8192
-      osl: 1024
+      osl: 2048
       search-space:
       - conc-list: [256]
         spec-decoding: mtp
         prefill:
-          num-worker: 1
-          tp: 16
+          num-worker: 16
+          tp: 1
           ep: 16
           dp-attn: true
           additional-settings:
           - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml"
         decode:
           num-worker: 0
-          tp: 16
+          tp: 1
           ep: 1
           dp-attn: false
 
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml
index 51c8774d3..e75634c9b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml
@@ -65,11 +65,11 @@ backend:
       speculative-config: '{"method":"mtp","num_speculative_tokens":3}'
       attention-config: '{"use_fp4_indexer_cache":true}'
       tokenizer-mode: deepseek_v4
-      max-model-len: 9472
+      max-model-len: 10496
       max-num-seqs: 256
       max-num-batched-tokens: 256
       max-cudagraph-capture-size: 256
-      profiler-config: '{"profiler":"torch","torch_profiler_dir":"/logs/profiles/agg","ignore_frontend":true,"delay_iterations":3,"max_iterations":1,"active_iterations":1,"torch_profiler_with_stack":false}'
+      profiler-config: '{"profiler":"torch","torch_profiler_dir":"/logs/profiles/agg","ignore_frontend":true,"delay_iterations":2304,"max_iterations":1,"active_iterations":1,"torch_profiler_with_stack":false}'
       trust-remote-code: true
       no-enable-prefix-caching: true
       no-enable-flashinfer-autotune: true
@@ -84,17 +84,17 @@ backend:
 profiling:
   type: "torch"
   aggregated:
-    start_step: 3
-    stop_step: 4
+    start_step: 2304
+    stop_step: 2305
 
 benchmark:
   type: "sa-bench"
   isl: 8192
-  osl: 1024
+  osl: 2048
   concurrencies: "256"
   req_rate: "inf"
   num_prompts_mult: 1
-  num_warmup_mult: 1
+  num_warmup_mult: 4
   use_chat_template: true
   custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer"
 

From 0dc030042ffaef4524208225cdfe6e481811d59e Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 26 May 2026 12:22:40 -0700
Subject: [PATCH 32/37] fix(profile): switch GB200 profile to DSV4 Flash

---
 .github/configs/nvidia-master.yaml             |  6 +++---
 .github/workflows/profile.yml                  |  4 +++-
 ...b200-flash-profile-16gpu-conc256-mtp3.yaml} |  9 ++++-----
 runners/launch_gb200-nv.sh                     | 18 ++++++++++++++----
 4 files changed, 24 insertions(+), 13 deletions(-)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/{agg-gb200-profile-16gpu-conc256-mtp3.yaml => agg-gb200-flash-profile-16gpu-conc256-mtp3.yaml} (92%)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index e69ee9c47..c74fb1ff9 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -8682,9 +8682,9 @@ dsv4-fp4-gb200-dynamo-vllm-mtp2:
 
 # Dedicated profile point for the DeepSeek-V4 guide's 16-chip / global batch
 # 256 shape: aggregated DEP16 on GB200, MTP3, conc=256.
-dsv4-fp4-gb200-dynamo-vllm-mtp3-profile:
+dsv4-flash-fp4-gb200-dynamo-vllm-mtp3-profile:
   image: vllm/vllm-openai:v0.21.0-ubuntu2404
-  model: deepseek-ai/DeepSeek-V4-Pro
+  model: deepseek-ai/DeepSeek-V4-Flash
   model-prefix: dsv4
   runner: gb200
   precision: fp4
@@ -8704,7 +8704,7 @@ dsv4-fp4-gb200-dynamo-vllm-mtp3-profile:
           ep: 16
           dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml"
+          - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-16gpu-conc256-mtp3.yaml"
         decode:
           num-worker: 0
           tp: 1
diff --git a/.github/workflows/profile.yml b/.github/workflows/profile.yml
index 1ee135560..b545be9f1 100644
--- a/.github/workflows/profile.yml
+++ b/.github/workflows/profile.yml
@@ -355,6 +355,7 @@ jobs:
         id: push
         env:
           TRACE_LOCAL: ${{ steps.run.outputs.trace }}
+          REPO_PAT: ${{ secrets.REPO_PAT }}
         shell: bash
         run: |
           set -euo pipefail
@@ -366,9 +367,10 @@ jobs:
           pushd storage >/dev/null
           git config user.name "github-actions"
           git config user.email "github-actions@github.com"
+          git remote set-url origin "https://x-access-token:${REPO_PAT}@github.com/SemiAnalysisAI/InferenceX-trace-storage.git"
           git add -A
           git commit -m "Add profile: ${GITHUB_SHA} ${RESULT_FILENAME}" || echo "Nothing to commit"
-          git push
+          git push origin HEAD:master
           STORAGE_SHA="$(git rev-parse HEAD)"
           popd >/dev/null
 
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-16gpu-conc256-mtp3.yaml
similarity index 92%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-16gpu-conc256-mtp3.yaml
index e75634c9b..afbeaa659 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-16gpu-conc256-mtp3.yaml
@@ -1,7 +1,7 @@
-name: "svf-vllm-agg-gb200-profile-16gpu-conc256-mtp3"
+name: "svf-vllm-agg-gb200-flash-profile-16gpu-conc256-mtp3"
 
 model:
-  path: "deepseek-v4-pro"
+  path: "deepseek-v4-flash"
   container: "vllm/vllm-openai:v0.21.0-ubuntu2404"
   precision: "fp4"
 
@@ -52,7 +52,7 @@ backend:
     NCCL_P2P_LEVEL: NVL
   vllm_config:
     aggregated:
-      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      served-model-name: "deepseek-ai/DeepSeek-V4-Flash"
       kv-cache-dtype: "fp8"
       tensor-parallel-size: 1
       pipeline-parallel-size: 1
@@ -100,8 +100,7 @@ benchmark:
 
 identity:
   model:
-    repo: "deepseek-ai/DeepSeek-V4-Pro"
-    revision: "0366e4e064385807ea86b088a5c6c878ff23343b"
+    repo: "deepseek-ai/DeepSeek-V4-Flash"
   container:
     image: "vllm/vllm-openai:v0.21.0-ubuntu2404"
   frameworks:
diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh
index dada98bd6..3ccb772f0 100755
--- a/runners/launch_gb200-nv.sh
+++ b/runners/launch_gb200-nv.sh
@@ -19,8 +19,13 @@ if [[ $FRAMEWORK == "dynamo-sglang" ]]; then
         # Same compute-node-local NVMe path as the dynamo-vllm dsv4
         # branch — see that branch for rationale. SRT_SLURM_MODEL_PREFIX
         # matches the model.path alias in our DSV4 sglang recipes.
-        export MODEL_PATH="/mnt/numa1/models/deepseek-v4-pro/"
-        export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro"
+        if [[ $MODEL == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
+            export MODEL_PATH="/mnt/numa1/models/deepseek-v4-flash/"
+            export SRT_SLURM_MODEL_PREFIX="deepseek-v4-flash"
+        else
+            export MODEL_PATH="/mnt/numa1/models/deepseek-v4-pro/"
+            export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro"
+        fi
     else
         export MODEL_PATH=$MODEL
     fi
@@ -52,8 +57,13 @@ elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then
         # Weights live on compute-node local NVMe (/mnt/numa1) — no Lustre
         # contention, fast startup. SRT_SLURM_MODEL_PREFIX matches the
         # model.path alias in our DSV4 recipes.
-        export MODEL_PATH="/mnt/numa1/models/deepseek-v4-pro/"
-        export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro"
+        if [[ $MODEL == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
+            export MODEL_PATH="/mnt/numa1/models/deepseek-v4-flash/"
+            export SRT_SLURM_MODEL_PREFIX="deepseek-v4-flash"
+        else
+            export MODEL_PATH="/mnt/numa1/models/deepseek-v4-pro/"
+            export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro"
+        fi
     else
         echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4"
         exit 1

From 16d77dc0f3d4321f75a2a902cf1e9af778419b30 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 26 May 2026 15:15:10 -0700
Subject: [PATCH 33/37] fix(profile): point GB200 Flash to Lustre weights

---
 runners/launch_gb200-nv.sh | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh
index 3ccb772f0..e12767899 100755
--- a/runners/launch_gb200-nv.sh
+++ b/runners/launch_gb200-nv.sh
@@ -16,11 +16,10 @@ if [[ $FRAMEWORK == "dynamo-sglang" ]]; then
         export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528-fp4-v2/"
         export SRT_SLURM_MODEL_PREFIX="dsr1-fp4"
     elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then
-        # Same compute-node-local NVMe path as the dynamo-vllm dsv4
-        # branch — see that branch for rationale. SRT_SLURM_MODEL_PREFIX
-        # matches the model.path alias in our DSV4 sglang recipes.
+        # SRT_SLURM_MODEL_PREFIX matches the model.path alias in our DSV4
+        # sglang recipes. Flash weights are on Lustre; Pro on /mnt/numa1.
         if [[ $MODEL == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
-            export MODEL_PATH="/mnt/numa1/models/deepseek-v4-flash/"
+            export MODEL_PATH="/mnt/lustre01/users/sa-shared/DeepSeek-V4-Flash/"
             export SRT_SLURM_MODEL_PREFIX="deepseek-v4-flash"
         else
             export MODEL_PATH="/mnt/numa1/models/deepseek-v4-pro/"
@@ -54,11 +53,10 @@ elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then
         export MODEL_PATH="/mnt/lustre01/models/kimi-k2.5-nvfp4"
         export SRT_SLURM_MODEL_PREFIX="kimi-k2.5-nvfp4"
     elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then
-        # Weights live on compute-node local NVMe (/mnt/numa1) — no Lustre
-        # contention, fast startup. SRT_SLURM_MODEL_PREFIX matches the
-        # model.path alias in our DSV4 recipes.
+        # SRT_SLURM_MODEL_PREFIX matches the model.path alias in our DSV4
+        # recipes. Flash weights are read from Lustre; Pro from /mnt/numa1.
         if [[ $MODEL == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
-            export MODEL_PATH="/mnt/numa1/models/deepseek-v4-flash/"
+            export MODEL_PATH="/mnt/lustre01/users/sa-shared/DeepSeek-V4-Flash/"
             export SRT_SLURM_MODEL_PREFIX="deepseek-v4-flash"
         else
             export MODEL_PATH="/mnt/numa1/models/deepseek-v4-pro/"

From 980316a71c2d24291c91462302490a329ac220d3 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 26 May 2026 18:19:16 -0700
Subject: [PATCH 34/37] fix(profile): use 4-GPU GB200 Flash profile

---
 .github/configs/nvidia-master.yaml                     | 10 +++++-----
 ... => agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml} |  8 ++++----
 2 files changed, 9 insertions(+), 9 deletions(-)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/{agg-gb200-flash-profile-16gpu-conc256-mtp3.yaml => agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml} (95%)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index c74fb1ff9..cb56f9a07 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -8680,8 +8680,8 @@ dsv4-fp4-gb200-dynamo-vllm-mtp2:
           ep: 8
           dp-attn: true
 
-# Dedicated profile point for the DeepSeek-V4 guide's 16-chip / global batch
-# 256 shape: aggregated DEP16 on GB200, MTP3, conc=256.
+# Dedicated profile point for a single GB200 node / global batch 256 shape:
+# aggregated DEP4 on GB200, MTP3, conc=256.
 dsv4-flash-fp4-gb200-dynamo-vllm-mtp3-profile:
   image: vllm/vllm-openai:v0.21.0-ubuntu2404
   model: deepseek-ai/DeepSeek-V4-Flash
@@ -8699,12 +8699,12 @@ dsv4-flash-fp4-gb200-dynamo-vllm-mtp3-profile:
       - conc-list: [256]
         spec-decoding: mtp
         prefill:
-          num-worker: 16
+          num-worker: 4
           tp: 1
-          ep: 16
+          ep: 4
           dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-16gpu-conc256-mtp3.yaml"
+          - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml"
         decode:
           num-worker: 0
           tp: 1
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-16gpu-conc256-mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml
similarity index 95%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-16gpu-conc256-mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml
index afbeaa659..e5c39f8af 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-16gpu-conc256-mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml
@@ -1,4 +1,4 @@
-name: "svf-vllm-agg-gb200-flash-profile-16gpu-conc256-mtp3"
+name: "svf-vllm-agg-gb200-flash-profile-4gpu-conc256-mtp3"
 
 model:
   path: "deepseek-v4-flash"
@@ -21,9 +21,9 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  agg_nodes: 4
+  agg_nodes: 1
   agg_workers: 1
-  gpus_per_agg: 16
+  gpus_per_agg: 4
 
 frontend:
   type: dynamo
@@ -57,7 +57,7 @@ backend:
       tensor-parallel-size: 1
       pipeline-parallel-size: 1
       data-parallel-hybrid-lb: true
-      data-parallel-size: 16
+      data-parallel-size: 4
       data-parallel-rpc-port: 13345
       enable-expert-parallel: true
       enable-ep-weight-filter: true

From 47d860d427dc918bc3a970b179c3cfe6dc048337 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 26 May 2026 20:17:58 -0700
Subject: [PATCH 35/37] fix: avoid stale profile trace reuse

---
 .github/workflows/profile.yml | 74 +++++++++++++++++++++++++++--------
 runners/launch_gb200-nv.sh    |  1 +
 2 files changed, 59 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/profile.yml b/.github/workflows/profile.yml
index b545be9f1..e2d08430f 100644
--- a/.github/workflows/profile.yml
+++ b/.github/workflows/profile.yml
@@ -228,6 +228,10 @@ jobs:
           export RESULT_FILENAME="${res_name}"
           echo "RESULT_FILENAME=${res_name}" >> "$GITHUB_ENV"
 
+          echo "Removing stale profile artifacts from previous runs"
+          rm -rf LOGS
+          rm -f profile_*.trace.json.gz multinode_server_logs.tar.gz
+
           bash ./runners/launch_${RUNNER_NAME%%_*}.sh
 
           if [ ! -f "${res_name}.json" ]; then
@@ -246,26 +250,22 @@ jobs:
           from pathlib import Path
 
           root = Path("LOGS")
-          candidates = [
-              p for p in root.rglob("*")
-              if p.is_file() and (
-                  p.name.endswith(".trace.json")
-                  or p.name.endswith(".trace.json.gz")
-                  or p.name.endswith(".pt.trace.json")
-                  or p.name.endswith(".pt.trace.json.gz")
-                  or p.name.endswith(".json.gz")
-                  or p.name.endswith(".json")
-              )
-          ]
-          candidates = [
-              p for p in candidates
-              if not p.name.startswith("results_") and "profile_export" not in p.name
-          ]
+
+          def is_trace_candidate(path: Path) -> bool:
+              name = path.name
+              if name.startswith("results_") or "profile_export" in name:
+                  return False
+              if name.endswith((".trace.json", ".trace.json.gz", ".pt.trace.json", ".pt.trace.json.gz")):
+                  return True
+              return "trace" in name and name.endswith((".json", ".json.gz"))
+
+          candidates = [p for p in root.rglob("*") if p.is_file() and is_trace_candidate(p)]
           if candidates:
-              print(max(candidates, key=lambda p: p.stat().st_size))
+              print(max(candidates, key=lambda p: (p.stat().st_mtime_ns, p.stat().st_size)))
           PY
           )"
             if [ -n "$trace_candidate" ] && [ -f "$trace_candidate" ]; then
+              echo "Selected profile trace candidate: $trace_candidate"
               if [[ "$trace_candidate" == *.gz ]]; then
                 cp "$trace_candidate" "$trace_path"
               else
@@ -275,6 +275,48 @@ jobs:
           fi
 
           if [ -f "$trace_path" ]; then
+            echo "Profile trace prepared: $trace_path"
+            ls -lh "$trace_path"
+            sha256sum "$trace_path"
+            python3 - "$trace_path" <<'PY'
+          import gzip
+          import os
+          import re
+          import sys
+
+          trace_path = sys.argv[1]
+          expected = set()
+          worker_gpus = []
+          for workers_key, tp_key in (
+              ("PREFILL_NUM_WORKERS", "PREFILL_TP"),
+              ("DECODE_NUM_WORKERS", "DECODE_TP"),
+          ):
+              workers = os.environ.get(workers_key)
+              tp = os.environ.get(tp_key)
+              if workers and workers.isdigit() and tp and tp.isdigit():
+                  gpus = int(workers) * int(tp)
+                  if gpus:
+                      expected.add(gpus)
+                      worker_gpus.append(gpus)
+          if len(worker_gpus) > 1:
+              expected.add(sum(worker_gpus))
+
+          opener = gzip.open if trace_path.endswith(".gz") else open
+          with opener(trace_path, "rt", errors="replace") as f:
+              prefix = f.read(1024 * 1024)
+
+          if '"traceEvents"' not in prefix:
+              raise SystemExit(f"{trace_path} does not look like a Perfetto trace: traceEvents key not found near start")
+
+          match = re.search(r'"world_size"\s*:\s*(\d+)', prefix)
+          if expected and match:
+              world_size = int(match.group(1))
+              if world_size not in expected:
+                  allowed = ", ".join(str(v) for v in sorted(expected))
+                  raise SystemExit(
+                      f"{trace_path} has distributed world_size={world_size}, expected one of: {allowed}"
+                  )
+          PY
             echo "trace=$trace_path" >> "$GITHUB_OUTPUT"
             if [ "${FRAMEWORK}" = "sglang" ]; then
               # Try to locate corresponding TP-0 traces produced by SGLang profiler
diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh
index e12767899..ed4824ef5 100755
--- a/runners/launch_gb200-nv.sh
+++ b/runners/launch_gb200-nv.sh
@@ -304,6 +304,7 @@ echo "Collecting results..."
 
 if [ -d "$LOGS_DIR" ]; then
     echo "Found logs directory: $LOGS_DIR"
+    rm -rf "$GITHUB_WORKSPACE/LOGS"
     cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS"
     tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" .
 else

From f44c2242809393717a4ca49020dc76af71626f35 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 26 May 2026 21:36:30 -0700
Subject: [PATCH 36/37] fix: prioritize decode in gb200 flash profile

---
 .github/configs/nvidia-master.yaml                   |  2 +-
 .../agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml   | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index cb56f9a07..72a8ca70f 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -8694,7 +8694,7 @@ dsv4-flash-fp4-gb200-dynamo-vllm-mtp3-profile:
   scenarios:
     fixed-seq-len:
     - isl: 8192
-      osl: 2048
+      osl: 256
       search-space:
       - conc-list: [256]
         spec-decoding: mtp
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml
index e5c39f8af..a3acb5407 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml
@@ -65,11 +65,11 @@ backend:
       speculative-config: '{"method":"mtp","num_speculative_tokens":3}'
       attention-config: '{"use_fp4_indexer_cache":true}'
       tokenizer-mode: deepseek_v4
-      max-model-len: 10496
+      max-model-len: 8704
       max-num-seqs: 256
-      max-num-batched-tokens: 256
+      max-num-batched-tokens: 2048
       max-cudagraph-capture-size: 256
-      profiler-config: '{"profiler":"torch","torch_profiler_dir":"/logs/profiles/agg","ignore_frontend":true,"delay_iterations":2304,"max_iterations":1,"active_iterations":1,"torch_profiler_with_stack":false}'
+      profiler-config: '{"profiler":"torch","torch_profiler_dir":"/logs/profiles/agg","ignore_frontend":true,"delay_iterations":1152,"max_iterations":1,"active_iterations":1,"torch_profiler_with_stack":false}'
       trust-remote-code: true
       no-enable-prefix-caching: true
       no-enable-flashinfer-autotune: true
@@ -84,13 +84,13 @@ backend:
 profiling:
   type: "torch"
   aggregated:
-    start_step: 2304
-    stop_step: 2305
+    start_step: 1152
+    stop_step: 1153
 
 benchmark:
   type: "sa-bench"
   isl: 8192
-  osl: 2048
+  osl: 256
   concurrencies: "256"
   req_rate: "inf"
   num_prompts_mult: 1

From 4ae1fe68bb4e008576346bd71f86be8de07ad63d Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 26 May 2026 22:57:50 -0700
Subject: [PATCH 37/37] fix: profile final gb200 flash decode step

---
 .../8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml     | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml
index a3acb5407..00cf06b78 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml
@@ -69,7 +69,7 @@ backend:
       max-num-seqs: 256
       max-num-batched-tokens: 2048
       max-cudagraph-capture-size: 256
-      profiler-config: '{"profiler":"torch","torch_profiler_dir":"/logs/profiles/agg","ignore_frontend":true,"delay_iterations":1152,"max_iterations":1,"active_iterations":1,"torch_profiler_with_stack":false}'
+      profiler-config: '{"profiler":"torch","torch_profiler_dir":"/logs/profiles/agg","ignore_frontend":true,"delay_iterations":1296,"max_iterations":1,"active_iterations":1,"torch_profiler_with_stack":false}'
       trust-remote-code: true
       no-enable-prefix-caching: true
       no-enable-flashinfer-autotune: true
@@ -84,8 +84,8 @@ backend:
 profiling:
   type: "torch"
   aggregated:
-    start_step: 1152
-    stop_step: 1153
+    start_step: 1296
+    stop_step: 1297
 
 benchmark:
   type: "sa-bench"