diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 4a683d372..72a8ca70f 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2038,6 +2038,77 @@ dsv4-fp4-b300-sglang:
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 }
 
+# Targeted single-point Flash config for profile.yml. Keep the existing Pro
+# sweep entry above unchanged; this profile-only key reuses the same B300
+# SGLang launch path at the 1k1k, conc=64 point.
+dsv4-flash-fp4-b300-sglang:
+  image: lmsysorg/sglang:deepseek-v4-b300@sha256:2fec8d7958bb0d53b50d7bf04d6ae6a7de8a35503775826e0550a45dd8c3ee15
+  model: deepseek-ai/DeepSeek-V4-Flash
+  model-prefix: dsv4
+  runner: b300
+  precision: fp4
+  framework: sglang
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 4, ep: 1, conc-start: 64, conc-end: 64 }
+
+# Targeted single-point Flash vLLM profile matching the SGLang profile point
+# above. Keep this narrow so profile.yml dispatches only the intended 1k1k run.
+dsv4-flash-fp4-b300-vllm:
+  image: vllm/vllm-openai:v0.21.0
+  model: deepseek-ai/DeepSeek-V4-Flash
+  model-prefix: dsv4
+  runner: b300
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 4, ep: 1, conc-start: 64, conc-end: 64 }
+
+# Targeted Flash vLLM MTP DEP8 profile at the 1k1k point, with conc=8 in the
+# search space. The shared launcher maps dp-attn=true to DP without TP, and
+# selects 3 speculative tokens for this model.
+dsv4-flash-fp4-b300-vllm-mtp:
+  image: vllm/vllm-openai:v0.21.0
+  model: deepseek-ai/DeepSeek-V4-Flash
+  model-prefix: dsv4
+  runner: b300
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 8, conc-end: 8, spec-decoding: mtp }
+
+# Targeted Flash SGLang MTP profile: DEP4 at the same 1k1k conc=64 point as
+# the non-MTP dsv4-flash-fp4-b300-sglang entry. The shared SGLang MTP launcher
+# selects the Flash-only 3-step speculative settings for this model.
+dsv4-flash-fp4-b300-sglang-mtp:
+  image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3
+  model: deepseek-ai/DeepSeek-V4-Flash
+  model-prefix: dsv4
+  runner: b300
+  precision: fp4
+  framework: sglang
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 64, spec-decoding: mtp }
+
   # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is
   # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by
   # DP_ATTENTION:
@@ -8609,6 +8680,37 @@ dsv4-fp4-gb200-dynamo-vllm-mtp2:
           ep: 8
           dp-attn: true
 
+# Dedicated profile point for a single GB200 node / global batch 256 shape:
+# aggregated DEP4 on GB200, MTP3, conc=256.
+dsv4-flash-fp4-gb200-dynamo-vllm-mtp3-profile:
+  image: vllm/vllm-openai:v0.21.0-ubuntu2404
+  model: deepseek-ai/DeepSeek-V4-Flash
+  model-prefix: dsv4
+  runner: gb200
+  precision: fp4
+  framework: dynamo-vllm
+  multinode: true
+  disagg: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 8192
+      osl: 256
+      search-space:
+      - conc-list: [256]
+        spec-decoding: mtp
+        prefill:
+          num-worker: 4
+          tp: 1
+          ep: 4
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml"
+        decode:
+          num-worker: 0
+          tp: 1
+          ep: 1
+          dp-attn: false
+
 dsv4-fp4-b300-dynamo-vllm:
   image: vllm/vllm-openai:v0.20.1
   model: deepseek-ai/DeepSeek-V4-Pro
diff --git a/.github/workflows/profile.yml b/.github/workflows/profile.yml
index 8152d47a5..e2d08430f 100644
--- a/.github/workflows/profile.yml
+++ b/.github/workflows/profile.yml
@@ -112,7 +112,18 @@ jobs:
       TP: ${{ matrix.config.tp }}
       EP_SIZE: ${{ matrix.config.ep }}
       DP_ATTENTION: ${{ matrix.config['dp-attn'] }}
-      CONC: ${{ matrix.config.conc }}
+      CONC: ${{ toJson(matrix.config.conc) }}
+      CONC_JSON: ${{ toJson(matrix.config.conc) }}
+      PREFILL_NUM_WORKERS: ${{ matrix.config.prefill['num-worker'] }}
+      PREFILL_TP: ${{ matrix.config.prefill.tp }}
+      PREFILL_EP: ${{ matrix.config.prefill.ep }}
+      PREFILL_DP_ATTN: ${{ matrix.config.prefill['dp-attn'] }}
+      PREFILL_ADDITIONAL_SETTINGS_JSON: ${{ toJson(matrix.config.prefill['additional-settings']) }}
+      DECODE_NUM_WORKERS: ${{ matrix.config.decode['num-worker'] }}
+      DECODE_TP: ${{ matrix.config.decode.tp }}
+      DECODE_EP: ${{ matrix.config.decode.ep }}
+      DECODE_DP_ATTN: ${{ matrix.config.decode['dp-attn'] }}
+      DECODE_ADDITIONAL_SETTINGS_JSON: ${{ toJson(matrix.config.decode['additional-settings']) }}
       SPEC_DECODING: ${{ matrix.config.spec-decoding }}
       DISAGG: ${{ matrix.config.disagg }}
       MOE_DEBUG: '0'
@@ -148,7 +159,7 @@ jobs:
           ref: ${{ inputs.ref || github.sha }}
           clean: false
 
-      - name: Launch + Profile (single-node sglang/vllm)
+      - name: Launch + Profile
         id: run
         env:
           RUNNER_NAME: ${{ runner.name }}
@@ -159,20 +170,153 @@ jobs:
         shell: bash
         run: |
           set -euo pipefail
-          ep_val="${EP_SIZE:-1}"
-          res_name="${EXP_NAME}_${PRECISION}_${FRAMEWORK}_tp${TP}_ep${ep_val}_dpa_${DP_ATTENTION}_conc${CONC}_${RUNNER_NAME}"
+
+          export_additional_settings() {
+            local settings_json="$1"
+            python3 - "$settings_json" <<'PY'
+          import json
+          import sys
+
+          raw = sys.argv[1]
+          if not raw or raw == "null":
+              raise SystemExit(0)
+          for item in json.loads(raw) or []:
+              print(item)
+          PY
+          }
+
+          normalize_conc() {
+            python3 - <<'PY'
+          import json
+          import os
+
+          raw = os.environ.get("CONC_JSON") or os.environ.get("CONC") or "[]"
+          try:
+              value = json.loads(raw)
+          except json.JSONDecodeError:
+              value = raw
+          if isinstance(value, list):
+              print("x".join(str(v) for v in value))
+          else:
+              print(str(value))
+          PY
+          }
+
+          if [ -n "${PREFILL_NUM_WORKERS:-}" ] && [ -n "${DECODE_NUM_WORKERS:-}" ]; then
+            conc_val="$(normalize_conc)"
+            res_name="${EXP_NAME}_${PRECISION}_${FRAMEWORK}_prefill-tp${PREFILL_TP}-ep${PREFILL_EP}-dp${PREFILL_DP_ATTN}-nw${PREFILL_NUM_WORKERS}_decode-tp${DECODE_TP}-ep${DECODE_EP}-dp${DECODE_DP_ATTN}-nw${DECODE_NUM_WORKERS}_disagg-${DISAGG}_spec-${SPEC_DECODING}_conc${conc_val}_${RUNNER_NAME}"
+
+            echo "IS_MULTINODE=true" >> "$GITHUB_ENV"
+            echo "PREFILL_GPUS=$((PREFILL_NUM_WORKERS * PREFILL_TP))" >> "$GITHUB_ENV"
+            echo "DECODE_GPUS=$((DECODE_NUM_WORKERS * DECODE_TP))" >> "$GITHUB_ENV"
+
+            while IFS= read -r setting; do
+              if [ -n "$setting" ]; then
+                export "$setting"
+              fi
+            done < <(export_additional_settings "${PREFILL_ADDITIONAL_SETTINGS_JSON:-null}")
+            while IFS= read -r setting; do
+              if [ -n "$setting" ]; then
+                export "$setting"
+              fi
+            done < <(export_additional_settings "${DECODE_ADDITIONAL_SETTINGS_JSON:-null}")
+          else
+            ep_val="${EP_SIZE:-1}"
+            res_name="${EXP_NAME}_${PRECISION}_${FRAMEWORK}_tp${TP}_ep${ep_val}_dpa_${DP_ATTENTION}_conc${CONC}_${RUNNER_NAME}"
+          fi
+
           export RESULT_FILENAME="${res_name}"
           echo "RESULT_FILENAME=${res_name}" >> "$GITHUB_ENV"
 
+          echo "Removing stale profile artifacts from previous runs"
+          rm -rf LOGS
+          rm -f profile_*.trace.json.gz multinode_server_logs.tar.gz
+
           bash ./runners/launch_${RUNNER_NAME%%_*}.sh
 
           if [ ! -f "${res_name}.json" ]; then
-            echo "Run failed: Benchmark result ${res_name}.json not found." >&2
-            exit 1
+            result_candidate="$(find . -maxdepth 1 -type f -name "${res_name}_*.json" | sort | head -n1 || true)"
+            if [ -n "$result_candidate" ] && [ -f "$result_candidate" ]; then
+              cp "$result_candidate" "${res_name}.json"
+            else
+              echo "Run failed: Benchmark result ${res_name}.json not found." >&2
+              exit 1
+            fi
           fi
 
           trace_path="profile_${res_name}.trace.json.gz"
+          if [ ! -f "$trace_path" ] && [ -d LOGS ]; then
+            trace_candidate="$(python3 - <<'PY'
+          from pathlib import Path
+
+          root = Path("LOGS")
+
+          def is_trace_candidate(path: Path) -> bool:
+              name = path.name
+              if name.startswith("results_") or "profile_export" in name:
+                  return False
+              if name.endswith((".trace.json", ".trace.json.gz", ".pt.trace.json", ".pt.trace.json.gz")):
+                  return True
+              return "trace" in name and name.endswith((".json", ".json.gz"))
+
+          candidates = [p for p in root.rglob("*") if p.is_file() and is_trace_candidate(p)]
+          if candidates:
+              print(max(candidates, key=lambda p: (p.stat().st_mtime_ns, p.stat().st_size)))
+          PY
+          )"
+            if [ -n "$trace_candidate" ] && [ -f "$trace_candidate" ]; then
+              echo "Selected profile trace candidate: $trace_candidate"
+              if [[ "$trace_candidate" == *.gz ]]; then
+                cp "$trace_candidate" "$trace_path"
+              else
+                gzip -c "$trace_candidate" > "$trace_path"
+              fi
+            fi
+          fi
+
           if [ -f "$trace_path" ]; then
+            echo "Profile trace prepared: $trace_path"
+            ls -lh "$trace_path"
+            sha256sum "$trace_path"
+            python3 - "$trace_path" <<'PY'
+          import gzip
+          import os
+          import re
+          import sys
+
+          trace_path = sys.argv[1]
+          expected = set()
+          worker_gpus = []
+          for workers_key, tp_key in (
+              ("PREFILL_NUM_WORKERS", "PREFILL_TP"),
+              ("DECODE_NUM_WORKERS", "DECODE_TP"),
+          ):
+              workers = os.environ.get(workers_key)
+              tp = os.environ.get(tp_key)
+              if workers and workers.isdigit() and tp and tp.isdigit():
+                  gpus = int(workers) * int(tp)
+                  if gpus:
+                      expected.add(gpus)
+                      worker_gpus.append(gpus)
+          if len(worker_gpus) > 1:
+              expected.add(sum(worker_gpus))
+
+          opener = gzip.open if trace_path.endswith(".gz") else open
+          with opener(trace_path, "rt", errors="replace") as f:
+              prefix = f.read(1024 * 1024)
+
+          if '"traceEvents"' not in prefix:
+              raise SystemExit(f"{trace_path} does not look like a Perfetto trace: traceEvents key not found near start")
+
+          match = re.search(r'"world_size"\s*:\s*(\d+)', prefix)
+          if expected and match:
+              world_size = int(match.group(1))
+              if world_size not in expected:
+                  allowed = ", ".join(str(v) for v in sorted(expected))
+                  raise SystemExit(
+                      f"{trace_path} has distributed world_size={world_size}, expected one of: {allowed}"
+                  )
+          PY
             echo "trace=$trace_path" >> "$GITHUB_OUTPUT"
             if [ "${FRAMEWORK}" = "sglang" ]; then
               # Try to locate corresponding TP-0 traces produced by SGLang profiler
@@ -193,6 +337,11 @@ jobs:
             fi
           else
             echo "Profile trace not found: $trace_path" >&2
+            if [ -d LOGS ]; then
+              echo "LOGS profile candidates:" >&2
+              find LOGS -maxdepth 8 -type f \( -path "*/profiles/*" -o -name "*trace*" -o -name "*profile*" \) -printf "%p %s bytes\n" 2>/dev/null | sort >&2 || true
+            fi
+            exit 1
           fi
 
       - name: Process result (json -> agg)
@@ -206,7 +355,7 @@ jobs:
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
           name: profile_${{ env.RESULT_FILENAME }}
-          path: profile_${{ env.RESULT_FILENAME }}.trace.json.gz
+          path: ${{ steps.run.outputs.trace }}
           if-no-files-found: ignore
 
       - name: Upload TP-0-DECODE trace as artifact
@@ -240,7 +389,7 @@ jobs:
           repository: SemiAnalysisAI/InferenceX-trace-storage
           path: storage
           ref: master
-          ssh-key: ${{ secrets.PROFILER_STORAGE_DEPLOY_KEY }}
+          token: ${{ secrets.REPO_PAT }}
           fetch-depth: 0
 
       - name: Push profile to storage repo
@@ -248,25 +397,27 @@ jobs:
         id: push
         env:
           TRACE_LOCAL: ${{ steps.run.outputs.trace }}
+          REPO_PAT: ${{ secrets.REPO_PAT }}
         shell: bash
         run: |
           set -euo pipefail
 
-          dest_dir="storage/profiles/${GITHUB_SHA}/${{ matrix.config.runner }}/${{ matrix.config.framework }}/${{ matrix.config['exp-name'] }}_${{ matrix.config.precision }}_tp${{ matrix.config.tp }}_ep${{ matrix.config.ep || 1 }}_conc${{ matrix.config.conc }}"
+          dest_dir="storage/profiles/${GITHUB_SHA}/${{ matrix.config.runner }}/${{ matrix.config.framework }}/${RESULT_FILENAME}"
           mkdir -p "$dest_dir"
           cp "$TRACE_LOCAL" "$dest_dir/trace.json.gz"
 
           pushd storage >/dev/null
           git config user.name "github-actions"
           git config user.email "github-actions@github.com"
+          # Authenticate only the push itself so the PAT is never written to storage/.git/config.
           git add -A
-          git commit -m "Add profile: ${GITHUB_SHA} ${{ matrix.config['exp-name'] }} tp${{ matrix.config.tp }} ep${{ matrix.config.ep || 1 }} conc${{ matrix.config.conc }}" || echo "Nothing to commit"
-          git push
+          git commit -m "Add profile: ${GITHUB_SHA} ${RESULT_FILENAME}" || echo "Nothing to commit"
+          git push "https://x-access-token:${REPO_PAT}@github.com/SemiAnalysisAI/InferenceX-trace-storage.git" HEAD:master
           STORAGE_SHA="$(git rev-parse HEAD)"
           popd >/dev/null
 
-          export RAW_URL="https://raw.githubusercontent.com/SemiAnalysisAI/InferenceX-trace-storage/${STORAGE_SHA}/profiles/${GITHUB_SHA}/${{ matrix.config.runner }}/${{ matrix.config.framework }}/${{ matrix.config['exp-name'] }}_${{ matrix.config.precision }}_tp${{ matrix.config.tp }}_ep${{ matrix.config.ep || 1 }}_conc${{ matrix.config.conc }}/trace.json.gz"
-          export TITLE="${{ matrix.config['exp-name'] }}_${{ matrix.config.precision }}_tp${{ matrix.config.tp }}_ep${{ matrix.config.ep || 1 }}_conc${{ matrix.config.conc }}"
+          export RAW_URL="https://raw.githubusercontent.com/SemiAnalysisAI/InferenceX-trace-storage/${STORAGE_SHA}/profiles/${GITHUB_SHA}/${{ matrix.config.runner }}/${{ matrix.config.framework }}/${RESULT_FILENAME}/trace.json.gz"
+          export TITLE="${RESULT_FILENAME}"
 
           enc_src="$(python3 -c 'import os,urllib.parse; print(urllib.parse.quote(os.environ["RAW_URL"], safe=""))')"
           enc_title="$(python3 -c 'import os,urllib.parse; print(urllib.parse.quote(os.environ["TITLE"], safe=""))')"
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index cfd30cd04..e6980d0c1 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -178,6 +178,7 @@ wait_for_server_ready() {
 #   --max-concurrency: Max concurrency
 #   --result-filename: Result filename without extension
 #   --result-dir: Result directory
+#   --num-warmups: Optional warmup request count before benchmark/profile
 #   --use-chat-template: Optional flag to enable chat template
 #   --dsv4: Optional flag to use the DeepSeek-V4 chat template
 #           (encoding_dsv4.py) instead of the tokenizer's built-in jinja
@@ -204,6 +205,7 @@ run_benchmark_serving() {
     local result_filename=""
     local result_dir=""
     local workspace_dir=""
+    local num_warmups=""
     local use_chat_template=false
     local dsv4=false
     local trust_remote_code=false
@@ -259,6 +261,10 @@ run_benchmark_serving() {
                 workspace_dir="$2"
                 shift 2
                 ;;
+            --num-warmups)
+                num_warmups="$2"
+                shift 2
+                ;;
             --use-chat-template)
                 use_chat_template=true
                 shift
@@ -341,6 +347,10 @@ run_benchmark_serving() {
         num_prompts="$max_concurrency"
     fi
 
+    if [[ -z "$num_warmups" ]]; then
+        num_warmups="$((2 * max_concurrency))"
+    fi
+
     # Build benchmark command
     local benchmark_cmd=(
         python3 "$workspace_dir/utils/bench_serving/benchmark_serving.py"
@@ -357,7 +367,7 @@ run_benchmark_serving() {
         --ignore-eos
         "${profile_flag[@]}"
         --save-result
-        --num-warmups "$((2 * max_concurrency))" \
+        --num-warmups "$num_warmups" \
         --percentile-metrics 'ttft,tpot,itl,e2el'
         --result-dir "$result_dir"
         --result-filename "$result_filename.json"
@@ -508,7 +518,7 @@ move_profile_trace_for_relay() {
         return 0
     fi
 
-    local dest_trace="/workspace/profile_${RESULT_FILENAME}.trace.json.gz"
+    local dest_trace="$PWD/profile_${RESULT_FILENAME}.trace.json.gz"
     if [[ "$trace_file" == *.gz ]]; then
         cp -f "$trace_file" "$dest_trace"
     else
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml
new file mode 100644
index 000000000..00cf06b78
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml
@@ -0,0 +1,107 @@
+name: "svf-vllm-agg-gb200-flash-profile-4gpu-conc256-mtp3"
+
+model:
+  path: "deepseek-v4-flash"
+  container: "vllm/vllm-openai:v0.21.0-ubuntu2404"
+  precision: "fp4"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260426"
+
+setup_script: vllm-container-deps.sh
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  agg_nodes: 1
+  agg_workers: 1
+  gpus_per_agg: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+  aggregated_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_RPC_TIMEOUT: "1800000"
+    VLLM_TORCH_PROFILER_DIR: "/logs/profiles/agg"
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    TORCH_SYMMMEM: "NVSHMEM"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    VLLM_SERVER_DEV_MODE: "1"
+    VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024"
+    VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_P2P_LEVEL: "NVL"
+  vllm_config:
+    aggregated:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Flash"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-hybrid-lb: true
+      data-parallel-size: 4
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      enable-ep-weight-filter: true
+      moe-backend: deep_gemm_mega_moe
+      speculative-config: '{"method":"mtp","num_speculative_tokens":3}'
+      attention-config: '{"use_fp4_indexer_cache":true}'
+      tokenizer-mode: deepseek_v4
+      max-model-len: 8704
+      max-num-seqs: 256
+      max-num-batched-tokens: 2048
+      max-cudagraph-capture-size: 256
+      profiler-config: '{"profiler":"torch","torch_profiler_dir":"/logs/profiles/agg","ignore_frontend":true,"delay_iterations":1296,"max_iterations":1,"active_iterations":1,"torch_profiler_with_stack":false}'
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-flashinfer-autotune: true
+      block-size: 256
+      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+      all2all-backend: "flashinfer_nvlink_one_sided"
+
+profiling:
+  type: "torch"
+  aggregated:
+    start_step: 1296
+    stop_step: 1297
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 256
+  concurrencies: "256"
+  req_rate: "inf"
+  num_prompts_mult: 1
+  num_warmup_mult: 4
+  use_chat_template: true
+  custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer"
+
+identity:
+  model:
+    repo: "deepseek-ai/DeepSeek-V4-Flash"
+  container:
+    image: "vllm/vllm-openai:v0.21.0-ubuntu2404"
+  frameworks:
+    dynamo: "1.2.0.dev20260426"
diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
index 03102778d..0ce1f016f 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
@@ -77,11 +77,17 @@ if [ "${DP_ATTENTION}" = "true" ]; then
     export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
     export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
     export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
+    SPECULATIVE_NUM_STEPS=1  # default (Pro) recipe
+    SPECULATIVE_NUM_DRAFT_TOKENS=2  # single-chain draft (eagle-topk 1): num-steps + 1
+    if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
+        SPECULATIVE_NUM_STEPS=3  # Flash-only 3-step MTP
+        SPECULATIVE_NUM_DRAFT_TOKENS=4  # num-steps + 1, same rule as the default pair
+    fi
     SPEC_FLAGS=(
         --speculative-algorithm EAGLE
-        --speculative-num-steps 1
+        --speculative-num-steps "$SPECULATIVE_NUM_STEPS"
         --speculative-eagle-topk 1
-        --speculative-num-draft-tokens 2
+        --speculative-num-draft-tokens "$SPECULATIVE_NUM_DRAFT_TOKENS"
     )
     PARALLEL_ARGS=(
         --dp-size "$TP"
diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
index 92d4bf4ad..8bf458ae4 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
@@ -47,6 +47,14 @@ if [ "${DP_ATTENTION}" = "true" ]; then
     MOE_ARGS=(--moe-backend deep_gemm_mega_moe)
 fi
 
+PROFILE_ARGS=()
+if [[ "${PROFILE:-}" == "1" ]]; then
+    PROFILE_ARGS=(
+        --profiler-config
+        "{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\"}"
+    )
+fi
+
 if [ "${DP_ATTENTION}" = "true" ]; then
     MAX_NUM_BATCHED_TOKENS=2048
 else
@@ -76,6 +84,7 @@ vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \
     --no-enable-prefix-caching \
     "${EP_ARGS[@]}" \
     "${MOE_ARGS[@]}" \
+    "${PROFILE_ARGS[@]}" \
     --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' \
     --attention_config.use_fp4_indexer_cache True \
     --tokenizer-mode deepseek_v4 \
diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
index cb41a9eb1..efda4024d 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
@@ -44,7 +44,41 @@ else
     MAX_NUM_BATCHED_TOKENS=$(( ISL * 2 ))
 fi
 
+PROFILE_ARGS=()
+if [[ "${PROFILE:-}" == "1" ]]; then
+    PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\"}"
+    if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
+        PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\",\"ignore_frontend\":true,\"delay_iterations\":3,\"max_iterations\":1,\"active_iterations\":1,\"torch_profiler_with_stack\":false}"
+    fi
+    PROFILE_ARGS=(
+        --profiler-config
+        "$PROFILER_CONFIG"
+    )
+fi
+
+COMPILATION_ARGS=(
+    --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}'
+    --max-cudagraph-capture-size 2048
+)
+if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
+    COMPILATION_ARGS=(
+        --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}'
+        --max-cudagraph-capture-size 2048
+    )
+fi
+
 BENCHMARK_MAX_MODEL_LEN=$MAX_MODEL_LEN
+BENCHMARK_OUTPUT_LEN=$OSL
+BENCHMARK_NUM_PROMPTS=$((CONC * 10))
+BENCHMARK_MAX_CONCURRENCY=$CONC
+BENCHMARK_NUM_WARMUPS=$((2 * BENCHMARK_MAX_CONCURRENCY))
+
+if [[ "${PROFILE:-}" == "1" && "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
+    BENCHMARK_OUTPUT_LEN=3
+    BENCHMARK_NUM_PROMPTS=256
+    BENCHMARK_MAX_CONCURRENCY=256
+    BENCHMARK_NUM_WARMUPS=4096
+fi
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     EVAL_MAX_MODEL_LEN=$(compute_eval_context_length "$MODEL" "$BENCHMARK_MAX_MODEL_LEN")
@@ -54,8 +88,12 @@ else
     SERVE_MAX_MODEL_LEN="$BENCHMARK_MAX_MODEL_LEN"
 fi
 
-# use 2 speculative tokens for all configs for now
+# Keep the existing Pro MTP profile at 2 speculative tokens; Flash uses the
+# requested 3-token MTP profile.
 NUM_SPEC_TOKENS=2
+if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
+    NUM_SPEC_TOKENS=3
+fi
 
 start_gpu_monitor
 
@@ -69,13 +107,13 @@ vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \
     --no-enable-prefix-caching \
     "${EP_ARGS[@]}" \
     "${MOE_ARGS[@]}" \
-    --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' \
+    "${PROFILE_ARGS[@]}" \
+    "${COMPILATION_ARGS[@]}" \
     --attention_config.use_fp4_indexer_cache True \
     --tokenizer-mode deepseek_v4 \
     --tool-call-parser deepseek_v4 \
     --enable-auto-tool-choice \
     --reasoning-parser deepseek_v4 \
-    --max-cudagraph-capture-size 2048 \
     --speculative-config "{\"method\": \"mtp\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS}" \
     --max-model-len "$SERVE_MAX_MODEL_LEN" \
     --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" > "$SERVER_LOG" 2>&1 &
@@ -93,10 +131,11 @@ run_benchmark_serving \
     --port "$PORT" \
     --backend vllm \
     --input-len "$ISL" \
-    --output-len "$OSL" \
+    --output-len "$BENCHMARK_OUTPUT_LEN" \
     --random-range-ratio "$RANDOM_RANGE_RATIO" \
-    --num-prompts "$((CONC * 10))" \
-    --max-concurrency "$CONC" \
+    --num-prompts "$BENCHMARK_NUM_PROMPTS" \
+    --max-concurrency "$BENCHMARK_MAX_CONCURRENCY" \
+    --num-warmups "$BENCHMARK_NUM_WARMUPS" \
     --result-filename "$RESULT_FILENAME" \
     --result-dir /workspace/ \
     --trust-remote-code \
diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
index cca8b4ab0..fcc630db9 100644
--- a/runners/launch_b300-nv.sh
+++ b/runners/launch_b300-nv.sh
@@ -287,7 +287,7 @@ else
     HF_HUB_CACHE_MOUNT="/data/models"
     if [[ "$MODEL" == "Qwen/Qwen3.5-397B-A17B-FP8" ]]; then
         export MODEL="$HF_HUB_CACHE_MOUNT/${MODEL#*/}"
-    elif [[ "$MODEL_PREFIX" == "dsv4" ]]; then
+    elif [[ "$MODEL_PREFIX" == "dsv4" && "$MODEL" == "deepseek-ai/DeepSeek-V4-Pro" ]]; then
         export MODEL="$HF_HUB_CACHE_MOUNT/dsv4-pro"
     fi
     SQUASH_FILE="/data/home/sa-shared/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh
index dada98bd6..ed4824ef5 100755
--- a/runners/launch_gb200-nv.sh
+++ b/runners/launch_gb200-nv.sh
@@ -16,11 +16,15 @@ if [[ $FRAMEWORK == "dynamo-sglang" ]]; then
         export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528-fp4-v2/"
         export SRT_SLURM_MODEL_PREFIX="dsr1-fp4"
     elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then
-        # Same compute-node-local NVMe path as the dynamo-vllm dsv4
-        # branch — see that branch for rationale. SRT_SLURM_MODEL_PREFIX
-        # matches the model.path alias in our DSV4 sglang recipes.
-        export MODEL_PATH="/mnt/numa1/models/deepseek-v4-pro/"
-        export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro"
+        # SRT_SLURM_MODEL_PREFIX matches the model.path alias in our DSV4
+        # sglang recipes.
+        if [[ $MODEL == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
+            export MODEL_PATH="/mnt/lustre01/users/sa-shared/DeepSeek-V4-Flash/"
+            export SRT_SLURM_MODEL_PREFIX="deepseek-v4-flash"
+        else
+            export MODEL_PATH="/mnt/numa1/models/deepseek-v4-pro/"
+            export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro"
+        fi
     else
         export MODEL_PATH=$MODEL
     fi
@@ -49,11 +53,15 @@ elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then
         export MODEL_PATH="/mnt/lustre01/models/kimi-k2.5-nvfp4"
         export SRT_SLURM_MODEL_PREFIX="kimi-k2.5-nvfp4"
     elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then
-        # Weights live on compute-node local NVMe (/mnt/numa1) — no Lustre
-        # contention, fast startup. SRT_SLURM_MODEL_PREFIX matches the
-        # model.path alias in our DSV4 recipes.
-        export MODEL_PATH="/mnt/numa1/models/deepseek-v4-pro/"
-        export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro"
+        # SRT_SLURM_MODEL_PREFIX matches the model.path alias in our DSV4
+        # recipes.
+        if [[ $MODEL == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
+            export MODEL_PATH="/mnt/lustre01/users/sa-shared/DeepSeek-V4-Flash/"
+            export SRT_SLURM_MODEL_PREFIX="deepseek-v4-flash"
+        else
+            export MODEL_PATH="/mnt/numa1/models/deepseek-v4-pro/"
+            export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro"
+        fi
     else
         echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4"
         exit 1
@@ -296,6 +304,7 @@ echo "Collecting results..."
 
 if [ -d "$LOGS_DIR" ]; then
     echo "Found logs directory: $LOGS_DIR"
+    rm -rf "${GITHUB_WORKSPACE:?}/LOGS"  # drop stale logs; :? aborts instead of expanding to /LOGS when unset
     cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS"
     tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" .
 else
diff --git a/utils/process_result.py b/utils/process_result.py
index 4603287bc..2010c09ff 100644
--- a/utils/process_result.py
+++ b/utils/process_result.py
@@ -1,5 +1,6 @@
 import sys
 import json
+import math
 import os
 from pathlib import Path
 
@@ -128,8 +129,9 @@ def get_required_env_vars(required_vars):
     if key.endswith('ms'):
         data[key.replace('_ms', '')] = float(value) / 1000.0
     if 'tpot' in key:
+        tpot_ms = float(value)
         data[key.replace('_ms', '').replace(
-            'tpot', 'intvty')] = 1000.0 / float(value)
+            'tpot', 'intvty')] = 1000.0 / tpot_ms if math.isfinite(tpot_ms) and tpot_ms > 0 else 0.0
 
 print(json.dumps(data, indent=2))
 
diff --git a/utils/test_process_result.py b/utils/test_process_result.py
index e3903c6e6..edeba20be 100644
--- a/utils/test_process_result.py
+++ b/utils/test_process_result.py
@@ -342,6 +342,22 @@ def test_tpot_to_interactivity_conversion(self, tmp_path, single_node_env_vars):
         assert output_data["intvty_p50"] == pytest.approx(50.0)
         assert output_data["intvty_p99"] == pytest.approx(20.0)
 
+    def test_zero_tpot_interactivity_is_guarded(self, tmp_path, single_node_env_vars):
+        """Test that zero TPOT fields do not crash interactivity conversion."""
+        benchmark_result = {
+            "model_id": "test-model",
+            "max_concurrency": 1,
+            "total_token_throughput": 1000.0,
+            "output_throughput": 800.0,
+            "mean_tpot_ms": 0.0,
+        }
+
+        result = run_script(tmp_path, single_node_env_vars, benchmark_result)
+        assert result.returncode == 0, f"Script failed: {result.stderr}"
+
+        output_data = json.loads(result.stdout)
+        assert output_data["mean_intvty"] == pytest.approx(0.0)
+
     def test_throughput_per_gpu_single_node(self, tmp_path, single_node_env_vars):
         """Test throughput per GPU calculation for single node."""
         benchmark_result = {