SemiAnalysisAI · Oseltamivir · May 21, 2026 · May 21, 2026 · May 21, 2026 · May 21, 2026
@@ -2038,6 +2038,77 @@ dsv4-fp4-b300-sglang:
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 }
 
+# Targeted single-point Flash config for profile.yml. Keep the existing Pro
+# sweep entry above unchanged; this profile-only key reuses the same B300
+# SGLang launch path at the 1k1k, conc=64 point.
+dsv4-flash-fp4-b300-sglang:
+  image: lmsysorg/sglang:deepseek-v4-b300@sha256:2fec8d7958bb0d53b50d7bf04d6ae6a7de8a35503775826e0550a45dd8c3ee15
+  model: deepseek-ai/DeepSeek-V4-Flash
+  model-prefix: dsv4
+  runner: b300
+  precision: fp4
+  framework: sglang
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 4, ep: 1, conc-start: 64, conc-end: 64 }
+
+# Targeted single-point Flash vLLM profile matching the SGLang profile point
+# above. Keep this narrow so profile.yml dispatches only the intended 1k1k run.
+dsv4-flash-fp4-b300-vllm:
+  image: vllm/vllm-openai:v0.21.0
+  model: deepseek-ai/DeepSeek-V4-Flash
+  model-prefix: dsv4
+  runner: b300
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 4, ep: 1, conc-start: 64, conc-end: 64 }
+
+# Targeted Flash vLLM MTP DEP8 profile: a single 1k1k point at conc=8 (the
+# non-MTP profiles above use conc=64). The shared launcher maps dp-attn=true to
+# DP without TP, and selects 3 speculative tokens for this model.
+dsv4-flash-fp4-b300-vllm-mtp:
+  image: vllm/vllm-openai:v0.21.0
+  model: deepseek-ai/DeepSeek-V4-Flash
+  model-prefix: dsv4
+  runner: b300
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 8, conc-end: 8, spec-decoding: mtp }
+
+# Targeted Flash SGLang MTP profile: DEP4 at the 1k1k conc=64 point used by
+# dsv4-flash-fp4-b300-sglang, but on a different image digest. The shared SGLang
+# MTP launcher selects the Flash-only (steps=3, draft-tokens=3) speculative settings.
+dsv4-flash-fp4-b300-sglang-mtp:
+  image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3
+  model: deepseek-ai/DeepSeek-V4-Flash
+  model-prefix: dsv4
+  runner: b300
+  precision: fp4
+  framework: sglang
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 64, spec-decoding: mtp }
+
   # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is
   # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by
   # DP_ATTENTION:
@@ -8609,6 +8680,37 @@ dsv4-fp4-gb200-dynamo-vllm-mtp2:
           ep: 8
           dp-attn: true
 
+# Dedicated profile point for a single GB200 node / global batch 256 shape:
+# aggregated DEP4 on GB200, MTP3, conc=256, isl=8192/osl=256 (8k1k recipe dir).
+dsv4-flash-fp4-gb200-dynamo-vllm-mtp3-profile:
+  image: vllm/vllm-openai:v0.21.0-ubuntu2404
+  model: deepseek-ai/DeepSeek-V4-Flash
+  model-prefix: dsv4
+  runner: gb200
+  precision: fp4
+  framework: dynamo-vllm
+  multinode: true
+  disagg: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 8192
+      osl: 256
+      search-space:
+      - conc-list: [256]
+        spec-decoding: mtp
+        prefill:
+          num-worker: 4
+          tp: 1
+          ep: 4
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml"
+        decode:
+          num-worker: 0
+          tp: 1
+          ep: 1
+          dp-attn: false
+
 dsv4-fp4-b300-dynamo-vllm:
   image: vllm/vllm-openai:v0.20.1
   model: deepseek-ai/DeepSeek-V4-Pro

diff --git a/.github/workflows/profile.yml b/.github/workflows/profile.yml
@@ -112,7 +112,18 @@ jobs:
       TP: ${{ matrix.config.tp }}
       EP_SIZE: ${{ matrix.config.ep }}
       DP_ATTENTION: ${{ matrix.config['dp-attn'] }}
-      CONC: ${{ matrix.config.conc }}
+      CONC: ${{ toJson(matrix.config.conc) }}
+      CONC_JSON: ${{ toJson(matrix.config.conc) }}
+      PREFILL_NUM_WORKERS: ${{ matrix.config.prefill['num-worker'] }}
+      PREFILL_TP: ${{ matrix.config.prefill.tp }}
+      PREFILL_EP: ${{ matrix.config.prefill.ep }}
+      PREFILL_DP_ATTN: ${{ matrix.config.prefill['dp-attn'] }}
+      PREFILL_ADDITIONAL_SETTINGS_JSON: ${{ toJson(matrix.config.prefill['additional-settings']) }}
+      DECODE_NUM_WORKERS: ${{ matrix.config.decode['num-worker'] }}
+      DECODE_TP: ${{ matrix.config.decode.tp }}
+      DECODE_EP: ${{ matrix.config.decode.ep }}
+      DECODE_DP_ATTN: ${{ matrix.config.decode['dp-attn'] }}
+      DECODE_ADDITIONAL_SETTINGS_JSON: ${{ toJson(matrix.config.decode['additional-settings']) }}
       SPEC_DECODING: ${{ matrix.config.spec-decoding }}
       DISAGG: ${{ matrix.config.disagg }}
       MOE_DEBUG: '0'
@@ -148,7 +159,7 @@ jobs:
           ref: ${{ inputs.ref || github.sha }}
           clean: false
 
-      - name: Launch + Profile (single-node sglang/vllm)
+      - name: Launch + Profile
         id: run
         env:
           RUNNER_NAME: ${{ runner.name }}
@@ -159,20 +170,153 @@ jobs:
         shell: bash
         run: |
           set -euo pipefail
-          ep_val="${EP_SIZE:-1}"
-          res_name="${EXP_NAME}_${PRECISION}_${FRAMEWORK}_tp${TP}_ep${ep_val}_dpa_${DP_ATTENTION}_conc${CONC}_${RUNNER_NAME}"
+
+          export_additional_settings() {
+            local settings_json="$1"
+            python3 - "$settings_json" <<'PY'
+          import json
+          import sys
+
+          raw = sys.argv[1]
+          if not raw or raw == "null":
+              raise SystemExit(0)
+          for item in json.loads(raw) or []:
+              print(item)
+          PY
+          }
+
+          normalize_conc() {
+            python3 - <<'PY'
+          import json
+          import os
+
+          raw = os.environ.get("CONC_JSON") or os.environ.get("CONC") or "[]"
+          try:
+              value = json.loads(raw)
+          except json.JSONDecodeError:
+              value = raw
+          if isinstance(value, list):
+              print("x".join(str(v) for v in value))
+          else:
+              print(str(value))
+          PY
+          }
+
+          if [ -n "${PREFILL_NUM_WORKERS:-}" ] && [ -n "${DECODE_NUM_WORKERS:-}" ]; then
+            conc_val="$(normalize_conc)"
+            res_name="${EXP_NAME}_${PRECISION}_${FRAMEWORK}_prefill-tp${PREFILL_TP}-ep${PREFILL_EP}-dp${PREFILL_DP_ATTN}-nw${PREFILL_NUM_WORKERS}_decode-tp${DECODE_TP}-ep${DECODE_EP}-dp${DECODE_DP_ATTN}-nw${DECODE_NUM_WORKERS}_disagg-${DISAGG}_spec-${SPEC_DECODING}_conc${conc_val}_${RUNNER_NAME}"
+
+            echo "IS_MULTINODE=true" >> "$GITHUB_ENV"
+            echo "PREFILL_GPUS=$((PREFILL_NUM_WORKERS * PREFILL_TP))" >> "$GITHUB_ENV"
+            echo "DECODE_GPUS=$((DECODE_NUM_WORKERS * DECODE_TP))" >> "$GITHUB_ENV"
+
+            while IFS= read -r setting; do
+              if [ -n "$setting" ]; then
+                export "$setting"
+              fi
+            done < <(export_additional_settings "${PREFILL_ADDITIONAL_SETTINGS_JSON:-null}")
+            while IFS= read -r setting; do
+              if [ -n "$setting" ]; then
+                export "$setting"
+              fi
+            done < <(export_additional_settings "${DECODE_ADDITIONAL_SETTINGS_JSON:-null}")
+          else
+            ep_val="${EP_SIZE:-1}"
+            res_name="${EXP_NAME}_${PRECISION}_${FRAMEWORK}_tp${TP}_ep${ep_val}_dpa_${DP_ATTENTION}_conc${CONC}_${RUNNER_NAME}"
+          fi
+
           export RESULT_FILENAME="${res_name}"
           echo "RESULT_FILENAME=${res_name}" >> "$GITHUB_ENV"
 
+          echo "Removing stale profile artifacts from previous runs"
+          rm -rf LOGS
+          rm -f profile_*.trace.json.gz multinode_server_logs.tar.gz
+
           bash ./runners/launch_${RUNNER_NAME%%_*}.sh
 
           if [ ! -f "${res_name}.json" ]; then
-            echo "Run failed: Benchmark result ${res_name}.json not found." >&2
-            exit 1
+            result_candidate="$(find . -maxdepth 1 -type f -name "${res_name}_*.json" | sort | head -n1 || true)"
+            if [ -n "$result_candidate" ] && [ -f "$result_candidate" ]; then
+              cp "$result_candidate" "${res_name}.json"
+            else
+              echo "Run failed: Benchmark result ${res_name}.json not found." >&2
+              exit 1
+            fi
           fi
 
           trace_path="profile_${res_name}.trace.json.gz"
+          if [ ! -f "$trace_path" ] && [ -d LOGS ]; then
+            trace_candidate="$(python3 - <<'PY'
+          from pathlib import Path
+
+          root = Path("LOGS")
+
+          def is_trace_candidate(path: Path) -> bool:
+              name = path.name
+              if name.startswith("results_") or "profile_export" in name:
+                  return False
+              if name.endswith((".trace.json", ".trace.json.gz", ".pt.trace.json", ".pt.trace.json.gz")):
+                  return True
+              return "trace" in name and name.endswith((".json", ".json.gz"))
+
+          candidates = [p for p in root.rglob("*") if p.is_file() and is_trace_candidate(p)]
+          if candidates:
+              print(max(candidates, key=lambda p: (p.stat().st_mtime_ns, p.stat().st_size)))
+          PY
+          )"
+            if [ -n "$trace_candidate" ] && [ -f "$trace_candidate" ]; then
+              echo "Selected profile trace candidate: $trace_candidate"
+              if [[ "$trace_candidate" == *.gz ]]; then
+                cp "$trace_candidate" "$trace_path"
+              else
+                gzip -c "$trace_candidate" > "$trace_path"
+              fi
+            fi
+          fi
+
           if [ -f "$trace_path" ]; then
+            echo "Profile trace prepared: $trace_path"
+            ls -lh "$trace_path"
+            sha256sum "$trace_path"
+            python3 - "$trace_path" <<'PY'
+          import gzip
+          import os
+          import re
+          import sys
+
+          trace_path = sys.argv[1]
+          expected = set()
+          worker_gpus = []
+          for workers_key, tp_key in (
+              ("PREFILL_NUM_WORKERS", "PREFILL_TP"),
+              ("DECODE_NUM_WORKERS", "DECODE_TP"),
+          ):
+              workers = os.environ.get(workers_key)
+              tp = os.environ.get(tp_key)
+              if workers and workers.isdigit() and tp and tp.isdigit():
+                  gpus = int(workers) * int(tp)
+                  if gpus:
+                      expected.add(gpus)
+                      worker_gpus.append(gpus)
+          if len(worker_gpus) > 1:
+              expected.add(sum(worker_gpus))
+
+          opener = gzip.open if trace_path.endswith(".gz") else open
+          with opener(trace_path, "rt", errors="replace") as f:
+              prefix = f.read(1024 * 1024)
+
+          if '"traceEvents"' not in prefix:
+              raise SystemExit(f"{trace_path} does not look like a Perfetto trace: traceEvents key not found near start")
+
+          match = re.search(r'"world_size"\s*:\s*(\d+)', prefix)
+          if expected and match:
+              world_size = int(match.group(1))
+              if world_size not in expected:
+                  allowed = ", ".join(str(v) for v in sorted(expected))
+                  raise SystemExit(
+                      f"{trace_path} has distributed world_size={world_size}, expected one of: {allowed}"
+                  )
+          PY
             echo "trace=$trace_path" >> "$GITHUB_OUTPUT"
             if [ "${FRAMEWORK}" = "sglang" ]; then
               # Try to locate corresponding TP-0 traces produced by SGLang profiler
@@ -193,6 +337,11 @@ jobs:
             fi
           else
             echo "Profile trace not found: $trace_path" >&2
+            if [ -d LOGS ]; then
+              echo "LOGS profile candidates:" >&2
+              find LOGS -maxdepth 8 -type f \( -path "*/profiles/*" -o -name "*trace*" -o -name "*profile*" \) -printf "%p %s bytes\n" 2>/dev/null | sort >&2 || true
+            fi
+            exit 1
           fi
 
       - name: Process result (json -> agg)
@@ -206,7 +355,7 @@ jobs:
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
           name: profile_${{ env.RESULT_FILENAME }}
-          path: profile_${{ env.RESULT_FILENAME }}.trace.json.gz
+          path: ${{ steps.run.outputs.trace }}
           if-no-files-found: ignore
 
       - name: Upload TP-0-DECODE trace as artifact
@@ -240,33 +389,35 @@ jobs:
           repository: SemiAnalysisAI/InferenceX-trace-storage
           path: storage
           ref: master
-          ssh-key: ${{ secrets.PROFILER_STORAGE_DEPLOY_KEY }}
+          token: ${{ secrets.REPO_PAT }}
           fetch-depth: 0
 
       - name: Push profile to storage repo
         if: ${{ steps.run.outputs.trace != '' }}
         id: push
         env:
           TRACE_LOCAL: ${{ steps.run.outputs.trace }}
+          REPO_PAT: ${{ secrets.REPO_PAT }}
         shell: bash
         run: |
           set -euo pipefail
 
-          dest_dir="storage/profiles/${GITHUB_SHA}/${{ matrix.config.runner }}/${{ matrix.config.framework }}/${{ matrix.config['exp-name'] }}_${{ matrix.config.precision }}_tp${{ matrix.config.tp }}_ep${{ matrix.config.ep || 1 }}_conc${{ matrix.config.conc }}"
+          dest_dir="storage/profiles/${GITHUB_SHA}/${{ matrix.config.runner }}/${{ matrix.config.framework }}/${RESULT_FILENAME}"
           mkdir -p "$dest_dir"
           cp "$TRACE_LOCAL" "$dest_dir/trace.json.gz"
 
           pushd storage >/dev/null
           git config user.name "github-actions"
           git config user.email "github-actions@github.com"
+          # Push via an inline URL below so the PAT is never written to storage/.git/config.
           git add -A
-          git commit -m "Add profile: ${GITHUB_SHA} ${{ matrix.config['exp-name'] }} tp${{ matrix.config.tp }} ep${{ matrix.config.ep || 1 }} conc${{ matrix.config.conc }}" || echo "Nothing to commit"
-          git push
+          git commit -m "Add profile: ${GITHUB_SHA} ${RESULT_FILENAME}" || echo "Nothing to commit"
+          git push "https://x-access-token:${REPO_PAT}@github.com/SemiAnalysisAI/InferenceX-trace-storage.git" HEAD:master
           STORAGE_SHA="$(git rev-parse HEAD)"
           popd >/dev/null
 
-          export RAW_URL="https://raw.githubusercontent.com/SemiAnalysisAI/InferenceX-trace-storage/${STORAGE_SHA}/profiles/${GITHUB_SHA}/${{ matrix.config.runner }}/${{ matrix.config.framework }}/${{ matrix.config['exp-name'] }}_${{ matrix.config.precision }}_tp${{ matrix.config.tp }}_ep${{ matrix.config.ep || 1 }}_conc${{ matrix.config.conc }}/trace.json.gz"
-          export TITLE="${{ matrix.config['exp-name'] }}_${{ matrix.config.precision }}_tp${{ matrix.config.tp }}_ep${{ matrix.config.ep || 1 }}_conc${{ matrix.config.conc }}"
+          export RAW_URL="https://raw.githubusercontent.com/SemiAnalysisAI/InferenceX-trace-storage/${STORAGE_SHA}/profiles/${GITHUB_SHA}/${{ matrix.config.runner }}/${{ matrix.config.framework }}/${RESULT_FILENAME}/trace.json.gz"
+          export TITLE="${RESULT_FILENAME}"
 
           enc_src="$(python3 -c 'import os,urllib.parse; print(urllib.parse.quote(os.environ["RAW_URL"], safe=""))')"
           enc_title="$(python3 -c 'import os,urllib.parse; print(urllib.parse.quote(os.environ["TITLE"], safe=""))')"