From 9cc6d50d23ac7d346918390592b6f9d919508f66 Mon Sep 17 00:00:00 2001
From: "Albert Cheng (Engrg-Hardware 1)"
 <albecheng@login-lyris01.lyris.clusters.nvidia.com>
Date: Wed, 1 Apr 2026 13:35:46 -0700
Subject: [PATCH 01/14] Make Dynamo source install container-agnostic (vLLM,
 SGLang, etc.)

Auto-detect the container type at runtime: if /sgl-workspace exists (SGLang),
use the original install path unchanged; otherwise use a portable /tmp build
path with conditional dependency installation for non-SGLang containers.
---
 src/srtctl/core/schema.py | 32 ++++++++++++++++++++++++++++++--
 tests/test_configs.py     | 10 +++++++++-
 2 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/src/srtctl/core/schema.py b/src/srtctl/core/schema.py
index 97547fec..085db6c8 100644
--- a/src/srtctl/core/schema.py
+++ b/src/srtctl/core/schema.py
@@ -719,8 +719,8 @@ def get_install_commands(self) -> str:
         git_ref = self.hash if self.hash else "HEAD"
         checkout_cmd = f"git checkout {self.hash}" if self.hash else ""
 
-        return (
-            f"echo 'Installing dynamo from source ({git_ref})...' && "
+        # Original SGLang container path, UNCHANGED
+        sglang = (
             "apt-get update -qq && apt-get install -y -qq libclang-dev > /dev/null 2>&1 && "
             "cd /sgl-workspace/ && "
             "git clone https://github.com/ai-dynamo/dynamo.git && "
@@ -736,6 +736,34 @@ def get_install_commands(self) -> str:
             f"echo 'Dynamo installed from source ({git_ref})'"
         )
 
+        # Portable path for non-SGLang containers (vLLM, etc.)
+        portable = (
+            "if ! command -v cargo &> /dev/null || ! command -v maturin &> /dev/null; then "
+            "apt-get update -qq && apt-get install -y -qq git curl libclang-dev protobuf-compiler > /dev/null 2>&1 && "
+            "if ! command -v cargo &> /dev/null; then "
+            "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y && source $HOME/.cargo/env; fi && "
+            "if ! command -v maturin &> /dev/null; then "
+            "pip install --break-system-packages maturin; fi; fi && "
+            "ORIG_DIR=$(pwd) && rm -rf /tmp/dynamo_build && mkdir -p /tmp/dynamo_build && cd /tmp/dynamo_build && "
+            "git clone https://github.com/ai-dynamo/dynamo.git && "
+            "cd dynamo && "
+            f"{checkout_cmd + ' && ' if checkout_cmd else ''}"
+            "cd lib/bindings/python/ && "
+            'export RUSTFLAGS="${RUSTFLAGS:-} -C target-cpu=native --cfg tokio_unstable" && '
+            "rm -f /tmp/ai_dynamo_runtime*.whl && "
+            "maturin build -o /tmp && "
+            "pip install --break-system-packages /tmp/ai_dynamo_runtime*.whl --force-reinstall && "
+            "cd /tmp/dynamo_build/dynamo/ && "
+            "pip install --break-system-packages -e . && "
+            "cd $ORIG_DIR && "
+            f"echo 'Dynamo installed from source ({git_ref})'"
+        )
+
+        return (
+            f"echo 'Installing dynamo from source ({git_ref})...' && "
+            f"if [ -d /sgl-workspace ]; then {sglang}; else {portable}; fi"
+        )
+
     Schema: ClassVar[type[Schema]] = Schema
 
 
diff --git a/tests/test_configs.py b/tests/test_configs.py
index 1c23fb30..b1ef1736 100644
--- a/tests/test_configs.py
+++ b/tests/test_configs.py
@@ -127,7 +127,11 @@ def test_hash_install_command(self):
         assert "git clone" in cmd
         assert "git checkout abc123" in cmd
         assert "maturin build" in cmd
-        assert "pip install -e" in cmd
+        assert "if [ -d /sgl-workspace ]" in cmd
+        assert "/tmp/dynamo_build" in cmd
+        assert "protobuf-compiler" in cmd
+        assert "if ! command -v cargo" in cmd
+        assert "if ! command -v maturin" in cmd
 
     def test_top_of_tree_install_command(self):
         """Top-of-tree config generates source install without checkout."""
@@ -140,6 +144,10 @@ def test_top_of_tree_install_command(self):
         assert "git clone" in cmd
         assert "git checkout" not in cmd
         assert "maturin build" in cmd
+        assert "if [ -d /sgl-workspace ]" in cmd
+        assert "/tmp/dynamo_build" in cmd
+        assert "--break-system-packages" in cmd
+        assert "--force-reinstall" in cmd
 
     def test_hash_and_top_of_tree_not_allowed(self):
         """Cannot specify both hash and top_of_tree."""

From 8294e64ee2eefa075c2502a62e19e8cd8e6ca23a Mon Sep 17 00:00:00 2001
From: nlevin-ui <nlevin@nvidia.com>
Date: Mon, 6 Apr 2026 17:19:27 -0600
Subject: [PATCH 02/14] Add Kimi-K2.5 vLLM recipes and fix NIXL side channel
 host (#11)

* Add Kimi-K2.5 vLLM recipes and fix NIXL side channel host

- Add kimi-k2.5 1k1k and 8k1k disagg GB200 recipes (from NVIDIA/srt-slurm#7)
- Fix vLLM NIXL handshake failures: set VLLM_NIXL_SIDE_CHANNEL_HOST to the
  node's routable IP in get_process_environment() instead of leaving it
  as 0.0.0.0/localhost, which caused transfer handshake failures
- Update test_vllm_get_process_environment to cover NIXL host env var

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* ci: run checks on PRs targeting sa-submission-q2-2026

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/workflows/ci.yaml                     |   3 +-
 .../1k1k/disagg-gb200-1p1d-dep4-dep16.yaml    | 101 ++++++++++++++++++
 .../1k1k/disagg-gb200-1p4d-dep4-tep4.yaml     |  98 +++++++++++++++++
 .../8k1k/disagg-gb200-1p4d-dep4-tep4.yaml     |  98 +++++++++++++++++
 .../8k1k/disagg-gb200-3p1d-dep4-dep16.yaml    | 101 ++++++++++++++++++
 .../8k1k/disagg-gb200-5p1d-dep4-dep8.yaml     | 101 ++++++++++++++++++
 .../8k1k/disagg-gb200-6p1d-dep4-dep16.yaml    | 101 ++++++++++++++++++
 src/srtctl/backends/vllm.py                   |   4 +
 tests/test_configs.py                         |   6 +-
 9 files changed, 611 insertions(+), 2 deletions(-)
 create mode 100644 recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml
 create mode 100644 recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml
 create mode 100644 recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml
 create mode 100644 recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml
 create mode 100644 recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml
 create mode 100644 recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index eba897bb..dccdba05 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -4,7 +4,7 @@ on:
   push:
     branches: [main, master]
   pull_request:
-    branches: [main, master]
+    branches: [main, master, sa-submission-q2-2026]
 
 jobs:
   lint:
@@ -119,3 +119,4 @@ jobs:
               exit(1)
           print(f'\nAll {len(recipes)} recipes valid')
           "
+
diff --git a/recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml b/recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml
new file mode 100644
index 00000000..ecdc9233
--- /dev/null
+++ b/recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml
@@ -0,0 +1,101 @@
+name: "kimi-vllm-disagg-gb200-1p1d-dep4-dep16"
+
+model:
+  path: "kimi-k2.5-nvfp4"
+  container: "vllm/vllm-openai:v0.18.0-cu130"
+  precision: "fp4"
+
+dynamo:
+  version: 1.0.1
+  install: true
+
+setup_script: vllm-container-deps.sh
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 4
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 16
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_USE_FLASHINFER_MOE_FP4: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+
+  decode_environment:
+    VLLM_USE_FLASHINFER_MOE_FP4: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "nvidia/Kimi-K2.5-NVFP4"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 4
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 3072
+      max-num-seqs: 4096
+      enforce-eager: true
+      compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}'
+      max-num-batched-tokens: 16384
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-chunked-prefill: true
+      attention-backend: "FLASHINFER_MLA"
+      block-size: 64
+      attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}'
+      all2all-backend: "flashinfer_nvlink_one_sided"
+      gpu-memory-utilization: 0.9
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "nvidia/Kimi-K2.5-NVFP4"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 16
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 3072
+      max-num-seqs: 4096
+      max-num-batched-tokens: 10240
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-chunked-prefill: true
+      async-scheduling: true
+      attention-backend: "FLASHINFER_MLA"
+      block-size: 64
+      all2all-backend: "flashinfer_nvlink_one_sided"
+      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      max-cudagraph-capture-size: 512
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "256x512x1024x2048x3072x4096"
+  req_rate: "inf"
diff --git a/recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml b/recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml
new file mode 100644
index 00000000..43167b5f
--- /dev/null
+++ b/recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml
@@ -0,0 +1,98 @@
+name: "kimi-vllm-disagg-gb200-1p4d-dep4-tep4"
+
+model:
+  path: "kimi-k2.5-nvfp4"
+  container: "vllm/vllm-openai:v0.18.0-cu130"
+  precision: "fp4"
+
+dynamo:
+  version: 1.0.1
+  install: true
+
+setup_script: vllm-container-deps.sh
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 4
+  prefill_workers: 1
+  decode_workers: 4
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_USE_FLASHINFER_MOE_FP4: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+
+  decode_environment:
+    VLLM_USE_FLASHINFER_MOE_FP4: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "nvidia/Kimi-K2.5-NVFP4"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 4
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 3072
+      max-num-seqs: 1024
+      enforce-eager: true
+      compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}'
+      max-num-batched-tokens: 16384
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-chunked-prefill: true
+      attention-backend: "FLASHINFER_MLA"
+      block-size: 64
+      attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}'
+      all2all-backend: "flashinfer_nvlink_one_sided"
+      gpu-memory-utilization: 0.9
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "nvidia/Kimi-K2.5-NVFP4"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      max-model-len: 3072
+      max-num-seqs: 1024
+      max-num-batched-tokens: 10240
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-chunked-prefill: true
+      async-scheduling: true
+      attention-backend: "FLASHINFER_MLA"
+      block-size: 64
+      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      max-cudagraph-capture-size: 1024
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "4x8x16x32x64x128"
+  req_rate: "inf"
diff --git a/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml b/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml
new file mode 100644
index 00000000..1ab6ca27
--- /dev/null
+++ b/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml
@@ -0,0 +1,98 @@
+name: "kimi-vllm-disagg-gb200-1p4d-dep4-tep4"
+
+model:
+  path: "kimi-k2.5-nvfp4"
+  container: "vllm/vllm-openai:v0.18.0-cu130"
+  precision: "fp4"
+
+dynamo:
+  version: 1.0.1
+  install: true
+
+setup_script: vllm-container-deps.sh
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 4
+  prefill_workers: 1
+  decode_workers: 4
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_USE_FLASHINFER_MOE_FP4: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+
+  decode_environment:
+    VLLM_USE_FLASHINFER_MOE_FP4: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "nvidia/Kimi-K2.5-NVFP4"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 4
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 10240
+      max-num-seqs: 64
+      enforce-eager: true
+      compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}'
+      max-num-batched-tokens: 16384
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-chunked-prefill: true
+      attention-backend: "FLASHINFER_MLA"
+      block-size: 64
+      attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}'
+      all2all-backend: "flashinfer_nvlink_one_sided"
+      gpu-memory-utilization: 0.9
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "nvidia/Kimi-K2.5-NVFP4"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      max-model-len: 10240
+      max-num-seqs: 16
+      max-num-batched-tokens: 10240
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-chunked-prefill: true
+      async-scheduling: true
+      attention-backend: "FLASHINFER_MLA"
+      block-size: 64
+      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      max-cudagraph-capture-size: 16
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "4x8x16x32x128"
+  req_rate: "inf"
diff --git a/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml b/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml
new file mode 100644
index 00000000..ca4e9813
--- /dev/null
+++ b/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml
@@ -0,0 +1,101 @@
+name: "kimi-vllm-disagg-gb200-3p1d-dep4-dep16"
+
+model:
+  path: "kimi-k2.5-nvfp4"
+  container: "vllm/vllm-openai:v0.18.0-cu130"
+  precision: "fp4"
+
+dynamo:
+  version: 1.0.1
+  install: true
+
+setup_script: vllm-container-deps.sh
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 3
+  decode_nodes: 4
+  prefill_workers: 3
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 16
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_USE_FLASHINFER_MOE_FP4: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+
+  decode_environment:
+    VLLM_USE_FLASHINFER_MOE_FP4: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "nvidia/Kimi-K2.5-NVFP4"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 4
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 10240
+      max-num-seqs: 64
+      enforce-eager: true
+      compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}'
+      max-num-batched-tokens: 16384
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-chunked-prefill: true
+      attention-backend: "FLASHINFER_MLA"
+      block-size: 64
+      attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}'
+      all2all-backend: "flashinfer_nvlink_one_sided"
+      gpu-memory-utilization: 0.9
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "nvidia/Kimi-K2.5-NVFP4"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 16
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 10240
+      max-num-seqs: 256
+      max-num-batched-tokens: 10240
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-chunked-prefill: true
+      async-scheduling: true
+      attention-backend: "FLASHINFER_MLA"
+      block-size: 64
+      all2all-backend: "flashinfer_nvlink_one_sided"
+      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      max-cudagraph-capture-size: 256
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "512x1024"
+  req_rate: "inf"
diff --git a/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml b/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml
new file mode 100644
index 00000000..cd9f94a9
--- /dev/null
+++ b/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml
@@ -0,0 +1,101 @@
+name: "kimi-vllm-disagg-gb200-5p1d-dep4-dep8"
+
+model:
+  path: "kimi-k2.5-nvfp4"
+  container: "vllm/vllm-openai:v0.18.0-cu130"
+  precision: "fp4"
+
+dynamo:
+  version: 1.0.1
+  install: true
+
+setup_script: vllm-container-deps.sh
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 5
+  decode_nodes: 2
+  prefill_workers: 5
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_USE_FLASHINFER_MOE_FP4: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+
+  decode_environment:
+    VLLM_USE_FLASHINFER_MOE_FP4: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "nvidia/Kimi-K2.5-NVFP4"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 4
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 10240
+      max-num-seqs: 64
+      enforce-eager: true
+      compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}'
+      max-num-batched-tokens: 16384
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-chunked-prefill: true
+      attention-backend: "FLASHINFER_MLA"
+      block-size: 64
+      attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}'
+      all2all-backend: "flashinfer_nvlink_one_sided"
+      gpu-memory-utilization: 0.9
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "nvidia/Kimi-K2.5-NVFP4"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 10240
+      max-num-seqs: 512
+      max-num-batched-tokens: 10240
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-chunked-prefill: true
+      async-scheduling: true
+      attention-backend: "FLASHINFER_MLA"
+      block-size: 64
+      all2all-backend: "flashinfer_nvlink_one_sided"
+      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      max-cudagraph-capture-size: 512
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "2048"
+  req_rate: "inf"
diff --git a/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml b/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml
new file mode 100644
index 00000000..47d3d7ee
--- /dev/null
+++ b/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml
@@ -0,0 +1,101 @@
+name: "kimi-vllm-disagg-gb200-6p1d-dep4-dep16"
+
+model:
+  path: "kimi-k2.5-nvfp4"
+  container: "vllm/vllm-openai:v0.18.0-cu130"
+  precision: "fp4"
+
+dynamo:
+  version: 1.0.1
+  install: true
+
+setup_script: vllm-container-deps.sh
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 6
+  decode_nodes: 4
+  prefill_workers: 6
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 16
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_USE_FLASHINFER_MOE_FP4: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+
+  decode_environment:
+    VLLM_USE_FLASHINFER_MOE_FP4: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "nvidia/Kimi-K2.5-NVFP4"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 4
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 10240
+      max-num-seqs: 64
+      enforce-eager: true
+      compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}'
+      max-num-batched-tokens: 16384
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-chunked-prefill: true
+      attention-backend: "FLASHINFER_MLA"
+      block-size: 64
+      attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}'
+      all2all-backend: "flashinfer_nvlink_one_sided"
+      gpu-memory-utilization: 0.9
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "nvidia/Kimi-K2.5-NVFP4"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 16
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 10240
+      max-num-seqs: 512
+      max-num-batched-tokens: 10240
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-chunked-prefill: true
+      async-scheduling: true
+      attention-backend: "FLASHINFER_MLA"
+      block-size: 64
+      all2all-backend: "flashinfer_nvlink_one_sided"
+      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      max-cudagraph-capture-size: 512
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "3072x4096"
+  req_rate: "inf"
diff --git a/src/srtctl/backends/vllm.py b/src/srtctl/backends/vllm.py
index ff20cb40..1acbd50c 100644
--- a/src/srtctl/backends/vllm.py
+++ b/src/srtctl/backends/vllm.py
@@ -132,12 +132,16 @@ def get_process_environment(self, process: Process) -> dict[str, str]:
         vLLM with dynamo requires unique ports for each worker:
         - DYN_VLLM_KV_EVENT_PORT: ZMQ port for KV events publishing
         - VLLM_NIXL_SIDE_CHANNEL_PORT: Port for NIXL side channel transfers
+        - VLLM_NIXL_SIDE_CHANNEL_HOST: Routable IP for NIXL side channel (not 0.0.0.0/localhost)
         """
+        from srtctl.core.slurm import get_hostname_ip
+
         env: dict[str, str] = {}
         if process.kv_events_port is not None:
             env["DYN_VLLM_KV_EVENT_PORT"] = str(process.kv_events_port)
         if process.nixl_port is not None:
             env["VLLM_NIXL_SIDE_CHANNEL_PORT"] = str(process.nixl_port)
+            env["VLLM_NIXL_SIDE_CHANNEL_HOST"] = get_hostname_ip(process.node)
         return env
 
     def get_served_model_name(self, default: str) -> str:
diff --git a/tests/test_configs.py b/tests/test_configs.py
index b1ef1736..86d79cdb 100644
--- a/tests/test_configs.py
+++ b/tests/test_configs.py
@@ -1080,6 +1080,8 @@ def test_standard_tp_mode_still_works(self):
 
     def test_vllm_get_process_environment(self):
         """Test vLLM sets port environment variables from process."""
+        from unittest.mock import patch
+
         from srtctl.backends import VLLMProtocol
         from srtctl.core.topology import Process
 
@@ -1098,10 +1100,12 @@ def test_vllm_get_process_environment(self):
             nixl_port=6550,
         )
 
-        env = backend.get_process_environment(process)
+        with patch("srtctl.core.slurm.get_hostname_ip", return_value="10.0.0.1"):
+            env = backend.get_process_environment(process)
 
         assert env["DYN_VLLM_KV_EVENT_PORT"] == "5550"
         assert env["VLLM_NIXL_SIDE_CHANNEL_PORT"] == "6550"
+        assert env["VLLM_NIXL_SIDE_CHANNEL_HOST"] == "10.0.0.1"
 
     def test_vllm_get_process_environment_none_ports(self):
         """Test vLLM handles None ports gracefully."""

From 94903bdb6352b048305b407d28fea7e0f4ae2f65 Mon Sep 17 00:00:00 2001
From: Yeswanth koti <yeswanthk@nvidia.com>
Date: Fri, 10 Apr 2026 01:54:10 -0400
Subject: [PATCH 03/14] =?UTF-8?q?Add=20Kimi=20K2.5=20disagg=20STP=20and=20?=
 =?UTF-8?q?MTP=20recipes=20for=20GB200=20NVfp4=20(ISL8K=5FOSL1K=E2=80=A6?=
 =?UTF-8?q?=20(#24)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Add Kimi K2.5 disagg STP and MTP recipes for GB200 NVfp4 (ISL8K_OSL1K and ISL1K_OSL1K)

Add optimized disaggregated inference recipes for the Kimi K2.5 model with
NVFP4 precision on GB200 GPUs. Includes both STP and MTP configurations for
ISL8K_OSL1K and ISL1K_OSL1K workloads, covering concurrency points from 5
to 2253, with Eagle speculative decoding for the MTP variants.

* Update Kimi K2.5 recipes: container, model path, concurrency format, and env cleanup

- Update container to tensorrtllm-runtime-1.1.0-dev.2.sqsh
- Point model path to shared /mnt/lustre01/models/kimi-k2.5-nvfp4
- Update Eagle model mount path for MTP configs
- Remove HF_HOME (defaults to ~/.cache/huggingface)
- Fix concurrency separator from space to 'x' for sa-bench compatibility
- Enable multiple frontends for ctx1dep4_gen1dep32_batch64

* Use generic model path and container aliases for cluster portability

Replace cluster-specific paths with generic alias names that are resolved
via srtslurm.yaml model_paths and containers mappings, as per upstream convention.

* Add extra_mount alias resolution and use generic Eagle model path

Add model_paths alias resolution for extra_mount host paths in config.py,
enabling MTP recipes to use the generic name "kimi-k2.5-eagle3" instead of
a cluster-specific path for the Eagle speculative decoding model.

* Use HuggingFace model names and full NVCR container paths

Per review feedback, update model paths to HuggingFace format
(nvidia/Kimi-K2.5-NVFP4) and container to full NVCR registry path
(nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2) so recipes
are portable and work without pre-built sqsh files.

---------

Co-authored-by: nlevin-ui <nlevin@nvidia.com>
---
 ...ctx1dep4_gen1dep16_batch32_eplb0_mtp3.yaml | 136 +++++++++++
 ...ctx1dep4_gen1dep32_batch16_eplb0_mtp3.yaml | 134 +++++++++++
 ...ctx1dep4_gen1dep8_batch512_eplb0_mtp1.yaml | 196 ++++++++++++++++
 ...4_gen4tep8_batch64_allconc_eplb0_mtp3.yaml | 141 ++++++++++++
 ...p4_gen5tep4_batch2_allconc_eplb0_mtp3.yaml | 132 +++++++++++
 ...tx2dep4_gen1dep16_batch128_eplb0_mtp1.yaml | 148 ++++++++++++
 ...ctx2dep4_gen1dep32_batch64_eplb0_mtp1.yaml | 140 +++++++++++
 ...ctx2dep4_gen3dep8_batch256_eplb0_mtp1.yaml | 164 +++++++++++++
 ...ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml | 125 ++++++++++
 ...ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml | 129 +++++++++++
 ..._gen1dep8_batch768_allconc_eplb0_mtp0.yaml | 217 ++++++++++++++++++
 ..._gen4tep8_batch128_allconc_eplb0_mtp0.yaml | 138 +++++++++++
 ...p4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml | 122 ++++++++++
 ...tx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml | 153 ++++++++++++
 ...tx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml | 137 +++++++++++
 .../ctx1dep4_gen2tep8_batch32_eplb0_mtp3.yaml | 137 +++++++++++
 .../ctx1dep4_gen4tep8_batch1_eplb0_mtp3.yaml  | 133 +++++++++++
 ...p4_gen5tep4_batch8_allconc_eplb0_mtp3.yaml | 133 +++++++++++
 .../ctx2dep4_gen1dep16_batch8_eplb0_mtp3.yaml | 133 +++++++++++
 ...ctx5dep4_gen1dep32_batch16_eplb0_mtp3.yaml | 134 +++++++++++
 ..._gen1dep8_batch256_allconc_eplb0_mtp1.yaml | 164 +++++++++++++
 ...ctx8dep4_gen1dep32_batch32_eplb0_mtp3.yaml | 136 +++++++++++
 ...4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml | 126 ++++++++++
 ...p4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml | 123 ++++++++++
 ...4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml | 126 ++++++++++
 ...ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml | 124 ++++++++++
 ...ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml | 126 ++++++++++
 ..._gen1dep8_batch256_allconc_eplb0_mtp0.yaml | 155 +++++++++++++
 ...tx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml | 138 +++++++++++
 src/srtctl/core/config.py                     |  14 ++
 30 files changed, 4114 insertions(+)
 create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch32_eplb0_mtp3.yaml
 create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch16_eplb0_mtp3.yaml
 create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep8_batch512_eplb0_mtp1.yaml
 create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch64_allconc_eplb0_mtp3.yaml
 create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen5tep4_batch2_allconc_eplb0_mtp3.yaml
 create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep16_batch128_eplb0_mtp1.yaml
 create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep32_batch64_eplb0_mtp1.yaml
 create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen3dep8_batch256_eplb0_mtp1.yaml
 create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml
 create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml
 create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml
 create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml
 create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml
 create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml
 create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml
 create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen2tep8_batch32_eplb0_mtp3.yaml
 create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch1_eplb0_mtp3.yaml
 create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp3.yaml
 create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx2dep4_gen1dep16_batch8_eplb0_mtp3.yaml
 create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep32_batch16_eplb0_mtp3.yaml
 create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp1.yaml
 create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx8dep4_gen1dep32_batch32_eplb0_mtp3.yaml
 create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml
 create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml
 create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml
 create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml
 create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml
 create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml
 create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml

diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch32_eplb0_mtp3.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch32_eplb0_mtp3.yaml
new file mode 100644
index 00000000..03462b07
--- /dev/null
+++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch32_eplb0_mtp3.yaml
@@ -0,0 +1,136 @@
+name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep16_batch32_eplb0_mtp3"
+
+# ctx: 1 prefill worker, TP4/EP4
+# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=32
+# MTP (Eagle speculative decoding, max_draft_len=3)
+# concurrency: 666
+
+model:
+  path: "nvidia/Kimi-K2.5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+
+  decode_workers: 1
+  decode_nodes: 4
+  gpus_per_decode: 16
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: TRTLLM
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.4
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: Eagle
+        max_draft_len: 3
+        speculative_model_dir: "/eagle-model"
+
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      trust_remote_code: true
+      max_batch_size: 32
+      max_num_tokens: 128
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.7
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: Eagle
+        max_draft_len: 3
+        speculative_model_dir: "/eagle-model"
+
+extra_mount:
+  - "nvidia/Kimi-K2.5-Thinking-Eagle3:/eagle-model"
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "666"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch16_eplb0_mtp3.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch16_eplb0_mtp3.yaml
new file mode 100644
index 00000000..6a29059c
--- /dev/null
+++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch16_eplb0_mtp3.yaml
@@ -0,0 +1,134 @@
+name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep32_batch16_eplb0_mtp3"
+
+# ctx: 1 prefill worker, TP4/EP4
+# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=16
+# MTP (Eagle speculative decoding, max_draft_len=3)
+# concurrency: 666
+
+model:
+  path: "nvidia/Kimi-K2.5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+
+  decode_workers: 1
+  decode_nodes: 8
+  gpus_per_decode: 32
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: TRTLLM
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.4
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: Eagle
+        max_draft_len: 3
+        speculative_model_dir: "/eagle-model"
+
+    decode:
+      tensor_parallel_size: 32
+      moe_expert_parallel_size: 32
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      trust_remote_code: true
+      max_batch_size: 16
+      max_num_tokens: 64
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: Eagle
+        max_draft_len: 3
+        speculative_model_dir: "/eagle-model"
+
+extra_mount:
+  - "nvidia/Kimi-K2.5-Thinking-Eagle3:/eagle-model"
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "666"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep8_batch512_eplb0_mtp1.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep8_batch512_eplb0_mtp1.yaml
new file mode 100644
index 00000000..739bd487
--- /dev/null
+++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep8_batch512_eplb0_mtp1.yaml
@@ -0,0 +1,196 @@
+name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep8_batch512_eplb0_mtp1"
+
+# ctx: 1 prefill worker, TP4/EP4
+# gen: 1 decode worker, TP8/EP8, enable_attention_dp=true, max_batch=512
+# MTP (Eagle speculative decoding, max_draft_len=1)
+# concurrency: 4301
+
+model:
+  path: "nvidia/Kimi-K2.5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+
+  decode_workers: 1
+  decode_nodes: 2
+  gpus_per_decode: 8
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: TRTLLM
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.4
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: Eagle
+        max_draft_len: 1
+        speculative_model_dir: "/eagle-model"
+
+    decode:
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      trust_remote_code: true
+      max_batch_size: 512
+      max_num_tokens: 1024
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+          - 136
+          - 144
+          - 152
+          - 160
+          - 168
+          - 176
+          - 184
+          - 192
+          - 200
+          - 208
+          - 216
+          - 224
+          - 232
+          - 240
+          - 248
+          - 256
+          - 264
+          - 272
+          - 280
+          - 288
+          - 296
+          - 304
+          - 312
+          - 320
+          - 328
+          - 336
+          - 344
+          - 352
+          - 360
+          - 368
+          - 376
+          - 384
+          - 392
+          - 400
+          - 408
+          - 416
+          - 424
+          - 432
+          - 440
+          - 448
+          - 456
+          - 464
+          - 472
+          - 480
+          - 488
+          - 496
+          - 504
+          - 512
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: Eagle
+        max_draft_len: 1
+        speculative_model_dir: "/eagle-model"
+
+extra_mount:
+  - "nvidia/Kimi-K2.5-Thinking-Eagle3:/eagle-model"
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "4301"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch64_allconc_eplb0_mtp3.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch64_allconc_eplb0_mtp3.yaml
new file mode 100644
index 00000000..a768bec4
--- /dev/null
+++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch64_allconc_eplb0_mtp3.yaml
@@ -0,0 +1,141 @@
+name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx1dep4_gen4tep8_batch64_allconc_eplb0_mtp3"
+
+# ctx: 1 prefill worker, TP4/EP4
+# gen: 4 decode workers, TP8/EP8, allreduce_strategy=MNNVL, max_batch=64
+# MTP (Eagle speculative decoding, max_draft_len=3)
+# Covers all gen4tep8 concurrencies: 8, 48, 92, 192, 336
+
+model:
+  path: "nvidia/Kimi-K2.5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+
+  decode_workers: 4
+  decode_nodes: 8
+  gpus_per_decode: 8
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: TRTLLM
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.4
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: Eagle
+        max_draft_len: 3
+        speculative_model_dir: "/eagle-model"
+
+    decode:
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      allreduce_strategy: MNNVL
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      max_batch_size: 64
+      max_num_tokens: 256
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: Eagle
+        max_draft_len: 3
+        speculative_model_dir: "/eagle-model"
+
+extra_mount:
+  - "nvidia/Kimi-K2.5-Thinking-Eagle3:/eagle-model"
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "8x48x92x192x336"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen5tep4_batch2_allconc_eplb0_mtp3.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen5tep4_batch2_allconc_eplb0_mtp3.yaml
new file mode 100644
index 00000000..c2e24b41
--- /dev/null
+++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen5tep4_batch2_allconc_eplb0_mtp3.yaml
@@ -0,0 +1,132 @@
+name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx1dep4_gen5tep4_batch2_allconc_eplb0_mtp3"
+
+# ctx: 1 prefill worker, TP4/EP4
+# gen: 5 decode workers, TP4/EP4, max_batch=2
+# MTP (Eagle speculative decoding, max_draft_len=3)
+# Covers all gen5tep4 concurrencies: 10, 15
+
+model:
+  path: "nvidia/Kimi-K2.5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+
+  decode_workers: 5
+  decode_nodes: 5
+  gpus_per_decode: 4
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: TRTLLM
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.4
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: Eagle
+        max_draft_len: 3
+        speculative_model_dir: "/eagle-model"
+
+    decode:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      max_batch_size: 2
+      max_num_tokens: 8
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.85
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: Eagle
+        max_draft_len: 3
+        speculative_model_dir: "/eagle-model"
+
+extra_mount:
+  - "nvidia/Kimi-K2.5-Thinking-Eagle3:/eagle-model"
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "10x15"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep16_batch128_eplb0_mtp1.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep16_batch128_eplb0_mtp1.yaml
new file mode 100644
index 00000000..68d7dd06
--- /dev/null
+++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep16_batch128_eplb0_mtp1.yaml
@@ -0,0 +1,148 @@
+name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx2dep4_gen1dep16_batch128_eplb0_mtp1"
+
+# ctx: 2 prefill workers, TP4/EP4
+# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=128
+# MTP (Eagle speculative decoding, max_draft_len=1)
+# concurrency: 2253
+
+model:
+  path: "nvidia/Kimi-K2.5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 2
+  prefill_workers: 2
+  gpus_per_prefill: 4
+
+  decode_workers: 1
+  decode_nodes: 4
+  gpus_per_decode: 16
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: TRTLLM
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.4
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: Eagle
+        max_draft_len: 1
+        speculative_model_dir: "/eagle-model"
+
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      trust_remote_code: true
+      max_batch_size: 128
+      max_num_tokens: 256
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.7
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: Eagle
+        max_draft_len: 1
+        speculative_model_dir: "/eagle-model"
+
+extra_mount:
+  - "nvidia/Kimi-K2.5-Thinking-Eagle3:/eagle-model"
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "2253"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep32_batch64_eplb0_mtp1.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep32_batch64_eplb0_mtp1.yaml
new file mode 100644
index 00000000..1cb17478
--- /dev/null
+++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep32_batch64_eplb0_mtp1.yaml
@@ -0,0 +1,140 @@
+name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx2dep4_gen1dep32_batch64_eplb0_mtp1"
+
+# ctx: 2 prefill workers, TP4/EP4
+# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=64
+# MTP (Eagle speculative decoding, max_draft_len=1)
+# concurrency: 2253
+
+model:
+  path: "nvidia/Kimi-K2.5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 2
+  prefill_workers: 2
+  gpus_per_prefill: 4
+
+  decode_workers: 1
+  decode_nodes: 8
+  gpus_per_decode: 32
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: TRTLLM
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.4
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: Eagle
+        max_draft_len: 1
+        speculative_model_dir: "/eagle-model"
+
+    decode:
+      tensor_parallel_size: 32
+      moe_expert_parallel_size: 32
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      trust_remote_code: true
+      max_batch_size: 64
+      max_num_tokens: 128
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: Eagle
+        max_draft_len: 1
+        speculative_model_dir: "/eagle-model"
+
+extra_mount:
+  - "nvidia/Kimi-K2.5-Thinking-Eagle3:/eagle-model"
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "2253"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen3dep8_batch256_eplb0_mtp1.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen3dep8_batch256_eplb0_mtp1.yaml
new file mode 100644
index 00000000..eb43aab7
--- /dev/null
+++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen3dep8_batch256_eplb0_mtp1.yaml
@@ -0,0 +1,164 @@
+name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx2dep4_gen3dep8_batch256_eplb0_mtp1"
+
+# ctx: 2 prefill workers, TP4/EP4
+# gen: 3 decode workers, TP8/EP8, enable_attention_dp=true, max_batch=256
+# MTP (Eagle speculative decoding, max_draft_len=1)
+# concurrency: 6759
+
+model:
+  path: "nvidia/Kimi-K2.5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 2
+  prefill_workers: 2
+  gpus_per_prefill: 4
+
+  decode_workers: 3
+  decode_nodes: 6
+  gpus_per_decode: 8
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: TRTLLM
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.4
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: Eagle
+        max_draft_len: 1
+        speculative_model_dir: "/eagle-model"
+
+    decode:
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      trust_remote_code: true
+      max_batch_size: 256
+      max_num_tokens: 512
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+          - 136
+          - 144
+          - 152
+          - 160
+          - 168
+          - 176
+          - 184
+          - 192
+          - 200
+          - 208
+          - 216
+          - 224
+          - 232
+          - 240
+          - 248
+          - 256
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: Eagle
+        max_draft_len: 1
+        speculative_model_dir: "/eagle-model"
+
+extra_mount:
+  - "nvidia/Kimi-K2.5-Thinking-Eagle3:/eagle-model"
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "6759"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml
new file mode 100644
index 00000000..ce3eff43
--- /dev/null
+++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml
@@ -0,0 +1,125 @@
+name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep16_batch32_eplb0_mtp0"
+
+# ctx: 1 prefill worker, TP4/EP4
+# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=32
+# STP (no speculative decoding)
+# concurrency: 666
+
+model:
+  path: "nvidia/Kimi-K2.5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+
+  decode_workers: 1
+  decode_nodes: 4
+  gpus_per_decode: 16
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: TRTLLM
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.4
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      max_batch_size: 32
+      max_num_tokens: 32
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.75
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "666"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml
new file mode 100644
index 00000000..105b84bf
--- /dev/null
+++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml
@@ -0,0 +1,129 @@
+name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep32_batch64_eplb0_mtp0"
+
+# ctx: 1 prefill worker, TP4/EP4
+# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=64
+# STP (no speculative decoding)
+# concurrency: 2253
+
+model:
+  path: "nvidia/Kimi-K2.5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+
+  decode_workers: 1
+  decode_nodes: 8
+  gpus_per_decode: 32
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: TRTLLM
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.4
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 32
+      moe_expert_parallel_size: 32
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      max_batch_size: 64
+      max_num_tokens: 64
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.7
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "2253"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: true
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml
new file mode 100644
index 00000000..9fb194dd
--- /dev/null
+++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml
@@ -0,0 +1,217 @@
+name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0"
+
+# ctx: 1 prefill worker, TP4/EP4
+# gen: 1 decode worker, TP8/EP8, enable_attention_dp=true, max_batch=768
+# STP (no speculative decoding)
+# Covers all dep8 concurrencies: 4301, 6452
+
+model:
+  path: "nvidia/Kimi-K2.5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+
+  decode_workers: 1
+  decode_nodes: 2
+  gpus_per_decode: 8
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: TRTLLM
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.4
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      max_batch_size: 768
+      max_num_tokens: 768
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+          - 136
+          - 144
+          - 152
+          - 160
+          - 168
+          - 176
+          - 184
+          - 192
+          - 200
+          - 208
+          - 216
+          - 224
+          - 232
+          - 240
+          - 248
+          - 256
+          - 264
+          - 272
+          - 280
+          - 288
+          - 296
+          - 304
+          - 312
+          - 320
+          - 328
+          - 336
+          - 344
+          - 352
+          - 360
+          - 368
+          - 376
+          - 384
+          - 392
+          - 400
+          - 408
+          - 416
+          - 424
+          - 432
+          - 440
+          - 448
+          - 456
+          - 464
+          - 472
+          - 480
+          - 488
+          - 496
+          - 504
+          - 512
+          - 520
+          - 528
+          - 536
+          - 544
+          - 552
+          - 560
+          - 568
+          - 576
+          - 584
+          - 592
+          - 600
+          - 608
+          - 616
+          - 624
+          - 632
+          - 640
+          - 648
+          - 656
+          - 664
+          - 672
+          - 680
+          - 688
+          - 696
+          - 704
+          - 712
+          - 720
+          - 728
+          - 736
+          - 744
+          - 752
+          - 760
+          - 768
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "4301x6452"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml
new file mode 100644
index 00000000..5639da41
--- /dev/null
+++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml
@@ -0,0 +1,138 @@
+name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0"
+
+# ctx: 1 prefill worker, TP4/EP4
+# gen: 4 decode workers, TP8/EP8, enable_attention_dp=false, allreduce_strategy=MNNVL, max_batch=128
+# STP (no speculative decoding)
+# Covers all gen4tep8 concurrencies: 4, 192, 360, 668
+
+model:
+  path: "nvidia/Kimi-K2.5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+
+  decode_workers: 4
+  decode_nodes: 8
+  gpus_per_decode: 8
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: TRTLLM
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.4
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      allreduce_strategy: MNNVL
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      max_batch_size: 128
+      max_num_tokens: 128
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "4x192x360x668"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml
new file mode 100644
index 00000000..f9496feb
--- /dev/null
+++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml
@@ -0,0 +1,122 @@
+name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0"
+
+# ctx: 1 prefill worker, TP4/EP4
+# gen: 5 decode workers, TP4/EP4, enable_attention_dp=false, max_batch=8
+# STP (no speculative decoding)
+# Covers all gen5tep4 concurrencies: 5, 15, 30, 55
+
+model:
+  path: "nvidia/Kimi-K2.5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+
+  decode_workers: 5
+  decode_nodes: 5
+  gpus_per_decode: 4
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: TRTLLM
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.4
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      max_batch_size: 8
+      max_num_tokens: 8
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "5x15x30x55"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml
new file mode 100644
index 00000000..71b016c4
--- /dev/null
+++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml
@@ -0,0 +1,153 @@
+name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx2dep4_gen1dep16_batch256_eplb0_mtp0"
+
+# ctx: 2 prefill workers, TP4/EP4
+# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=256
+# STP (no speculative decoding)
+# concurrency: 4301
+
+model:
+  path: "nvidia/Kimi-K2.5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 2
+  prefill_workers: 2
+  gpus_per_prefill: 4
+
+  decode_workers: 1
+  decode_nodes: 4
+  gpus_per_decode: 16
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: TRTLLM
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.4
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      max_batch_size: 256
+      max_num_tokens: 256
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+          - 136
+          - 144
+          - 152
+          - 160
+          - 168
+          - 176
+          - 184
+          - 192
+          - 200
+          - 208
+          - 216
+          - 224
+          - 232
+          - 240
+          - 248
+          - 256
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.75
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "4301"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml
new file mode 100644
index 00000000..52b75bb4
--- /dev/null
+++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml
@@ -0,0 +1,137 @@
+name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx2dep4_gen1dep32_batch128_eplb0_mtp0"
+
+# ctx: 2 prefill workers, TP4/EP4
+# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=128
+# STP (no speculative decoding)
+# concurrency: 4301
+
+model:
+  path: "nvidia/Kimi-K2.5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 2
+  prefill_workers: 2
+  gpus_per_prefill: 4
+
+  decode_workers: 1
+  decode_nodes: 8
+  gpus_per_decode: 32
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: TRTLLM
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.4
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 32
+      moe_expert_parallel_size: 32
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      max_batch_size: 128
+      max_num_tokens: 128
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.7
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "4301"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen2tep8_batch32_eplb0_mtp3.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen2tep8_batch32_eplb0_mtp3.yaml
new file mode 100644
index 00000000..bb3f8d1e
--- /dev/null
+++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen2tep8_batch32_eplb0_mtp3.yaml
@@ -0,0 +1,137 @@
+name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx1dep4_gen2tep8_batch32_eplb0_mtp3"
+
+# ctx: 1 prefill worker, TP4/EP4
+# gen: 2 decode workers, TP8/EP8, allreduce_strategy=MNNVL, max_batch=32
+# Speculative decoding (MTP recipe set): decoding_type=Eagle, max_draft_len=3, draft model at /eagle-model
+# concurrency: 90
+
+model:
+  path: "nvidia/Kimi-K2.5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+
+  decode_workers: 2
+  decode_nodes: 4
+  gpus_per_decode: 8
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      max_batch_size: 2
+      max_num_tokens: 16384
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: TRTLLM
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.4
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: Eagle
+        max_draft_len: 3
+        speculative_model_dir: "/eagle-model"
+
+    decode:
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      allreduce_strategy: MNNVL
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      max_batch_size: 32
+      max_num_tokens: 128
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: Eagle
+        max_draft_len: 3
+        speculative_model_dir: "/eagle-model"
+
+extra_mount:
+  - "nvidia/Kimi-K2.5-Thinking-Eagle3:/eagle-model"
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "90"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch1_eplb0_mtp3.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch1_eplb0_mtp3.yaml
new file mode 100644
index 00000000..8b7f02d6
--- /dev/null
+++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch1_eplb0_mtp3.yaml
@@ -0,0 +1,133 @@
+name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx1dep4_gen4tep8_batch1_eplb0_mtp3"
+
+# ctx: 1 prefill worker, TP4/EP4
+# gen: 4 decode workers, TP8/EP8, allreduce_strategy=MNNVL, max_batch=1
+# Speculative decoding (MTP recipe set): decoding_type=Eagle, max_draft_len=3, draft model at /eagle-model
+# concurrency: 8
+
+model:
+  path: "nvidia/Kimi-K2.5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+
+  decode_workers: 4
+  decode_nodes: 8
+  gpus_per_decode: 8
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      max_batch_size: 2
+      max_num_tokens: 16384
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: TRTLLM
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.4
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: Eagle
+        max_draft_len: 3
+        speculative_model_dir: "/eagle-model"
+
+    decode:
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      allreduce_strategy: MNNVL
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      max_batch_size: 1
+      max_num_tokens: 4
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: Eagle
+        max_draft_len: 3
+        speculative_model_dir: "/eagle-model"
+
+extra_mount:
+  - "nvidia/Kimi-K2.5-Thinking-Eagle3:/eagle-model"
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "8"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp3.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp3.yaml
new file mode 100644
index 00000000..1883e739
--- /dev/null
+++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp3.yaml
@@ -0,0 +1,133 @@
+name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp3"
+
+# ctx: 1 prefill worker, TP4/EP4
+# gen: 5 decode workers, TP4/EP4, max_batch=8
+# Speculative decoding (MTP recipe set): decoding_type=Eagle, max_draft_len=3, draft model at /eagle-model
+# Covers all gen5tep4 concurrencies: 10, 15, 60
+
+model:
+  path: "nvidia/Kimi-K2.5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+
+  decode_workers: 5
+  decode_nodes: 5
+  gpus_per_decode: 4
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      max_batch_size: 2
+      max_num_tokens: 16384
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: TRTLLM
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.4
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: Eagle
+        max_draft_len: 3
+        speculative_model_dir: "/eagle-model"
+
+    decode:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      max_batch_size: 8
+      max_num_tokens: 32
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.85
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: Eagle
+        max_draft_len: 3
+        speculative_model_dir: "/eagle-model"
+
+extra_mount:
+  - "nvidia/Kimi-K2.5-Thinking-Eagle3:/eagle-model"
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "10x15x60"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx2dep4_gen1dep16_batch8_eplb0_mtp3.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx2dep4_gen1dep16_batch8_eplb0_mtp3.yaml
new file mode 100644
index 00000000..5aced422
--- /dev/null
+++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx2dep4_gen1dep16_batch8_eplb0_mtp3.yaml
@@ -0,0 +1,133 @@
+name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx2dep4_gen1dep16_batch8_eplb0_mtp3"
+
+# ctx: 2 prefill workers, TP4/EP4
+# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=8
+# Speculative decoding (MTP recipe set): decoding_type=Eagle, max_draft_len=3, draft model at /eagle-model
+# concurrency: 180
+
+model:
+  path: "nvidia/Kimi-K2.5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 2
+  prefill_workers: 2
+  gpus_per_prefill: 4
+
+  decode_workers: 1
+  decode_nodes: 4
+  gpus_per_decode: 16
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      max_batch_size: 2
+      max_num_tokens: 16384
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: TRTLLM
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.4
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: Eagle
+        max_draft_len: 3
+        speculative_model_dir: "/eagle-model"
+
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      trust_remote_code: true
+      max_batch_size: 8
+      max_num_tokens: 32
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: Eagle
+        max_draft_len: 3
+        speculative_model_dir: "/eagle-model"
+
+extra_mount:
+  - "nvidia/Kimi-K2.5-Thinking-Eagle3:/eagle-model"
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "180"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep32_batch16_eplb0_mtp3.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep32_batch16_eplb0_mtp3.yaml
new file mode 100644
index 00000000..764f2d46
--- /dev/null
+++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep32_batch16_eplb0_mtp3.yaml
@@ -0,0 +1,134 @@
+name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx5dep4_gen1dep32_batch16_eplb0_mtp3"
+
+# ctx: 5 prefill workers, TP4/EP4
+# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=16
+# Speculative decoding (MTP recipe set): decoding_type=Eagle, max_draft_len=3, draft model at /eagle-model
+# concurrency: 666
+
+model:
+  path: "nvidia/Kimi-K2.5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 5
+  prefill_workers: 5
+  gpus_per_prefill: 4
+
+  decode_workers: 1
+  decode_nodes: 8
+  gpus_per_decode: 32
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      max_batch_size: 2
+      max_num_tokens: 16384
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: TRTLLM
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.4
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: Eagle
+        max_draft_len: 3
+        speculative_model_dir: "/eagle-model"
+
+    decode:
+      tensor_parallel_size: 32
+      moe_expert_parallel_size: 32
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      trust_remote_code: true
+      max_batch_size: 16
+      max_num_tokens: 64
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.75
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: Eagle
+        max_draft_len: 3
+        speculative_model_dir: "/eagle-model"
+
+extra_mount:
+  - "nvidia/Kimi-K2.5-Thinking-Eagle3:/eagle-model"
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "666"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp1.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp1.yaml
new file mode 100644
index 00000000..31308fe6
--- /dev/null
+++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp1.yaml
@@ -0,0 +1,164 @@
+name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp1"
+
+# ctx: 5 prefill workers, TP4/EP4
+# gen: 1 decode worker, TP8/EP8, enable_attention_dp=true, max_batch=256
+# Speculative decoding (MTP recipe set): decoding_type=Eagle, max_draft_len=1, draft model at /eagle-model
+# Covers all dep8 mtp1 concurrencies: 1229, 2253
+
+model:
+  path: "nvidia/Kimi-K2.5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 5
+  prefill_workers: 5
+  gpus_per_prefill: 4
+
+  decode_workers: 1
+  decode_nodes: 2
+  gpus_per_decode: 8
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      max_batch_size: 2
+      max_num_tokens: 16384
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: TRTLLM
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.4
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: Eagle
+        max_draft_len: 1
+        speculative_model_dir: "/eagle-model"
+
+    decode:
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      trust_remote_code: true
+      max_batch_size: 256
+      max_num_tokens: 512
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+          - 136
+          - 144
+          - 152
+          - 160
+          - 168
+          - 176
+          - 184
+          - 192
+          - 200
+          - 208
+          - 216
+          - 224
+          - 232
+          - 240
+          - 248
+          - 256
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: Eagle
+        max_draft_len: 1
+        speculative_model_dir: "/eagle-model"
+
+extra_mount:
+  - "nvidia/Kimi-K2.5-Thinking-Eagle3:/eagle-model"
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "1229x2253"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx8dep4_gen1dep32_batch32_eplb0_mtp3.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx8dep4_gen1dep32_batch32_eplb0_mtp3.yaml
new file mode 100644
index 00000000..9bd03c05
--- /dev/null
+++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx8dep4_gen1dep32_batch32_eplb0_mtp3.yaml
@@ -0,0 +1,136 @@
+name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx8dep4_gen1dep32_batch32_eplb0_mtp3"
+
+# ctx: 8 prefill workers, TP4/EP4
+# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=32
+# Speculative decoding (MTP recipe set): decoding_type=Eagle, max_draft_len=3, draft model at /eagle-model
+# concurrency: 1229
+
+model:
+  path: "nvidia/Kimi-K2.5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 8
+  prefill_workers: 8
+  gpus_per_prefill: 4
+
+  decode_workers: 1
+  decode_nodes: 8
+  gpus_per_decode: 32
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      max_batch_size: 2
+      max_num_tokens: 16384
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: TRTLLM
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.4
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: Eagle
+        max_draft_len: 3
+        speculative_model_dir: "/eagle-model"
+
+    decode:
+      tensor_parallel_size: 32
+      moe_expert_parallel_size: 32
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      trust_remote_code: true
+      max_batch_size: 32
+      max_num_tokens: 128
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.75
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: Eagle
+        max_draft_len: 3
+        speculative_model_dir: "/eagle-model"
+
+extra_mount:
+  - "nvidia/Kimi-K2.5-Thinking-Eagle3:/eagle-model"
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "1229"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml
new file mode 100644
index 00000000..8c1f0aa8
--- /dev/null
+++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml
@@ -0,0 +1,126 @@
+name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0"
+
+# ctx: 1 prefill worker, TP4/EP4
+# gen: 4 decode workers, TP4/EP4, max_batch=32
+# Single concurrency point: 156
+
+model:
+  path: "nvidia/Kimi-K2.5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  # Prefill: 1 worker x TP4 = 4 GPUs = 1 node
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+
+  # Decode: 4 workers x TP4 = 16 GPUs = 4 nodes
+  decode_workers: 4
+  decode_nodes: 4
+  gpus_per_decode: 4
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      max_batch_size: 2
+      max_num_tokens: 16384
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: TRTLLM
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.4
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      max_batch_size: 32
+      max_num_tokens: 32
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "156"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml
new file mode 100644
index 00000000..d4c5086b
--- /dev/null
+++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml
@@ -0,0 +1,123 @@
+name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0"
+
+# ctx: 1 prefill worker, TP4/EP4
+# gen: 4 decode workers, TP8/EP8, allreduce_strategy=MNNVL, max_batch=1
+# Single concurrency point: 4
+
+model:
+  path: "nvidia/Kimi-K2.5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  # Prefill: 1 worker x TP4 = 4 GPUs = 1 node
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+
+  # Decode: 4 workers x TP8 = 32 GPUs = 8 nodes
+  decode_workers: 4
+  decode_nodes: 8
+  gpus_per_decode: 8
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      max_batch_size: 2
+      max_num_tokens: 16384
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: TRTLLM
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.4
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      allreduce_strategy: MNNVL
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      max_batch_size: 1
+      max_num_tokens: 1
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "4"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml
new file mode 100644
index 00000000..8f6ea063
--- /dev/null
+++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml
@@ -0,0 +1,126 @@
+name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0"
+
+# ctx: 1 prefill worker, TP4/EP4
+# gen: 5 decode workers, TP4/EP4, max_batch=16
+# Covers all concurrencies: 5, 15, 30, 60, 105
+
+model:
+  path: "nvidia/Kimi-K2.5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  # Prefill: 1 worker x TP4 = 4 GPUs = 1 node
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+
+  # Decode: 5 workers x TP4 = 20 GPUs = 5 nodes
+  decode_workers: 5
+  decode_nodes: 5
+  gpus_per_decode: 4
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      max_batch_size: 2
+      max_num_tokens: 16384
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: TRTLLM
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.4
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      # max_batch_size=16 is used for all concurrencies: 5, 15, 30, 60, 105
+      # CUDA graphs are captured for the batch_sizes listed below (1-16); enable_padding rounds other batches up to the nearest captured size
+      max_batch_size: 16
+      max_num_tokens: 16
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "5x15x30x60x105"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml
new file mode 100644
index 00000000..4bfaa0e2
--- /dev/null
+++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml
@@ -0,0 +1,124 @@
+name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx2dep4_gen1dep16_batch16_eplb0_mtp0"
+
+# ctx: 2 prefill workers, TP4/EP4
+# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=16
+# concurrency: 333
+
+model:
+  path: "nvidia/Kimi-K2.5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  # Prefill: 2 workers x TP4 = 8 GPUs = 2 nodes
+  prefill_nodes: 2
+  prefill_workers: 2
+  gpus_per_prefill: 4
+
+  # Decode: 1 worker x TP16 = 16 GPUs = 4 nodes
+  decode_workers: 1
+  decode_nodes: 4
+  gpus_per_decode: 16
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      max_batch_size: 2
+      max_num_tokens: 16384
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: TRTLLM
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.4
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      max_batch_size: 16
+      max_num_tokens: 16
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "333"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml
new file mode 100644
index 00000000..d7d51627
--- /dev/null
+++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml
@@ -0,0 +1,126 @@
+name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx3dep4_gen1dep16_batch32_eplb0_mtp0"
+
+# ctx: 3 prefill workers, TP4/EP4
+# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=32
+# concurrency: 615
+
+model:
+  path: "nvidia/Kimi-K2.5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  # Prefill: 3 workers x TP4 = 12 GPUs = 3 nodes
+  prefill_nodes: 3
+  prefill_workers: 3
+  gpus_per_prefill: 4
+
+  # Decode: 1 worker x TP16 = 16 GPUs = 4 nodes
+  decode_workers: 1
+  decode_nodes: 4
+  gpus_per_decode: 16
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      max_batch_size: 2
+      max_num_tokens: 16384
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: TRTLLM
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.4
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      max_batch_size: 32
+      max_num_tokens: 32
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "615"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml
new file mode 100644
index 00000000..e8df1179
--- /dev/null
+++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml
@@ -0,0 +1,155 @@
+name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0"
+
+# ctx: 5 prefill workers, TP4/EP4
+# gen: 1 decode worker, TP8/EP8, enable_attention_dp=true, max_batch=256
+# Single concurrency point: 2151
+
+model:
+  path: "nvidia/Kimi-K2.5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  # Prefill: 5 workers x TP4 = 20 GPUs = 5 nodes
+  prefill_nodes: 5
+  prefill_workers: 5
+  gpus_per_prefill: 4
+
+  # Decode: 1 worker x TP8 = 8 GPUs = 2 nodes
+  decode_workers: 1
+  decode_nodes: 2
+  gpus_per_decode: 8
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      max_batch_size: 2
+      max_num_tokens: 16384
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: TRTLLM
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.4
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      # max_batch_size=256; CUDA graphs are captured for the batch_sizes listed below (up to 256), and enable_padding rounds other batches up to the nearest captured size
+      max_batch_size: 256
+      max_num_tokens: 256
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+          - 136
+          - 144
+          - 152
+          - 160
+          - 168
+          - 176
+          - 184
+          - 192
+          - 200
+          - 208
+          - 216
+          - 224
+          - 232
+          - 240
+          - 248
+          - 256
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "2151"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml
new file mode 100644
index 00000000..db177892
--- /dev/null
+++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml
@@ -0,0 +1,138 @@
+name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx7dep4_gen1dep16_batch128_eplb0_mtp0"
+
+# ctx: 7 prefill workers, TP4/EP4
+# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=128
+# concurrency: 2253
+
+model:
+  path: "nvidia/Kimi-K2.5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  # Prefill: 7 workers x TP4 = 28 GPUs = 7 nodes
+  prefill_nodes: 7
+  prefill_workers: 7
+  gpus_per_prefill: 4
+
+  # Decode: 1 worker x TP16 = 16 GPUs = 4 nodes
+  decode_workers: 1
+  decode_nodes: 4
+  gpus_per_decode: 16
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      max_batch_size: 2
+      max_num_tokens: 16384
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: TRTLLM
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.4
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      max_batch_size: 128
+      max_num_tokens: 128
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "2253"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/src/srtctl/core/config.py b/src/srtctl/core/config.py
index 8cea4e17..f30fc7fc 100644
--- a/src/srtctl/core/config.py
+++ b/src/srtctl/core/config.py
@@ -141,6 +141,20 @@ def resolve_config_with_defaults(user_config: dict[str, Any], cluster_config: di
         config["reporting"] = cluster_config["reporting"]
         logger.debug("Applied cluster reporting config")
 
+    # Resolve extra_mount host path aliases through model_paths
+    extra_mounts = config.get("extra_mount", [])
+    if model_paths and extra_mounts:
+        resolved_mounts = []
+        for mount_spec in extra_mounts:
+            host_path, container_path = mount_spec.split(":", 1)
+            if host_path in model_paths:
+                resolved_host = model_paths[host_path]
+                resolved_mounts.append(f"{resolved_host}:{container_path}")
+                logger.debug(f"Resolved extra_mount alias '{host_path}' -> '{resolved_host}'")
+            else:
+                resolved_mounts.append(mount_spec)
+        config["extra_mount"] = resolved_mounts
+
     # Resolve frontend nginx_container alias
     frontend = config.get("frontend", {})
     nginx_container = frontend.get("nginx_container", "")

From b0f5b83f1949bc5d11059704ae84e263d89424d1 Mon Sep 17 00:00:00 2001
From: Camilo Moreno <camilom@nvidia.com>
Date: Mon, 13 Apr 2026 11:20:29 -0700
Subject: [PATCH 04/14] Add Minimax M2.5 NVFP4 agg B200 single-node configs
 (#36)

* recipes for minimax m2.5 fp4 b200 agg vllm

* commit for signature
---
 recipes/vllm/minimax-m2.5/b200-fp4/1k1k.yaml | 103 +++++++++++++++++++
 recipes/vllm/minimax-m2.5/b200-fp4/8k1k.yaml |  88 ++++++++++++++++
 2 files changed, 191 insertions(+)
 create mode 100644 recipes/vllm/minimax-m2.5/b200-fp4/1k1k.yaml
 create mode 100644 recipes/vllm/minimax-m2.5/b200-fp4/8k1k.yaml

diff --git a/recipes/vllm/minimax-m2.5/b200-fp4/1k1k.yaml b/recipes/vllm/minimax-m2.5/b200-fp4/1k1k.yaml
new file mode 100644
index 00000000..daef7b0d
--- /dev/null
+++ b/recipes/vllm/minimax-m2.5/b200-fp4/1k1k.yaml
@@ -0,0 +1,103 @@
+# MiniMax-M2.5 NVFP4 B200 — 1K/1K ISL/OSL
+# Aggregated vLLM, single-node
+# Requires github.com/NVIDIA/srt-slurm, branch sa-submission-q2-2026
+# Usage examples:
+# srtctl apply -f 1k1k.yaml     # run all variants
+# srtctl apply -f 1k1k.yaml:zip_override_lowlat    # full lowlat sweep
+# srtctl apply -f 1k1k.yaml:zip_override_lowlat[2] # lowlat, tep2 variant only
+# srtctl apply -f 1k1k.yaml:zip_override_hightput   # full high tput sweep
+# srtctl dry-run -f 1k1k.yaml   # preview the variants
+
+base:
+  name: "minimax-m2.5-nvfp4-b200-1k1k"
+
+  model:
+    path: "minimax_m2.5_fp4"
+    container: "vllm/vllm-openai:v0.19.0-cu130"
+    precision: "fp4"
+
+  resources:
+    gpu_type: "b200"
+    gpus_per_node: 8
+    agg_nodes: 1
+    agg_workers: 1
+    gpus_per_agg: 1
+
+  frontend:
+    type: dynamo
+    enable_multiple_frontends: false
+
+  dynamo:
+    install: true
+    top_of_tree: true # currently need ToT for vllm 0.19.0
+
+  setup_script: vllm-container-deps.sh
+
+  backend:
+    type: vllm
+
+    aggregated_environment:
+      DYN_HEALTH_CHECK_ENABLED: "false"
+      PYTHONUNBUFFERED: "1"
+
+    vllm_config:
+      aggregated:
+        tensor-parallel-size: 1
+        gpu-memory-utilization: 0.90
+        max-model-len: 2248
+        max-num-batched-tokens: 2048
+        kv-cache-dtype: fp8
+        max-cudagraph-capture-size: 2048
+        stream-interval: 20
+        no-enable-prefix-caching: true
+        trust-remote-code: true
+
+  benchmark:
+    type: "sa-bench"
+    isl: 1024
+    osl: 1024
+    req_rate: "inf"
+
+
+zip_override_lowlat:
+  name:
+    - "minimax-m2.5-nvfp4-b200-1k1k-lowlat-tp1"
+    - "minimax-m2.5-nvfp4-b200-1k1k-lowlat-tp2"
+    - "minimax-m2.5-nvfp4-b200-1k1k-lowlat-tep2"
+  resources:
+    gpus_per_agg: [1, 2, 2]
+  backend:
+    vllm_config:
+      aggregated:
+        tensor-parallel-size: [1, 2, 2]
+        enable-expert-parallel: [false, false, true]
+  benchmark:
+    concurrencies: ["4","4x8x16x32x64x128x256x512","128x256"]
+
+override_maxtput:
+  name: "minimax-m2.5-nvfp4-b200-1k1k-maxtput-dep2"
+  resources:
+    gpus_per_agg: 2
+  backend:
+    vllm_config:
+      aggregated:
+        tensor-parallel-size: 1
+        enable-expert-parallel: true
+        data-parallel-size: 2
+  benchmark:
+    concurrencies: "512"
+
+zip_override_hightput:
+  name:
+    - "minimax-m2.5-nvfp4-b200-1k1k-hightput-tp4"
+    - "minimax-m2.5-nvfp4-b200-1k1k-hightput-tep4"
+    - "minimax-m2.5-nvfp4-b200-1k1k-hightput-tp8"
+  resources:
+    gpus_per_agg: [4, 4, 8]
+  backend:
+    vllm_config:
+      aggregated:
+        tensor-parallel-size: [4, 4, 8]
+        enable-expert-parallel: [false, true, false]
+  benchmark:
+    concurrencies: ["4x8x16x32x64x128x256x512", "32x64x128", "4"]
diff --git a/recipes/vllm/minimax-m2.5/b200-fp4/8k1k.yaml b/recipes/vllm/minimax-m2.5/b200-fp4/8k1k.yaml
new file mode 100644
index 00000000..7d817e73
--- /dev/null
+++ b/recipes/vllm/minimax-m2.5/b200-fp4/8k1k.yaml
@@ -0,0 +1,88 @@
+# MiniMax-M2.5 NVFP4 B200 — 8K/1K ISL/OSL
+# Aggregated vLLM, single-node
+# Requires github.com/NVIDIA/srt-slurm, branch sa-submission-q2-2026
+# Usage examples:
+# srtctl apply -f 8k1k.yaml     # run all variants
+# srtctl apply -f 8k1k.yaml:zip_override_lowlat    # full lowlat sweep
+# srtctl apply -f 8k1k.yaml:zip_override_lowlat[2] # lowlat, tep2 variant only
+# srtctl apply -f 8k1k.yaml:zip_override_maxtput   # full max tput sweep
+# srtctl dry-run -f 8k1k.yaml   # preview the variants
+
+base:
+  name: "minimax-m2.5-nvfp4-b200-8k1k"
+
+  model:
+    path: "minimax_m2.5_fp4"
+    container: "vllm/vllm-openai:v0.19.0-cu130"
+    precision: "fp4"
+
+  resources:
+    gpu_type: "b200"
+    gpus_per_node: 8
+    agg_nodes: 1
+    agg_workers: 1
+    gpus_per_agg: 1
+
+  frontend:
+    type: dynamo
+    enable_multiple_frontends: false
+
+  dynamo:
+    install: true
+    top_of_tree: true # currently need ToT for vllm 0.19.0
+
+  setup_script: vllm-container-deps.sh
+
+  backend:
+    type: vllm
+
+    aggregated_environment:
+      DYN_HEALTH_CHECK_ENABLED: "false"
+      PYTHONUNBUFFERED: "1"
+
+    vllm_config:
+      aggregated:
+        tensor-parallel-size: 1
+        gpu-memory-utilization: 0.90
+        max-model-len: 9416
+        max-num-batched-tokens: 16384
+        kv-cache-dtype: fp8
+        max-cudagraph-capture-size: 2048
+        stream-interval: 20
+        no-enable-prefix-caching: true
+        trust-remote-code: true
+
+  benchmark:
+    type: "sa-bench"
+    isl: 8192
+    osl: 1024
+    req_rate: "inf"
+
+zip_override_lowlat:
+  name:
+    - "minimax-m2.5-nvfp4-b200-8k1k-lowlat-tp1"
+    - "minimax-m2.5-nvfp4-b200-8k1k-lowlat-tp2"
+    - "minimax-m2.5-nvfp4-b200-8k1k-lowlat-tep2"
+  resources:
+    gpus_per_agg: [1, 2, 2]
+  backend:
+    vllm_config:
+      aggregated:
+        tensor-parallel-size: [1, 2, 2]
+        enable-expert-parallel: [false, false, true]
+  benchmark:
+    concurrencies: ["4x8x16x32x256x512", "4x8x16x32x64x128x256x512", "128x256x512"]
+
+zip_override_maxtput:
+  name:
+    - "minimax-m2.5-nvfp4-b200-8k1k-maxtput-tp4"
+    - "minimax-m2.5-nvfp4-b200-8k1k-maxtput-tp8"
+  resources:
+    gpus_per_agg: [4, 8]
+  backend:
+    vllm_config:
+      aggregated:
+        tensor-parallel-size: [4, 8]
+        enable-expert-parallel: false
+  benchmark:
+    concurrencies: ["4x8x16x32x64x128x256x512", "4"]

From f61dbba884147e3a20536c1369f9c8dcc2372cfb Mon Sep 17 00:00:00 2001
From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com>
Date: Fri, 17 Apr 2026 16:16:29 -0700
Subject: [PATCH 05/14] Add lm-eval benchmark runner for InferenceX evals (#12)

* Add lm-eval benchmark runner for InferenceX evals

Adds support for running lm-eval accuracy evaluations as a post-benchmark
step, leveraging the InferenceX benchmark_lib.sh harness.
---
 docs/accuracy.md                              |  84 +++-
 src/srtctl/benchmarks/__init__.py             |   3 +-
 src/srtctl/benchmarks/lm_eval.py              |  58 +++
 .../benchmarks/scripts/lm-eval/bench.sh       |  77 ++++
 src/srtctl/cli/do_sweep.py                    | 136 +++++-
 src/srtctl/core/runtime.py                    |   8 +
 tests/test_benchmarks.py                      | 418 ++++++++++++++++++
 tests/test_configs.py                         | 110 +++++
 8 files changed, 890 insertions(+), 4 deletions(-)
 create mode 100644 src/srtctl/benchmarks/lm_eval.py
 create mode 100755 src/srtctl/benchmarks/scripts/lm-eval/bench.sh

diff --git a/docs/accuracy.md b/docs/accuracy.md
index f5588c9f..98b69b46 100644
--- a/docs/accuracy.md
+++ b/docs/accuracy.md
@@ -1,6 +1,6 @@
 # Accuracy Benchmarks
 
-In srt-slurm, users can run different accuracy benchmarks by setting the benchmark section in the config yaml file. Supported benchmarks include `mmlu`, `gpqa` and `longbenchv2`.
+In srt-slurm, users can run different accuracy benchmarks by setting the benchmark section in the config yaml file. Supported benchmarks include `mmlu`, `gpqa`, `longbenchv2`, and `lm-eval`.
 
 ## Table of Contents
 
@@ -14,6 +14,7 @@ In srt-slurm, users can run different accuracy benchmarks by setting the benchma
   - [Example: Quick Validation](#example-quick-validation)
   - [Output](#output)
   - [Important Notes](#important-notes)
+- [lm-eval (InferenceX)](#lm-eval-inferencex)
 
 ---
 
@@ -191,3 +192,84 @@ The output includes per-category scores and aggregate metrics:
 4. **Categories**: Running specific categories is useful for targeted validation (e.g., just testing summarization capabilities)
 
 
+## lm-eval (InferenceX)
+
+The `lm-eval` benchmark runner integrates [EleutherAI/lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) via InferenceX's `benchmark_lib.sh`. Unlike the built-in benchmarks above, this runner sources evaluation logic from an external InferenceX workspace mounted at `/infmax-workspace`.
+
+This is used by InferenceX CI to run evals such as GSM8K and GPQA against NVIDIA multi-node disaggregated deployments on GB200, GB300, B200, B300, H100, and H200. AMD MI355X multi-node evals are handled by InferenceX's upstreamed AMD Slurm path, not by this srt-slurm runner.
+
+In InferenceX CI, recipes normally keep their throughput benchmark configuration. `do_sweep.py` invokes the registered `lm-eval` runner as a post-step when `RUN_EVAL=true`, or as the only benchmark-like step when `EVAL_ONLY=true`. There is no separate `infmax-eval` benchmark type.
+
+### How it works
+
+1. `RuntimeContext` mounts the host path from `INFMAX_WORKSPACE` at `/infmax-workspace` inside the Slurm container.
+2. `do_sweep.py` starts infrastructure, workers, and the frontend for the normal recipe topology.
+3. For `EVAL_ONLY=true`, `do_sweep.py` skips the throughput benchmark stage and runs `_run_post_eval()` directly after frontend startup.
+4. `_run_post_eval()` waits for the OpenAI-compatible endpoint on port 8000 and, in eval-only mode, performs the full `wait_for_model()` health check for the configured prefill/decode or aggregated topology.
+5. `_run_post_eval()` launches the registered `lm-eval` runner on the head node and passes through InferenceX metadata such as framework, precision, sequence length, prefill/decode topology, and eval concurrency.
+6. The runner script (`benchmarks/scripts/lm-eval/bench.sh`) uses `MODEL_NAME` from `do_sweep.py`, or auto-discovers the served model from `/v1/models` as a fallback.
+7. The runner sources `/infmax-workspace/benchmarks/benchmark_lib.sh`, runs `run_eval --framework lm-eval`, and calls `append_lm_eval_summary`.
+8. Eval artifacts are copied to `/logs/eval_results/` for InferenceX launcher-side artifact pickup.
+
+### EVAL_ONLY mode
+
+srt-slurm supports an `EVAL_ONLY` mode for CI jobs that should only validate accuracy. This is controlled by environment variables from the InferenceX workflow:
+
+| Env var | Description |
+|---------|-------------|
+| `EVAL_ONLY` | Set to `true` to skip the throughput benchmark stage and run eval only |
+| `RUN_EVAL` | Set to `true` to run eval after the throughput benchmark completes |
+| `EVAL_CONC` | Concurrent requests for lm-eval, normally set by InferenceX from the generated `eval-conc` value |
+| `INFMAX_WORKSPACE` | Host path to the InferenceX checkout that should be mounted at `/infmax-workspace` |
+| `MODEL_NAME` | Served model alias for OpenAI-compatible requests; set by `do_sweep.py` from `config.served_model_name` |
+
+When `EVAL_ONLY=true`:
+- Stage 4 skips the throughput benchmark entirely. No throughput result JSON is expected from srt-slurm.
+- The eval path uses the full `wait_for_model()` health check before starting lm-eval.
+- `_run_post_eval()` launches the `lm-eval` runner and returns its exit code.
+- Eval failure is fatal because eval is the only purpose of the job.
+
+When `RUN_EVAL=true` (without `EVAL_ONLY`):
+- The throughput benchmark runs normally.
+- After the benchmark completes successfully, eval runs as a post-step.
+- Eval failure is non-fatal; the benchmark job still succeeds if throughput passed.
+
+### Environment variables
+
+The following env vars are passed through to the lm-eval runner container:
+
+| Env var | Purpose |
+|---------|---------|
+| `RUN_EVAL`, `EVAL_ONLY`, `IS_MULTINODE` | Control whether eval runs and how InferenceX classifies the artifact |
+| `FRAMEWORK`, `PRECISION`, `MODEL_PREFIX`, `RUNNER_TYPE`, `SPEC_DECODING` | Benchmark identity metadata for `meta_env.json` |
+| `ISL`, `OSL`, `RESULT_FILENAME` | Sequence length and result-file metadata |
+| `MODEL`, `MODEL_PATH`, `MODEL_NAME` | Model metadata and the served model alias used for requests |
+| `MAX_MODEL_LEN`, `EVAL_MAX_MODEL_LEN` | Context-length metadata used by InferenceX eval helpers when available |
+| `PREFILL_TP`, `PREFILL_EP`, `PREFILL_NUM_WORKERS`, `PREFILL_DP_ATTN` | Prefill-side topology metadata |
+| `DECODE_TP`, `DECODE_EP`, `DECODE_NUM_WORKERS`, `DECODE_DP_ATTN` | Decode-side topology metadata |
+| `EVAL_CONC`, `EVAL_CONCURRENT_REQUESTS` | Eval concurrency controls |
+
+The runner maps srt-slurm's `PREFILL_DP_ATTN` and `DECODE_DP_ATTN` names to InferenceX's `PREFILL_DP_ATTENTION` and `DECODE_DP_ATTENTION` names before calling `append_lm_eval_summary`. This is required for multi-node summary tables to preserve prefill/decode DPA state.
+
+### Concurrency
+
+Eval concurrency is ultimately read by InferenceX's `benchmark_lib.sh` from `EVAL_CONCURRENT_REQUESTS`. The runner script sets that value from `EVAL_CONC` when present, preserves an existing `EVAL_CONCURRENT_REQUESTS` otherwise, and falls back to `256` only if neither variable is set:
+
+```bash
+export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC:-${EVAL_CONCURRENT_REQUESTS:-256}}"
+```
+
+The InferenceX workflow sets `EVAL_CONC` from the generated `eval-conc` value. For multi-node configs, InferenceX selects the `8k1k` entry with the highest max eligible concurrency for each `(model, runner, framework, precision, spec-decoding, prefill-dp-attn, decode-dp-attn)` group, then sets `eval-conc` to the upper median of that config's eligible concurrency list. If `EVAL_CONC` is not set in the environment, `do_sweep.py` falls back to the max of the recipe benchmark concurrency list.
+
+### Output
+
+Eval artifacts are written to `/logs/eval_results/` inside the container:
+- `meta_env.json` - metadata used by InferenceX aggregation and summary tables
+- `results*.json` - lm-eval scores per task
+- `sample*.jsonl` - per-sample outputs
+
+These are collected by the InferenceX NVIDIA launch scripts and uploaded as workflow artifacts. In eval-only mode the InferenceX workflow expects eval artifacts, not throughput benchmark artifacts.
+
+### Intricacies
+1. Eval concurrency floor of 16
+   - One sweep config uses `conc: [1]`; running evals at that concurrency takes more than 4 hours to complete, hence the floor of 16.
diff --git a/src/srtctl/benchmarks/__init__.py b/src/srtctl/benchmarks/__init__.py
index 3a2d6449..088617a6 100644
--- a/src/srtctl/benchmarks/__init__.py
+++ b/src/srtctl/benchmarks/__init__.py
@@ -4,7 +4,7 @@
 """Benchmark runners for srtctl."""
 
 # Import runners to trigger registration
-from srtctl.benchmarks import gpqa, gsm8k, longbenchv2, mmlu, mooncake_router, router, sa_bench, sglang_bench
+from srtctl.benchmarks import gpqa, gsm8k, lm_eval, longbenchv2, mmlu, mooncake_router, router, sa_bench, sglang_bench
 from srtctl.benchmarks.base import (
     BenchmarkRunner,
     get_runner,
@@ -18,6 +18,7 @@
     "list_benchmarks",
     "register_benchmark",
     # Runners
+    "lm_eval",
     "sa_bench",
     "sglang_bench",
     "mmlu",
diff --git a/src/srtctl/benchmarks/lm_eval.py b/src/srtctl/benchmarks/lm_eval.py
new file mode 100644
index 00000000..c63ec097
--- /dev/null
+++ b/src/srtctl/benchmarks/lm_eval.py
@@ -0,0 +1,58 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2026 SemiAnalysis LLC. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""lm-eval benchmark runner for InferenceX evals."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from srtctl.benchmarks.base import SCRIPTS_DIR, BenchmarkRunner, register_benchmark
+
+if TYPE_CHECKING:
+    from srtctl.core.runtime import RuntimeContext
+    from srtctl.core.schema import SrtConfig
+
+
+@register_benchmark("lm-eval")
+class LMEvalRunner(BenchmarkRunner):
+    """Benchmark runner that launches lm-eval through InferenceX's benchmark_lib.sh.
+
+    The evaluation itself happens in ``bench.sh``; this class only reports where
+    that script lives and assembles the command line used to invoke it.
+    """
+
+    @property
+    def name(self) -> str:
+        """Name of this benchmark."""
+        return "lm-eval"
+
+    @property
+    def script_path(self) -> str:
+        """Path of the lm-eval ``bench.sh`` script that is handed to ``bash``."""
+        return "/srtctl-benchmarks/lm-eval/bench.sh"
+
+    @property
+    def local_script_dir(self) -> str:
+        """The ``lm-eval`` directory under ``SCRIPTS_DIR``, as a string."""
+        return str(SCRIPTS_DIR / "lm-eval")
+
+    def validate_config(self, config: SrtConfig) -> list[str]:
+        """Return no validation errors; nothing in ``config`` is inspected."""
+        return []
+
+    def build_command(
+        self,
+        config: SrtConfig,
+        runtime: RuntimeContext,
+    ) -> list[str]:
+        """Return ``bash <script> <endpoint> <workspace>`` for the eval run."""
+        # The workspace argument is the in-container mount point; the INFMAX_WORKSPACE
+        # env var holds the host path, which runtime.py uses to set up the mount.
+        return [
+            "bash",
+            self.script_path,
+            f"http://localhost:{runtime.frontend_port}",
+            "/infmax-workspace",
+        ]
diff --git a/src/srtctl/benchmarks/scripts/lm-eval/bench.sh b/src/srtctl/benchmarks/scripts/lm-eval/bench.sh
new file mode 100755
index 00000000..a10e4e7d
--- /dev/null
+++ b/src/srtctl/benchmarks/scripts/lm-eval/bench.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2026 SemiAnalysis LLC. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# lm-eval accuracy evaluation using InferenceX benchmark_lib
+# Usage: bench.sh <endpoint> [infmax_workspace]   (workspace defaults to /infmax-workspace)
+
+set -e
+
+ENDPOINT=$1
+INFMAX_WORKSPACE=${2:-/infmax-workspace}
+
+# Split the endpoint URL (e.g., http://localhost:8000) into HOST and PORT
+HOST=$(echo "$ENDPOINT" | sed -E 's|https?://||; s|:.*||')
+PORT=$(echo "$ENDPOINT" | sed -E 's|.*:([0-9]+).*|\1|')
+
+echo "lm-eval Config: endpoint=${ENDPOINT}; host=${HOST}; port=${PORT}; workspace=${INFMAX_WORKSPACE}"
+
+# Auto-discover the served model name from /v1/models if MODEL_NAME is not set.
+# The id of the first entry in the response's "data" list is used, so the name is one
+# the server recognizes regardless of what $MODEL (the HuggingFace ID) is set to.
+if [[ -z "${MODEL_NAME:-}" ]]; then
+    DISCOVERED_MODEL=$(curl -sf "${ENDPOINT}/v1/models" 2>/dev/null \
+        | python3 -c "import sys,json; d=json.load(sys.stdin); print(d['data'][0]['id'])" 2>/dev/null || true)
+    if [[ -n "$DISCOVERED_MODEL" ]]; then
+        export MODEL_NAME="$DISCOVERED_MODEL"
+        echo "Auto-discovered MODEL_NAME from /v1/models: ${MODEL_NAME}"
+    else
+        echo "WARNING: Could not discover model name from /v1/models, using MODEL_NAME=${MODEL_NAME:-$MODEL}"
+    fi
+else
+    echo "Using MODEL_NAME from environment: ${MODEL_NAME}"
+fi
+
+# cd to workspace so that relative paths (e.g., utils/evals/*.yaml) resolve
+cd "${INFMAX_WORKSPACE}"
+
+# Source the InferenceX benchmark library (expected to define run_eval and append_lm_eval_summary)
+source "${INFMAX_WORKSPACE}/benchmarks/benchmark_lib.sh"
+
+# Run lm-eval via benchmark_lib. Concurrency precedence: EVAL_CONC (set by the
+# InferenceX workflow), then an existing EVAL_CONCURRENT_REQUESTS, then 256.
+# benchmark_lib reads concurrency from EVAL_CONCURRENT_REQUESTS env var.
+export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC:-${EVAL_CONCURRENT_REQUESTS:-256}}"
+echo "Running lm-eval with concurrent-requests=${EVAL_CONCURRENT_REQUESTS}..."
+eval_rc=0  # failure is recorded rather than aborting under set -e, so artifacts are still copied
+run_eval --framework lm-eval --port "$PORT" || eval_rc=$?
+
+# Derive metadata env vars that append_lm_eval_summary needs but do_sweep.py
+# does not pass directly (it passes PREFILL_TP/EP/etc, not TP/EP_SIZE/CONC).
+export IS_MULTINODE="${IS_MULTINODE:-true}"
+export TP="${TP:-${PREFILL_TP:-1}}"
+export CONC="${CONC:-${EVAL_CONC:-${EVAL_CONCURRENT_REQUESTS:-1}}}"
+export EP_SIZE="${EP_SIZE:-${PREFILL_EP:-1}}"
+export DP_ATTENTION="${DP_ATTENTION:-${PREFILL_DP_ATTN:-false}}"
+# Remap srt-slurm's DP_ATTN names to InferenceX's DP_ATTENTION names
+export PREFILL_DP_ATTENTION="${PREFILL_DP_ATTENTION:-${PREFILL_DP_ATTN:-${DP_ATTENTION:-false}}}"
+export DECODE_DP_ATTENTION="${DECODE_DP_ATTENTION:-${DECODE_DP_ATTN:-${DP_ATTENTION:-false}}}"
+
+# Generate the lm-eval summary (best-effort: a failure here does not change the exit code)
+echo "Generating lm-eval summary..."
+append_lm_eval_summary || true
+
+# Copy eval artifacts to /logs/eval_results/; files that are missing are skipped silently
+mkdir -p /logs/eval_results
+echo "Copying eval artifacts to /logs/eval_results/..."
+cp -v meta_env.json /logs/eval_results/ 2>/dev/null || true
+cp -v results*.json /logs/eval_results/ 2>/dev/null || true
+cp -v sample*.jsonl /logs/eval_results/ 2>/dev/null || true
+
+if [[ "$eval_rc" -ne 0 ]]; then
+    echo "lm-eval evaluation failed with exit code ${eval_rc}"
+    exit "$eval_rc"
+fi
+
+echo "lm-eval evaluation complete"
diff --git a/src/srtctl/cli/do_sweep.py b/src/srtctl/cli/do_sweep.py
index ff6eaa91..77b79ac5 100644
--- a/src/srtctl/cli/do_sweep.py
+++ b/src/srtctl/cli/do_sweep.py
@@ -18,6 +18,7 @@
 import os
 import sys
 import threading
+import time
 from dataclasses import dataclass
 from pathlib import Path
 
@@ -179,6 +180,118 @@ def _print_connection_info(self) -> None:
         logger.info("=" * 60)
         logger.info("")
 
+    def _run_post_eval(self, stop_event: threading.Event) -> int:
+        """Run lm-eval after the main benchmark completes (or directly in eval-only mode).
+
+        Returns the eval exit code, or 1 when the server is not ready, the runner is missing, or a stop is requested.
+        """
+        from srtctl.benchmarks import get_runner
+        from srtctl.core.health import wait_for_model
+
+        # Eval-only mode skipped the benchmark health check, so do the full model-ready wait
+        # here; after a benchmark a quick port check suffices since the server already served traffic.
+        if os.environ.get("EVAL_ONLY", "false").lower() == "true":
+            r = self.config.resources
+            n_prefill = 0 if r.num_agg > 0 else r.num_prefill
+            n_decode = r.num_agg if r.num_agg > 0 else r.num_decode
+            hc = self.config.health_check
+            logger.info("EVAL_ONLY: Waiting for server health before eval...")
+            if not wait_for_model(
+                host=self.runtime.nodes.head,
+                port=8000,  # NOTE(review): the eval command itself targets runtime.frontend_port; confirm both match
+                n_prefill=n_prefill,
+                n_decode=n_decode,
+                poll_interval=float(hc.interval_seconds),
+                timeout=float(hc.max_attempts * hc.interval_seconds),
+                report_every=60.0,
+                frontend_type=self.config.frontend.type,
+                stop_event=stop_event,
+            ):
+                logger.error("Server did not become healthy for eval")
+                return 1
+        else:
+            if not wait_for_port(self.runtime.nodes.head, 8000, timeout=30):
+                logger.error("Server health check failed before eval - skipping")
+                return 1
+
+        try:
+            runner = get_runner("lm-eval")
+        except ValueError as e:
+            logger.error("lm-eval runner not available: %s", e)
+            return 1
+
+        eval_log = self.runtime.log_dir / "eval.out"
+        cmd = runner.build_command(self.config, self.runtime)
+
+        logger.info("Eval command: %s", " ".join(cmd))
+        logger.info("Eval log: %s", eval_log)
+
+        # Pass through eval-related env vars. InferenceX writes multi-node
+        # metadata from these variables in append_lm_eval_summary().
+        env_to_set = {}
+        for var in [
+            "RUN_EVAL",
+            "EVAL_ONLY",
+            "IS_MULTINODE",
+            "FRAMEWORK",
+            "PRECISION",
+            "MODEL_PREFIX",
+            "RUNNER_TYPE",
+            "RESULT_FILENAME",
+            "SPEC_DECODING",
+            "ISL",
+            "OSL",
+            "MODEL",
+            "MODEL_PATH",
+            "MAX_MODEL_LEN",
+            "EVAL_MAX_MODEL_LEN",
+            "PREFILL_TP",
+            "PREFILL_EP",
+            "PREFILL_DP_ATTN",
+            "PREFILL_NUM_WORKERS",
+            "DECODE_TP",
+            "DECODE_EP",
+            "DECODE_DP_ATTN",
+            "DECODE_NUM_WORKERS",
+        ]:
+            val = os.environ.get(var)
+            if val:  # unset and empty variables are not forwarded
+                env_to_set[var] = val
+
+        # Set MODEL_NAME to the served model name so lm-eval uses the correct name for API requests.
+        # Without this, benchmark_lib.sh falls back to $MODEL (the HuggingFace ID) which the server doesn't recognize.
+        env_to_set["MODEL_NAME"] = self.config.served_model_name
+        logger.info("Eval MODEL_NAME: %s", env_to_set["MODEL_NAME"])
+
+        # Prefer the workflow's EVAL_CONC (from InferenceX mark_eval_entries); else max benchmark concurrency.
+        eval_conc = os.environ.get("EVAL_CONC")
+        if eval_conc:
+            env_to_set["EVAL_CONC"] = eval_conc
+            logger.info("Eval concurrency (from workflow): %s", eval_conc)
+        else:
+            conc_list = self.config.benchmark.get_concurrency_list()
+            if conc_list:
+                env_to_set["EVAL_CONC"] = str(max(conc_list))
+                logger.info("Eval concurrency (max of %s): %s", conc_list, env_to_set["EVAL_CONC"])
+
+        proc = start_srun_process(
+            command=cmd,
+            nodelist=[self.runtime.nodes.head],
+            output=str(eval_log),
+            container_image=str(self.runtime.container_image),
+            container_mounts=self.runtime.container_mounts,
+            env_to_set=env_to_set,
+        )
+
+        while proc.poll() is None:  # poll once per second so a stop request is noticed promptly
+            if stop_event.is_set():
+                logger.info("Stop requested, terminating eval")
+                proc.terminate()
+                return 1
+            time.sleep(1)
+
+        return proc.returncode or 0
+
     def run(self) -> int:
         """Run the complete sweep."""
         # Create status reporter (fire-and-forget, no-op if not configured)
@@ -221,8 +334,27 @@ def run(self) -> int:
 
             self._print_connection_info()
 
-            # Stage 4: Benchmark (status reported AFTER health check passes)
-            exit_code = self.run_benchmark(registry, stop_event, reporter)
+            if os.environ.get("EVAL_ONLY", "false").lower() == "true":
+                reporter.report(JobStatus.BENCHMARK, JobStage.BENCHMARK, "Running eval-only evaluation")
+                logger.info("EVAL_ONLY=true: Skipping benchmark stage and running lm-eval evaluation...")
+                exit_code = self._run_post_eval(stop_event)
+                if exit_code != 0:
+                    logger.error("Eval-only evaluation failed with exit code %d", exit_code)
+                else:
+                    logger.info("Eval-only evaluation completed successfully")
+            else:
+                # Stage 4: Benchmark (status reported AFTER health check passes)
+                exit_code = self.run_benchmark(registry, stop_event, reporter)
+
+                # Stage 5: Post-benchmark eval (optional, non-fatal)
+                if os.environ.get("RUN_EVAL", "false").lower() == "true" and exit_code == 0:
+                    reporter.report(JobStatus.BENCHMARK, JobStage.BENCHMARK, "Running post-benchmark evaluation")
+                    logger.info("RUN_EVAL=true: Running post-benchmark lm-eval evaluation...")
+                    eval_exit = self._run_post_eval(stop_event)
+                    if eval_exit != 0:
+                        logger.warning("Eval failed with exit code %d (benchmark result is still valid)", eval_exit)
+                    else:
+                        logger.info("Post-benchmark eval completed successfully")
 
         except Exception as e:
             logger.exception("Error during sweep: %s", e)
diff --git a/src/srtctl/core/runtime.py b/src/srtctl/core/runtime.py
index 3e68bdd5..31195ed3 100644
--- a/src/srtctl/core/runtime.py
+++ b/src/srtctl/core/runtime.py
@@ -231,6 +231,14 @@ def from_config(
                 host_path, container_path = mount_spec.split(":", 1)
                 container_mounts[Path(host_path).resolve()] = Path(container_path)
 
+        # Mount InferenceX workspace if available (for lm-eval support).
+        # Skip exists() check: the orchestrator runs on the SLURM head node
+        # where the GH Actions workspace path may not be directly accessible,
+        # but it IS accessible from compute nodes via shared filesystem.
+        infmax_ws = os.environ.get("INFMAX_WORKSPACE")
+        if infmax_ws:
+            container_mounts[Path(infmax_ws)] = Path("/infmax-workspace")
+
         # Add FormattablePath mounts from config.container_mounts
         # These need to be expanded with the runtime context, so we create a
         # temporary context first and then update
diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py
index 261020c7..c15759b2 100644
--- a/tests/test_benchmarks.py
+++ b/tests/test_benchmarks.py
@@ -193,6 +193,62 @@ def test_build_command_includes_tokenizer_path(self):
         assert cmd[7] == "/model"  # tokenizer path
 
 
+class TestLMEvalRunner:
+    """Tests for the lm-eval runner: registration, script locations, and command construction."""
+
+    def test_registry_includes_lm_eval(self):
+        """lm-eval is in the benchmark registry."""
+        assert "lm-eval" in list_benchmarks()
+
+    def test_get_runner(self):
+        """get_runner("lm-eval") returns a runner whose name is "lm-eval"."""
+        runner = get_runner("lm-eval")
+        assert runner.name == "lm-eval"
+
+    def test_script_path(self):
+        """Script path points to lm-eval bench.sh."""
+        runner = get_runner("lm-eval")
+        assert "lm-eval/bench.sh" in runner.script_path
+
+    def test_local_script_dir(self):
+        """Local script dir ends with the lm-eval directory name."""
+        runner = get_runner("lm-eval")
+        assert runner.local_script_dir.endswith("lm-eval")
+
+    def test_validate_config_always_valid(self):
+        """validate_config reports no errors for a minimal sa-bench config."""
+        from srtctl.benchmarks.lm_eval import LMEvalRunner
+        from srtctl.core.schema import BenchmarkConfig, ModelConfig, ResourceConfig, SrtConfig
+
+        runner = LMEvalRunner()
+        config = SrtConfig(
+            name="test",
+            model=ModelConfig(path="/model", container="/image", precision="fp4"),
+            resources=ResourceConfig(gpu_type="h100"),
+            benchmark=BenchmarkConfig(type="sa-bench"),
+        )
+        assert runner.validate_config(config) == []
+
+    def test_build_command(self):
+        """build_command returns bash, the script path, the localhost endpoint, and the workspace mount."""
+        from unittest.mock import MagicMock
+
+        from srtctl.benchmarks.lm_eval import LMEvalRunner
+
+        runner = LMEvalRunner()
+        runtime = MagicMock()
+        runtime.frontend_port = 8000
+
+        config = MagicMock()
+        cmd = runner.build_command(config, runtime)
+        assert cmd == [
+            "bash",
+            "/srtctl-benchmarks/lm-eval/bench.sh",
+            "http://localhost:8000",
+            "/infmax-workspace",
+        ]
+
+
 class TestScriptsExist:
     """Test that benchmark scripts exist."""
 
@@ -209,3 +265,365 @@ def test_mmlu_script_exists(self):
         """MMLU script exists."""
         script = SCRIPTS_DIR / "mmlu" / "bench.sh"
         assert script.exists()
+
+
+class TestRunPostEval:
+    """Tests for SweepOrchestrator._run_post_eval with health checks and srun patched out."""
+
+    @staticmethod
+    def _make_orchestrator():
+        """Build a SweepOrchestrator from a real SrtConfig and RuntimeContext filled with test values."""
+        from pathlib import Path
+
+        from srtctl.cli.do_sweep import SweepOrchestrator
+        from srtctl.core.runtime import Nodes, RuntimeContext
+        from srtctl.core.schema import (
+            BenchmarkConfig,
+            FrontendConfig,
+            HealthCheckConfig,
+            ModelConfig,
+            ResourceConfig,
+            SrtConfig,
+        )
+
+        config = SrtConfig(
+            name="test",
+            model=ModelConfig(path="/model/test-model", container="/image", precision="fp4"),
+            resources=ResourceConfig(
+                gpu_type="h100",
+                gpus_per_node=8,
+                prefill_nodes=1,
+                decode_nodes=2,
+                prefill_workers=1,
+                decode_workers=2,
+            ),
+            benchmark=BenchmarkConfig(type="sa-bench", isl=1024, osl=1024, concurrencies="128x256x512"),
+            health_check=HealthCheckConfig(max_attempts=3, interval_seconds=1),
+            frontend=FrontendConfig(type="dynamo"),
+        )
+        runtime = RuntimeContext(
+            job_id="12345",
+            run_name="test-run",
+            nodes=Nodes(head="node0", bench="node0", infra="node0", worker=("node0", "node1", "node2")),
+            head_node_ip="10.0.0.1",
+            infra_node_ip="10.0.0.1",
+            log_dir=Path("/tmp/logs"),
+            model_path=Path("/model/test-model"),
+            container_image=Path("/path/to/container.sqsh"),
+            gpus_per_node=8,
+            network_interface=None,
+            container_mounts={},
+            environment={},
+        )
+        return SweepOrchestrator(config=config, runtime=runtime)
+
+    def test_post_benchmark_port_check_fails(self):
+        """Returns 1 when port check fails in post-benchmark mode."""
+        import os
+        import threading
+        from unittest.mock import patch
+
+        orch = self._make_orchestrator()
+        stop = threading.Event()
+        with patch.dict(os.environ, {"EVAL_ONLY": "false"}, clear=False):
+            with patch("srtctl.cli.do_sweep.wait_for_port", return_value=False):
+                result = orch._run_post_eval(stop)
+        assert result == 1
+
+    def test_eval_only_health_check_fails(self):
+        """Returns 1 when health check fails in eval-only mode."""
+        import os
+        import threading
+        from unittest.mock import patch
+
+        orch = self._make_orchestrator()
+        stop = threading.Event()
+        with patch.dict(os.environ, {"EVAL_ONLY": "true"}, clear=False):
+            with patch("srtctl.core.health.wait_for_model", return_value=False):
+                result = orch._run_post_eval(stop)
+        assert result == 1
+
+    def test_runner_not_available(self):
+        """Returns 1 when get_runner raises ValueError for lm-eval."""
+        import os
+        import threading
+        from unittest.mock import patch
+
+        orch = self._make_orchestrator()
+        stop = threading.Event()
+        with patch.dict(os.environ, {"EVAL_ONLY": "false"}, clear=False):
+            with patch("srtctl.cli.do_sweep.wait_for_port", return_value=True):
+                with patch("srtctl.benchmarks.get_runner", side_effect=ValueError("not found")):
+                    result = orch._run_post_eval(stop)
+        assert result == 1
+
+    def test_successful_eval(self):
+        """Returns 0 when the eval process is still running on the first poll and then exits 0."""
+        import os
+        import threading
+        from unittest.mock import MagicMock, patch
+
+        orch = self._make_orchestrator()
+        stop = threading.Event()
+
+        mock_proc = MagicMock()
+        mock_proc.poll.side_effect = [None, 0]
+        mock_proc.returncode = 0
+
+        with patch.dict(os.environ, {"EVAL_ONLY": "false"}, clear=False):
+            with patch("srtctl.cli.do_sweep.wait_for_port", return_value=True):
+                with patch("srtctl.cli.do_sweep.start_srun_process", return_value=mock_proc):
+                    result = orch._run_post_eval(stop)
+        assert result == 0
+
+    def test_eval_only_successful(self):
+        """Returns 0 in eval-only mode when health check and eval succeed."""
+        import os
+        import threading
+        from unittest.mock import MagicMock, patch
+
+        orch = self._make_orchestrator()
+        stop = threading.Event()
+
+        mock_proc = MagicMock()
+        mock_proc.poll.side_effect = [None, 0]
+        mock_proc.returncode = 0
+
+        with patch.dict(os.environ, {"EVAL_ONLY": "true"}, clear=False):
+            with patch("srtctl.core.health.wait_for_model", return_value=True):
+                with patch("srtctl.cli.do_sweep.start_srun_process", return_value=mock_proc):
+                    result = orch._run_post_eval(stop)
+        assert result == 0
+
+    def test_env_var_passthrough(self):
+        """Eval env vars from os.environ, plus MODEL_NAME, reach srun through env_to_set."""
+        import os
+        import threading
+        from unittest.mock import MagicMock, patch
+
+        orch = self._make_orchestrator()
+        stop = threading.Event()
+
+        mock_proc = MagicMock()
+        mock_proc.poll.return_value = 0
+        mock_proc.returncode = 0
+
+        env_vars = {
+            "EVAL_ONLY": "false",
+            "RUN_EVAL": "true",
+            "FRAMEWORK": "sglang",
+            "PRECISION": "fp4",
+            "MODEL": "test-model",
+        }
+
+        captured_kwargs = {}
+
+        def capture_srun(**kwargs):
+            captured_kwargs.update(kwargs)
+            return mock_proc
+
+        with patch.dict(os.environ, env_vars, clear=False):
+            with patch("srtctl.cli.do_sweep.wait_for_port", return_value=True):
+                with patch("srtctl.cli.do_sweep.start_srun_process", side_effect=capture_srun):
+                    orch._run_post_eval(stop)
+
+        env_to_set = captured_kwargs["env_to_set"]
+        assert env_to_set["RUN_EVAL"] == "true"
+        assert env_to_set["FRAMEWORK"] == "sglang"
+        assert env_to_set["PRECISION"] == "fp4"
+        assert env_to_set["MODEL"] == "test-model"
+        assert env_to_set["MODEL_NAME"] == "test-model"
+
+    def test_eval_conc_from_env(self):
+        """EVAL_CONC from env takes priority over benchmark concurrencies."""
+        import os
+        import threading
+        from unittest.mock import MagicMock, patch
+
+        orch = self._make_orchestrator()
+        stop = threading.Event()
+
+        mock_proc = MagicMock()
+        mock_proc.poll.return_value = 0
+        mock_proc.returncode = 0
+
+        captured_kwargs = {}
+
+        def capture_srun(**kwargs):
+            captured_kwargs.update(kwargs)
+            return mock_proc
+
+        with patch.dict(os.environ, {"EVAL_ONLY": "false", "EVAL_CONC": "64"}, clear=False):
+            with patch("srtctl.cli.do_sweep.wait_for_port", return_value=True):
+                with patch("srtctl.cli.do_sweep.start_srun_process", side_effect=capture_srun):
+                    orch._run_post_eval(stop)
+
+        assert captured_kwargs["env_to_set"]["EVAL_CONC"] == "64"
+
+    def test_eval_conc_fallback_to_max_concurrency(self):
+        """EVAL_CONC falls back to max of benchmark concurrencies."""
+        import os
+        import threading
+        from unittest.mock import MagicMock, patch
+
+        orch = self._make_orchestrator()
+        stop = threading.Event()
+
+        mock_proc = MagicMock()
+        mock_proc.poll.return_value = 0
+        mock_proc.returncode = 0
+
+        captured_kwargs = {}
+
+        def capture_srun(**kwargs):
+            captured_kwargs.update(kwargs)
+            return mock_proc
+
+        env = {"EVAL_ONLY": "false"}
+        # EVAL_CONC is removed inside the patch.dict block, which restores os.environ on exit
+        with patch.dict(os.environ, env, clear=False):
+            os.environ.pop("EVAL_CONC", None)
+            with patch("srtctl.cli.do_sweep.wait_for_port", return_value=True):
+                with patch("srtctl.cli.do_sweep.start_srun_process", side_effect=capture_srun):
+                    orch._run_post_eval(stop)
+
+        # concurrencies="128x256x512", max is 512
+        assert captured_kwargs["env_to_set"]["EVAL_CONC"] == "512"
+
+    def test_stop_event_terminates_eval(self):
+        """A stop event that is already set terminates the running eval process and returns 1."""
+        import os
+        import threading
+        from unittest.mock import MagicMock, patch
+
+        orch = self._make_orchestrator()
+        stop = threading.Event()
+        stop.set()
+
+        mock_proc = MagicMock()
+        mock_proc.poll.return_value = None
+
+        with patch.dict(os.environ, {"EVAL_ONLY": "false"}, clear=False):
+            with patch("srtctl.cli.do_sweep.wait_for_port", return_value=True):
+                with patch("srtctl.cli.do_sweep.start_srun_process", return_value=mock_proc):
+                    result = orch._run_post_eval(stop)
+
+        assert result == 1
+        mock_proc.terminate.assert_called_once()
+
+
+class TestSweepRunEvalIntegration:
+    """Tests for the EVAL_ONLY / RUN_EVAL branches of SweepOrchestrator.run() with every stage patched."""
+
+    @staticmethod
+    def _make_orchestrator():
+        return TestRunPostEval._make_orchestrator()
+
+    def test_run_eval_only_mode(self):
+        """EVAL_ONLY=true skips benchmark and runs _run_post_eval."""
+        import os
+        from unittest.mock import MagicMock, patch
+
+        orch = self._make_orchestrator()
+
+        with patch.dict(os.environ, {"EVAL_ONLY": "true"}, clear=False):
+            with patch.object(orch, "start_head_infrastructure") as mock_head:
+                mock_head.return_value = MagicMock()
+                with patch.object(orch, "start_all_workers", return_value={}):
+                    with patch.object(orch, "start_frontend", return_value=[]):
+                        with patch.object(orch, "_run_post_eval", return_value=0) as mock_eval:
+                            with patch.object(orch, "run_benchmark") as mock_bench:
+                                with patch.object(orch, "run_postprocess"):
+                                    with patch("srtctl.cli.do_sweep.StatusReporter") as mock_reporter_cls:
+                                        mock_reporter_cls.from_config.return_value = MagicMock()
+                                        exit_code = orch.run()
+
+        mock_eval.assert_called_once()
+        mock_bench.assert_not_called()
+        assert exit_code == 0
+
+    def test_run_with_post_benchmark_eval(self):
+        """RUN_EVAL=true runs the benchmark and then _run_post_eval, each exactly once."""
+        import os
+        from unittest.mock import MagicMock, patch
+
+        orch = self._make_orchestrator()
+
+        with patch.dict(os.environ, {"EVAL_ONLY": "false", "RUN_EVAL": "true"}, clear=False):
+            with patch.object(orch, "start_head_infrastructure") as mock_head:
+                mock_head.return_value = MagicMock()
+                with patch.object(orch, "start_all_workers", return_value={}):
+                    with patch.object(orch, "start_frontend", return_value=[]):
+                        with patch.object(orch, "run_benchmark", return_value=0) as mock_bench:
+                            with patch.object(orch, "_run_post_eval", return_value=0) as mock_eval:
+                                with patch.object(orch, "run_postprocess"):
+                                    with patch("srtctl.cli.do_sweep.StatusReporter") as mock_reporter_cls:
+                                        mock_reporter_cls.from_config.return_value = MagicMock()
+                                        exit_code = orch.run()
+
+        mock_bench.assert_called_once()
+        mock_eval.assert_called_once()
+        assert exit_code == 0
+
+    def test_run_eval_only_failure(self):
+        """EVAL_ONLY=true with eval failure returns non-zero exit code."""
+        import os
+        from unittest.mock import MagicMock, patch
+
+        orch = self._make_orchestrator()
+
+        with patch.dict(os.environ, {"EVAL_ONLY": "true"}, clear=False):
+            with patch.object(orch, "start_head_infrastructure") as mock_head:
+                mock_head.return_value = MagicMock()
+                with patch.object(orch, "start_all_workers", return_value={}):
+                    with patch.object(orch, "start_frontend", return_value=[]):
+                        with patch.object(orch, "_run_post_eval", return_value=1):
+                            with patch.object(orch, "run_postprocess"):
+                                with patch("srtctl.cli.do_sweep.StatusReporter") as mock_reporter_cls:
+                                    mock_reporter_cls.from_config.return_value = MagicMock()
+                                    exit_code = orch.run()
+
+        assert exit_code == 1
+
+    def test_run_post_benchmark_eval_failure_nonfatal(self):
+        """RUN_EVAL=true with eval failure still returns benchmark exit code 0."""
+        import os
+        from unittest.mock import MagicMock, patch
+
+        orch = self._make_orchestrator()
+
+        with patch.dict(os.environ, {"EVAL_ONLY": "false", "RUN_EVAL": "true"}, clear=False):
+            with patch.object(orch, "start_head_infrastructure") as mock_head:
+                mock_head.return_value = MagicMock()
+                with patch.object(orch, "start_all_workers", return_value={}):
+                    with patch.object(orch, "start_frontend", return_value=[]):
+                        with patch.object(orch, "run_benchmark", return_value=0):
+                            with patch.object(orch, "_run_post_eval", return_value=1):
+                                with patch.object(orch, "run_postprocess"):
+                                    with patch("srtctl.cli.do_sweep.StatusReporter") as mock_reporter_cls:
+                                        mock_reporter_cls.from_config.return_value = MagicMock()
+                                        exit_code = orch.run()
+
+        assert exit_code == 0
+
+    def test_run_eval_skipped_when_benchmark_fails(self):
+        """RUN_EVAL=true with a failing benchmark: eval is never called and run() returns 1."""
+        import os
+        from unittest.mock import MagicMock, patch
+
+        orch = self._make_orchestrator()
+
+        with patch.dict(os.environ, {"EVAL_ONLY": "false", "RUN_EVAL": "true"}, clear=False):
+            with patch.object(orch, "start_head_infrastructure") as mock_head:
+                mock_head.return_value = MagicMock()
+                with patch.object(orch, "start_all_workers", return_value={}):
+                    with patch.object(orch, "start_frontend", return_value=[]):
+                        with patch.object(orch, "run_benchmark", return_value=1):
+                            with patch.object(orch, "_run_post_eval") as mock_eval:
+                                with patch.object(orch, "run_postprocess"):
+                                    with patch("srtctl.cli.do_sweep.StatusReporter") as mock_reporter_cls:
+                                        mock_reporter_cls.from_config.return_value = MagicMock()
+                                        exit_code = orch.run()
+
+        mock_eval.assert_not_called()
+        assert exit_code == 1
diff --git a/tests/test_configs.py b/tests/test_configs.py
index 86d79cdb..0b4138d5 100644
--- a/tests/test_configs.py
+++ b/tests/test_configs.py
@@ -1382,3 +1382,113 @@ def test_agg_mode_no_disaggregation_flag(self):
         assert "--disaggregation-mode" not in cmd
         assert "--is-prefill-worker" not in cmd
         assert "--is-decode-worker" not in cmd
+
+
+class TestInfmaxWorkspaceMount:
+    """Test that INFMAX_WORKSPACE env var creates a container mount."""
+
+    def test_infmax_workspace_mount_added(self, tmp_path):
+        """RuntimeContext includes /infmax-workspace mount when env var is set."""
+        import os
+        import subprocess
+        from pathlib import Path
+        from unittest.mock import MagicMock, patch
+
+        from srtctl.core.runtime import RuntimeContext
+        from srtctl.core.schema import ModelConfig, ResourceConfig, SrtConfig
+
+        model_path = tmp_path / "model"  # real (empty) paths under pytest's tmp_path
+        model_path.mkdir()
+        container_path = tmp_path / "container.sqsh"
+        container_path.touch()
+
+        slurm_env = {
+            "SLURM_JOB_ID": "12345",
+            "SLURM_JOBID": "12345",
+            "SLURM_NODELIST": "gpu-[01-02]",
+            "SLURM_JOB_NUM_NODES": "2",
+            "SRTCTL_SOURCE_DIR": str(Path(__file__).parent.parent),
+            "INFMAX_WORKSPACE": "/actions/runner/workspace",  # the variable under test
+        }
+
+        def mock_scontrol(cmd, **kwargs):  # stand-in for subprocess.run
+            if cmd[0] == "scontrol" and "hostnames" in cmd:  # fake `scontrol ... hostnames` output
+                result = MagicMock()
+                result.stdout = "gpu-01\ngpu-02"
+                result.returncode = 0
+                return result
+            raise subprocess.CalledProcessError(1, cmd)  # every other command fails
+
+        with patch.dict(os.environ, slurm_env):  # os.environ is restored when the block exits
+            with patch("subprocess.run", mock_scontrol):
+                with patch("srtctl.core.slurm.get_hostname_ip", return_value="10.0.0.1"):
+                    config = SrtConfig(
+                        name="test",
+                        model=ModelConfig(
+                            path=str(model_path),
+                            container=str(container_path),
+                            precision="fp8",
+                        ),
+                        resources=ResourceConfig(
+                            gpu_type="h100",
+                            gpus_per_node=8,
+                            prefill_nodes=1,
+                            decode_nodes=1,
+                        ),
+                    )
+                    runtime = RuntimeContext.from_config(config, job_id="12345")
+                    # Values of container_mounts are presumably container-side paths; verify against RuntimeContext.
+                    assert Path("/infmax-workspace") in runtime.container_mounts.values()
+
+    def test_infmax_workspace_mount_not_added_without_env(self, tmp_path):
+        """RuntimeContext does not include /infmax-workspace without env var."""
+        import os
+        import subprocess
+        from pathlib import Path
+        from unittest.mock import MagicMock, patch
+
+        from srtctl.core.runtime import RuntimeContext
+        from srtctl.core.schema import ModelConfig, ResourceConfig, SrtConfig
+
+        model_path = tmp_path / "model"  # real (empty) paths under pytest's tmp_path
+        model_path.mkdir()
+        container_path = tmp_path / "container.sqsh"
+        container_path.touch()
+
+        slurm_env = {  # same as the positive test, minus INFMAX_WORKSPACE
+            "SLURM_JOB_ID": "12345",
+            "SLURM_JOBID": "12345",
+            "SLURM_NODELIST": "gpu-[01-02]",
+            "SLURM_JOB_NUM_NODES": "2",
+            "SRTCTL_SOURCE_DIR": str(Path(__file__).parent.parent),
+        }
+
+        def mock_scontrol(cmd, **kwargs):  # stand-in for subprocess.run
+            if cmd[0] == "scontrol" and "hostnames" in cmd:  # fake `scontrol ... hostnames` output
+                result = MagicMock()
+                result.stdout = "gpu-01\ngpu-02"
+                result.returncode = 0
+                return result
+            raise subprocess.CalledProcessError(1, cmd)  # every other command fails
+
+        with patch.dict(os.environ, slurm_env):
+            os.environ.pop("INFMAX_WORKSPACE", None)  # drop a value inherited from the outer env; patch.dict restores it
+            with patch("subprocess.run", mock_scontrol):
+                with patch("srtctl.core.slurm.get_hostname_ip", return_value="10.0.0.1"):
+                    config = SrtConfig(
+                        name="test",
+                        model=ModelConfig(
+                            path=str(model_path),
+                            container=str(container_path),
+                            precision="fp8",
+                        ),
+                        resources=ResourceConfig(
+                            gpu_type="h100",
+                            gpus_per_node=8,
+                            prefill_nodes=1,
+                            decode_nodes=1,
+                        ),
+                    )
+                    runtime = RuntimeContext.from_config(config, job_id="12345")
+                    # Without the env var, no mount value may equal /infmax-workspace.
+                    assert Path("/infmax-workspace") not in runtime.container_mounts.values()

From 10f4ac9ca2ee3b882b126f3006ab082c9fd623a2 Mon Sep 17 00:00:00 2001
From: Richard Huo <rihuo@nvidia.com>
Date: Mon, 20 Apr 2026 11:55:22 -0700
Subject: [PATCH 06/14] fix: add glm5 dynamo trtllm benchmark support to sa
 submission branch (#47)

* fix tokenizer for glm5 (#20)

fix

* add nvidia pre-release url (#22)
---
 src/srtctl/benchmarks/sa_bench.py             |   4 +
 .../scripts/sa-bench/backend_request_func.py  | 128 +++++++++++++++++-
 .../benchmarks/scripts/sa-bench/bench.sh      |  22 ++-
 .../scripts/sa-bench/benchmark_serving.py     |   9 ++
 src/srtctl/core/schema.py                     |   8 +-
 5 files changed, 162 insertions(+), 9 deletions(-)

diff --git a/src/srtctl/benchmarks/sa_bench.py b/src/srtctl/benchmarks/sa_bench.py
index 9adc6678..5f220393 100644
--- a/src/srtctl/benchmarks/sa_bench.py
+++ b/src/srtctl/benchmarks/sa_bench.py
@@ -97,5 +97,9 @@ def build_command(
             str(prefill_gpus),
             str(decode_gpus),
             str(b.random_range_ratio) if b.random_range_ratio is not None else "0.8",
+            str(b.num_prompts_mult) if b.num_prompts_mult is not None else "10",
+            str(b.num_warmup_mult) if b.num_warmup_mult is not None else "2",
+            b.custom_tokenizer or "",
+            str(b.use_chat_template).lower(),
         ]
         return cmd
diff --git a/src/srtctl/benchmarks/scripts/sa-bench/backend_request_func.py b/src/srtctl/benchmarks/scripts/sa-bench/backend_request_func.py
index dd2cac44..87f3f9ef 100644
--- a/src/srtctl/benchmarks/scripts/sa-bench/backend_request_func.py
+++ b/src/srtctl/benchmarks/scripts/sa-bench/backend_request_func.py
@@ -511,10 +511,106 @@ def get_model(pretrained_model_name_or_path: str) -> str:
     return pretrained_model_name_or_path
 
 
+def _resolve_tokenizer_file(model_name_or_path):
+    """Resolve tokenizer.json from a local directory or HF hub cache."""
+    from pathlib import Path
+
+    local_path = Path(model_name_or_path) / "tokenizer.json"  # first treat the argument as a local directory
+    if local_path.is_file():
+        return str(local_path)
+    try:
+        from huggingface_hub import hf_hub_download  # lazy import: a missing package also ends in None
+
+        return hf_hub_download(model_name_or_path, "tokenizer.json", local_files_only=True)  # cache lookup only
+    except Exception:
+        return None  # best effort: None signals that no tokenizer.json could be located
+
+
+def _fix_v5_tokenizer_components(tokenizer, model_name_or_path):
+    """Fix pre_tokenizer/decoder when transformers v5 LlamaTokenizerFast overwrites them.
+
+    In transformers v5, LlamaTokenizerFast.__init__ rebuilds the pre_tokenizer
+    and decoder from scratch, discarding the originals from tokenizer.json.
+    This breaks models like DeepSeek-R1 that declare LlamaTokenizerFast but
+    actually use a ByteLevel pre_tokenizer.
+
+    Ported from sglang/python/sglang/srt/utils/hf_transformers_utils.py.
+    """
+    backend = getattr(tokenizer, "_tokenizer", None)  # presumably the fast tokenizer's backing object; confirm
+    if backend is None:
+        return  # no `_tokenizer` attribute: nothing to compare or patch
+
+    try:
+        from tokenizers import Tokenizer as RawTokenizer
+
+        tok_file = _resolve_tokenizer_file(model_name_or_path)
+        if tok_file is None:
+            return
+        raw = RawTokenizer.from_file(tok_file)
+    except Exception:
+        return  # best effort: on any load failure the tokenizer is left unchanged
+    # Components are compared by class name only; same-class differences in configuration go undetected.
+    raw_pre = type(raw.pre_tokenizer).__name__ if raw.pre_tokenizer else None
+    loaded_pre = type(backend.pre_tokenizer).__name__ if backend.pre_tokenizer else None
+    # Act only when both sides define a pre_tokenizer and their types differ.
+    if raw_pre and loaded_pre and raw_pre != loaded_pre:
+        print(
+            f"[sa-bench] Fixing v5 tokenizer component mismatch for {model_name_or_path}: "
+            f"pre_tokenizer {loaded_pre} -> {raw_pre}, "
+            f"decoder {type(backend.decoder).__name__ if backend.decoder else None} "
+            f"-> {type(raw.decoder).__name__ if raw.decoder else None}",
+            flush=True,
+        )
+        backend.pre_tokenizer = raw.pre_tokenizer  # mutates the passed tokenizer in place
+        backend.decoder = raw.decoder  # replaced together with pre_tokenizer, without its own comparison
+
+
+def _load_glm_moe_dsa_tokenizer(pretrained_model_name_or_path: str) -> "PreTrainedTokenizerFast":
+    """Load GLM-Moe-Dsa / GLM-5 tokenizer directly from tokenizer.json.
+
+    Works around incompatibilities when the checkpoint was saved with
+    transformers 5.x (TokenizersBackend / list-style extra_special_tokens).
+    """
+    import json
+    from pathlib import Path
+
+    from tokenizers import Tokenizer as RustTokenizer
+    from transformers import PreTrainedTokenizerFast
+
+    _SAFE_CONFIG_KEYS = (  # the only tokenizer_config.json keys forwarded unchanged
+        "pad_token", "pad_token_id", "eos_token", "eos_token_id",
+        "bos_token", "bos_token_id", "unk_token", "unk_token_id",
+        "model_max_length", "padding_side", "truncation_side",
+    )
+
+    path = Path(pretrained_model_name_or_path)  # used as a local directory; no hub lookup happens here
+    tokenizer_json = path / "tokenizer.json"
+    if not tokenizer_json.exists():
+        raise FileNotFoundError(
+            f"Expected tokenizer.json at {tokenizer_json}. "
+            "GlmMoeDsaTokenizer loads from tokenizer.json only."
+        )
+
+    rust_tok = RustTokenizer.from_file(str(tokenizer_json))
+    init_kwargs = {}
+    config_path = path / "tokenizer_config.json"
+    if config_path.exists():  # optional: without it the tokenizer is built with no extra kwargs
+        with open(config_path, encoding="utf-8") as f:
+            config = json.load(f)
+        for key in _SAFE_CONFIG_KEYS:
+            if key in config:
+                init_kwargs[key] = config[key]
+        if "extra_special_tokens" in config:  # forwarded under the `additional_special_tokens` keyword
+            init_kwargs["additional_special_tokens"] = config["extra_special_tokens"]
+
+    return PreTrainedTokenizerFast(tokenizer_object=rust_tok, **init_kwargs)
+
+
 def get_tokenizer(
     pretrained_model_name_or_path: str,
     tokenizer_mode: str = "auto",
     trust_remote_code: bool = False,
+    custom_tokenizer: str | None = None,
     **kwargs,
 ) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
     if pretrained_model_name_or_path is not None and not os.path.exists(pretrained_model_name_or_path):
@@ -533,12 +629,32 @@ def get_tokenizer(
                 "to use mistral tokenizer mode."
             ) from e
         return MistralTokenizer.from_pretrained(str(pretrained_model_name_or_path))
-    else:
-        return AutoTokenizer.from_pretrained(
-            pretrained_model_name_or_path,
-            trust_remote_code=trust_remote_code,
-            **kwargs,
-        )
+
+    if custom_tokenizer:
+        if custom_tokenizer == "glm_moe_dsa":
+            return _load_glm_moe_dsa_tokenizer(pretrained_model_name_or_path)
+        from importlib import import_module
+        try:
+            module_path, class_name = custom_tokenizer.rsplit('.', 1)
+            module = import_module(module_path)
+            tokenizer_class = getattr(module, class_name)
+            return tokenizer_class.from_pretrained(
+                pretrained_model_name_or_path,
+                trust_remote_code=trust_remote_code,
+                **kwargs,
+            )
+        except (ValueError, ImportError, AttributeError) as e:
+            raise ValueError(
+                f"Failed to load custom_tokenizer '{custom_tokenizer}'. "
+                "Expected 'glm_moe_dsa' or 'module.path.ClassName'.") from e
+
+    tokenizer = AutoTokenizer.from_pretrained(
+        pretrained_model_name_or_path,
+        trust_remote_code=trust_remote_code,
+        **kwargs,
+    )
+    _fix_v5_tokenizer_components(tokenizer, pretrained_model_name_or_path)
+    return tokenizer
 
 
 ASYNC_REQUEST_FUNCS = {
diff --git a/src/srtctl/benchmarks/scripts/sa-bench/bench.sh b/src/srtctl/benchmarks/scripts/sa-bench/bench.sh
index ed907308..acddf754 100644
--- a/src/srtctl/benchmarks/scripts/sa-bench/bench.sh
+++ b/src/srtctl/benchmarks/scripts/sa-bench/bench.sh
@@ -60,6 +60,22 @@ TOTAL_GPUS=${9:-0}
 PREFILL_GPUS=${10:-0}
 DECODE_GPUS=${11:-0}
 RANDOM_RANGE_RATIO=${12:-0.8}
+NUM_PROMPTS_MULT=${13:-10}  # NOTE(review): measured run below still uses concurrency * 10; confirm this is consumed
+NUM_WARMUP_MULT=${14:-2}  # defaults to 2 when argument 14 is unset or empty
+CUSTOM_TOKENIZER=${15:-}  # empty string means "no custom tokenizer"
+USE_CHAT_TEMPLATE=${16:-true}  # only the exact string "true" enables the flag below
+
+# Build optional custom tokenizer args (array stays empty when no custom tokenizer is given)
+CUSTOM_TOKENIZER_ARGS=()
+if [ -n "$CUSTOM_TOKENIZER" ]; then
+    CUSTOM_TOKENIZER_ARGS=(--custom-tokenizer "$CUSTOM_TOKENIZER")
+fi
+
+# Build optional chat template args (array stays empty unless USE_CHAT_TEMPLATE is "true")
+CHAT_TEMPLATE_ARGS=()
+if [ "$USE_CHAT_TEMPLATE" = "true" ]; then
+    CHAT_TEMPLATE_ARGS=(--use-chat-template)
+fi
 
 # Parse endpoint into host:port
 HOST=$(echo "$ENDPOINT" | sed 's|http://||' | cut -d: -f1)
@@ -119,7 +135,8 @@ for concurrency in "${CONCURRENCY_LIST[@]}"; do
         --request-rate 250 \
         --percentile-metrics ttft,tpot,itl,e2el \
         --max-concurrency "$concurrency" \
-        --trust-remote-code
+        --trust-remote-code \
+        "${CUSTOM_TOKENIZER_ARGS[@]}"
 
     num_prompts=$((concurrency * 10))
     
@@ -149,7 +166,8 @@ for concurrency in "${CONCURRENCY_LIST[@]}"; do
         --percentile-metrics ttft,tpot,itl,e2el \
         --max-concurrency "$concurrency" \
         --trust-remote-code \
-        --use-chat-template \
+        "${CHAT_TEMPLATE_ARGS[@]}" \
+        "${CUSTOM_TOKENIZER_ARGS[@]}" \
         --save-result --result-dir "$result_dir" --result-filename "$result_filename"
     set +x
 
diff --git a/src/srtctl/benchmarks/scripts/sa-bench/benchmark_serving.py b/src/srtctl/benchmarks/scripts/sa-bench/benchmark_serving.py
index 4363ef6e..a5ea6490 100644
--- a/src/srtctl/benchmarks/scripts/sa-bench/benchmark_serving.py
+++ b/src/srtctl/benchmarks/scripts/sa-bench/benchmark_serving.py
@@ -837,6 +837,7 @@ def main(args: argparse.Namespace):
         tokenizer_id,
         tokenizer_mode=tokenizer_mode,
         trust_remote_code=args.trust_remote_code,
+        custom_tokenizer=args.custom_tokenizer,
     )
 
     if args.dataset is not None:
@@ -1279,6 +1280,14 @@ def main(args: argparse.Namespace):
         '"custom" will use --tokenizer to select the preregistered tokenizer.',
     )
 
+    parser.add_argument(
+        "--custom-tokenizer",
+        type=str,
+        default=None,
+        help="Custom tokenizer to use (e.g., 'glm_moe_dsa' or 'module.path.ClassName'). "
+        "When set, overrides the default tokenizer loading.",
+    )
+
     parser.add_argument(
         "--served-model-name",
         type=str,
diff --git a/src/srtctl/core/schema.py b/src/srtctl/core/schema.py
index 085db6c8..c535be39 100644
--- a/src/srtctl/core/schema.py
+++ b/src/srtctl/core/schema.py
@@ -539,6 +539,12 @@ class BenchmarkConfig:
     ttft_threshold_ms: int | None = None  # Goodput TTFT threshold in ms (default: 2000)
     itl_threshold_ms: int | None = None  # Goodput ITL threshold in ms (default: 25)
     random_range_ratio: float | None = None  # Random input/output length range ratio (default: 0.8)
+    num_prompts_mult: int | None = None  # Multiplier for num_prompts = concurrency * mult (default: 10)
+    num_warmup_mult: int | None = None  # Multiplier for warmup prompts = concurrency * mult (default: 2)
+    # Trace replay benchmark fields (uses aiperf with mooncake_trace dataset type)
+    trace_file: str | None = None  # Path to trace JSONL file (container path, e.g., /traces/dataset.jsonl)
+    custom_tokenizer: str | None = None  # Custom tokenizer class (e.g., "module.path.ClassName")
+    use_chat_template: bool = True  # Pass --use-chat-template to benchmark (default: true)
 
     def get_concurrency_list(self) -> list[int]:
         if self.concurrencies is None:
@@ -711,7 +717,7 @@ def get_install_commands(self) -> str:
         if self.version is not None:
             return (
                 f"echo 'Installing dynamo {self.version}...' && "
-                f"pip install --break-system-packages --quiet ai-dynamo-runtime=={self.version} ai-dynamo=={self.version} && "
+                f"pip install --break-system-packages --quiet --extra-index-url https://pypi.nvidia.com ai-dynamo-runtime=={self.version} ai-dynamo=={self.version} && "
                 f"echo 'Dynamo {self.version} installed'"
             )
 

From a10acd3f097a24498f0fb18523544f87dc35bed5 Mon Sep 17 00:00:00 2001
From: Yeswanth koti <yeswanthk@nvidia.com>
Date: Tue, 21 Apr 2026 20:23:27 -0400
Subject: [PATCH 07/14] Add GLM5 disaggregated recipes for SA submission (#48)

Add 66 GLM5 NVFP4 disaggregated recipe configs for GB200 and GB300 on the sa-submission branch; standardize model path and container values across the recipe set for consistency.
---
 ...ctx1dep4_gen1dep16_batch32_eplb0_mtp2.yaml | 135 ++++++++++++
 ...ctx1dep4_gen1dep16_batch64_eplb0_mtp1.yaml | 139 +++++++++++++
 ..._gen1dep32_batch16_allconc_eplb0_mtp3.yaml | 133 ++++++++++++
 .../ctx1dep4_gen4tep8_batch16_eplb0_mtp2.yaml | 134 ++++++++++++
 ...4_gen4tep8_batch32_allconc_eplb0_mtp3.yaml | 136 ++++++++++++
 .../ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml  | 131 ++++++++++++
 ...2dep4_gen1dep16_batch256_eplb256_mtp1.yaml | 167 +++++++++++++++
 ...3dep4_gen1dep32_batch128_eplb288_mtp1.yaml | 151 ++++++++++++++
 ...ctx1dep4_gen1dep32_batch32_eplb0_mtp0.yaml | 129 ++++++++++++
 ..._gen4tep8_batch128_allconc_eplb0_mtp0.yaml | 142 +++++++++++++
 ...p4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml | 126 +++++++++++
 ...tx2dep4_gen1dep16_batch128_eplb0_mtp0.yaml | 141 +++++++++++++
 ...2dep4_gen1dep16_batch512_eplb256_mtp0.yaml | 193 +++++++++++++++++
 ...ctx2dep4_gen1dep32_batch64_eplb0_mtp0.yaml | 133 ++++++++++++
 ...4dep4_gen1dep32_batch256_eplb288_mtp0.yaml | 161 +++++++++++++++
 ...tx10dep4_gen1dep16_batch64_eplb0_mtp1.yaml | 139 +++++++++++++
 .../ctx1dep4_gen2tep8_batch16_eplb0_mtp3.yaml | 133 ++++++++++++
 ...p4_gen4tep8_batch8_allconc_eplb0_mtp3.yaml | 133 ++++++++++++
 .../ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml  | 130 ++++++++++++
 .../ctx3dep4_gen1dep32_batch4_eplb0_mtp3.yaml | 130 ++++++++++++
 .../ctx5dep4_gen1dep32_batch8_eplb0_mtp3.yaml | 132 ++++++++++++
 ...ctx7dep4_gen1dep16_batch32_eplb0_mtp2.yaml | 135 ++++++++++++
 ...ctx7dep4_gen1dep8_batch128_eplb0_mtp1.yaml | 147 +++++++++++++
 ...x10dep4_gen1dep16_batch128_eplb0_mtp0.yaml | 141 +++++++++++++
 .../ctx1dep4_gen2tep8_batch32_eplb0_mtp0.yaml | 130 ++++++++++++
 .../ctx1dep4_gen3tep4_batch32_eplb0_mtp0.yaml | 129 ++++++++++++
 ...p4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml | 126 +++++++++++
 ...ctx5dep4_gen1dep32_batch16_eplb0_mtp0.yaml | 127 ++++++++++++
 ...ctx8dep4_gen1dep32_batch32_eplb0_mtp0.yaml | 129 ++++++++++++
 .../ctx1dep2_gen1dep32_batch8_eplb0_mtp3.yaml | 132 ++++++++++++
 ...2_gen4tep8_batch16_allconc_eplb0_mtp3.yaml | 134 ++++++++++++
 .../ctx1dep2_gen4tep8_batch32_eplb0_mtp2.yaml | 136 ++++++++++++
 .../ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml  | 131 ++++++++++++
 ...ctx2dep2_gen1dep16_batch64_eplb0_mtp2.yaml | 139 +++++++++++++
 ...ctx2dep2_gen1dep32_batch16_eplb0_mtp3.yaml | 133 ++++++++++++
 ...ctx3dep2_gen1dep32_batch32_eplb0_mtp2.yaml | 135 ++++++++++++
 ...4dep2_gen1dep16_batch256_eplb256_mtp1.yaml | 166 +++++++++++++++
 ...ctx5dep2_gen2dep8_batch512_eplb0_mtp1.yaml | 195 ++++++++++++++++++
 ...6dep2_gen1dep32_batch128_eplb288_mtp1.yaml | 150 ++++++++++++++
 ...ctx1dep2_gen1dep32_batch16_eplb0_mtp0.yaml | 127 ++++++++++++
 ...2_gen4tep8_batch64_allconc_eplb0_mtp0.yaml | 134 ++++++++++++
 ...p2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml | 125 +++++++++++
 ...ctx2dep2_gen1dep32_batch32_eplb0_mtp0.yaml | 129 ++++++++++++
 ...ctx3dep2_gen1dep32_batch64_eplb0_mtp0.yaml | 133 ++++++++++++
 ...4dep2_gen1dep16_batch512_eplb256_mtp0.yaml | 191 +++++++++++++++++
 ...tx4dep2_gen1dep32_batch128_eplb0_mtp0.yaml | 141 +++++++++++++
 ...6dep2_gen1dep32_batch256_eplb288_mtp0.yaml | 160 ++++++++++++++
 ...tx12dep2_gen1dep16_batch32_eplb0_mtp2.yaml | 135 ++++++++++++
 ...tx13dep2_gen1dep8_batch128_eplb0_mtp1.yaml | 147 +++++++++++++
 ...tx15dep2_gen1dep32_batch16_eplb0_mtp3.yaml | 133 ++++++++++++
 ...tx18dep2_gen1dep16_batch64_eplb0_mtp1.yaml | 139 +++++++++++++
 .../ctx1dep2_gen1tep8_batch16_eplb0_mtp3.yaml | 134 ++++++++++++
 .../ctx1dep2_gen2tep8_batch8_eplb0_mtp3.yaml  | 133 ++++++++++++
 ...p2_gen4tep8_batch4_allconc_eplb0_mtp3.yaml | 132 ++++++++++++
 .../ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml  | 131 ++++++++++++
 .../ctx5dep2_gen1dep32_batch4_eplb0_mtp3.yaml | 131 ++++++++++++
 .../ctx9dep2_gen1dep32_batch8_eplb0_mtp3.yaml | 132 ++++++++++++
 ...tx12dep2_gen1dep16_batch64_eplb0_mtp0.yaml | 133 ++++++++++++
 ...tx15dep2_gen1dep32_batch32_eplb0_mtp0.yaml | 129 ++++++++++++
 .../ctx1dep2_gen2tep8_batch16_eplb0_mtp0.yaml | 127 ++++++++++++
 .../ctx1dep2_gen4tep8_batch1_eplb0_mtp0.yaml  | 126 +++++++++++
 ...p2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml | 125 +++++++++++
 ...x20dep2_gen1dep16_batch128_eplb0_mtp0.yaml | 141 +++++++++++++
 .../ctx2dep2_gen3tep8_batch32_eplb0_mtp0.yaml | 130 ++++++++++++
 .../ctx4dep2_gen3tep8_batch64_eplb0_mtp0.yaml | 134 ++++++++++++
 ...ctx9dep2_gen1dep32_batch16_eplb0_mtp0.yaml | 127 ++++++++++++
 66 files changed, 9122 insertions(+)
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch32_eplb0_mtp2.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch64_eplb0_mtp1.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch16_allconc_eplb0_mtp3.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch16_eplb0_mtp2.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch32_allconc_eplb0_mtp3.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep16_batch256_eplb256_mtp1.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx3dep4_gen1dep32_batch128_eplb288_mtp1.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch32_eplb0_mtp0.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch128_eplb0_mtp0.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch512_eplb256_mtp0.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch64_eplb0_mtp0.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx4dep4_gen1dep32_batch256_eplb288_mtp0.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx10dep4_gen1dep16_batch64_eplb0_mtp1.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen2tep8_batch16_eplb0_mtp3.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch8_allconc_eplb0_mtp3.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx3dep4_gen1dep32_batch4_eplb0_mtp3.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep32_batch8_eplb0_mtp3.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx7dep4_gen1dep16_batch32_eplb0_mtp2.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx7dep4_gen1dep8_batch128_eplb0_mtp1.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx10dep4_gen1dep16_batch128_eplb0_mtp0.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen2tep8_batch32_eplb0_mtp0.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen3tep4_batch32_eplb0_mtp0.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep32_batch16_eplb0_mtp0.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx8dep4_gen1dep32_batch32_eplb0_mtp0.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen1dep32_batch8_eplb0_mtp3.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch16_allconc_eplb0_mtp3.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch32_eplb0_mtp2.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep16_batch64_eplb0_mtp2.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep32_batch16_eplb0_mtp3.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx3dep2_gen1dep32_batch32_eplb0_mtp2.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx4dep2_gen1dep16_batch256_eplb256_mtp1.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx5dep2_gen2dep8_batch512_eplb0_mtp1.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx6dep2_gen1dep32_batch128_eplb288_mtp1.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen1dep32_batch16_eplb0_mtp0.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen4tep8_batch64_allconc_eplb0_mtp0.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx2dep2_gen1dep32_batch32_eplb0_mtp0.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx3dep2_gen1dep32_batch64_eplb0_mtp0.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep16_batch512_eplb256_mtp0.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep32_batch128_eplb0_mtp0.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx6dep2_gen1dep32_batch256_eplb288_mtp0.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx12dep2_gen1dep16_batch32_eplb0_mtp2.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx13dep2_gen1dep8_batch128_eplb0_mtp1.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx15dep2_gen1dep32_batch16_eplb0_mtp3.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx18dep2_gen1dep16_batch64_eplb0_mtp1.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen1tep8_batch16_eplb0_mtp3.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen2tep8_batch8_eplb0_mtp3.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen4tep8_batch4_allconc_eplb0_mtp3.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx5dep2_gen1dep32_batch4_eplb0_mtp3.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx9dep2_gen1dep32_batch8_eplb0_mtp3.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx12dep2_gen1dep16_batch64_eplb0_mtp0.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx15dep2_gen1dep32_batch32_eplb0_mtp0.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen2tep8_batch16_eplb0_mtp0.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen4tep8_batch1_eplb0_mtp0.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx20dep2_gen1dep16_batch128_eplb0_mtp0.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx2dep2_gen3tep8_batch32_eplb0_mtp0.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx4dep2_gen3tep8_batch64_eplb0_mtp0.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx9dep2_gen1dep32_batch16_eplb0_mtp0.yaml

diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch32_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch32_eplb0_mtp2.yaml
new file mode 100644
index 00000000..21edc148
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch32_eplb0_mtp2.yaml
@@ -0,0 +1,135 @@
+name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep16_batch32_eplb0_mtp2"
+
+# ctx: 1 prefill worker, TP4/EP4
+# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=32
+# concurrency: 666
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4  # equals prefill tensor_parallel_size (4) below
+
+  decode_workers: 1
+  decode_nodes: 4  # 4 nodes x 4 GPUs per node = 16 GPUs
+  gpus_per_decode: 16  # equals decode tensor_parallel_size (16) below
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 2
+
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 32
+      max_num_tokens: 96
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.7
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 2
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "666"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"  # forwarded to the sa-bench client as --custom-tokenizer
+  use_chat_template: false  # bench.sh then omits --use-chat-template
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch64_eplb0_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch64_eplb0_mtp1.yaml
new file mode 100644
index 00000000..ebcd45d1
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch64_eplb0_mtp1.yaml
@@ -0,0 +1,139 @@
+name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep16_batch64_eplb0_mtp1"
+
+# ctx: 1 prefill worker, TP4/EP4
+# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=64
+# concurrency: 1229
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+
+  decode_workers: 1
+  decode_nodes: 4
+  gpus_per_decode: 16
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 1
+
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 64
+      max_num_tokens: 128
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.7
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 1
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "1229"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch16_allconc_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch16_allconc_eplb0_mtp3.yaml
new file mode 100644
index 00000000..68af65ee
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch16_allconc_eplb0_mtp3.yaml
@@ -0,0 +1,133 @@
+name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep32_batch16_allconc_eplb0_mtp3"
+
+# ctx: 1 prefill worker, TP4/EP4
+# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=16
+# concurrencies: 333 (batch8), 666 (batch16)
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+
+  decode_workers: 1
+  decode_nodes: 8
+  gpus_per_decode: 32
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+    decode:
+      tensor_parallel_size: 32
+      moe_expert_parallel_size: 32
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 64
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "333x666"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch16_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch16_eplb0_mtp2.yaml
new file mode 100644
index 00000000..d6d3dcf1
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch16_eplb0_mtp2.yaml
@@ -0,0 +1,134 @@
+name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen4tep8_batch16_eplb0_mtp2"
+
+# ctx: 1 prefill worker, TP4/EP4
+# gen: 4 decode workers, TP8/EP8, enable_attention_dp=false, max_batch=16
+# concurrency: 96
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+
+  decode_workers: 4
+  decode_nodes: 8
+  gpus_per_decode: 8
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 2
+
+    decode:
+      allreduce_strategy: MNNVL
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 48
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 2
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "96"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch32_allconc_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch32_allconc_eplb0_mtp3.yaml
new file mode 100644
index 00000000..da187faf
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch32_allconc_eplb0_mtp3.yaml
@@ -0,0 +1,136 @@
+name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen4tep8_batch32_allconc_eplb0_mtp3"
+
+# ctx: 1 prefill worker, TP4/EP4
+# gen: 4 decode workers, TP8/EP8, enable_attention_dp=false, max_batch=32
+# concurrencies: 8 (batch1), 44 (batch8), 192 (batch32)
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+
+  decode_workers: 4
+  decode_nodes: 8
+  gpus_per_decode: 8
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+    decode:
+      allreduce_strategy: MNNVL
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 32
+      max_num_tokens: 128
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "8x44x192"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml
new file mode 100644
index 00000000..a6121cd0
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml
@@ -0,0 +1,131 @@
+name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen5tep4_batch1_eplb0_mtp3"
+
+# ctx: 1 prefill worker, TP4/EP4
+# gen: 5 decode workers, TP4/EP4, enable_attention_dp=false, max_batch=1
+# concurrency: 10
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+
+  decode_workers: 5
+  decode_nodes: 5
+  gpus_per_decode: 4
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+    decode:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 1
+      max_num_tokens: 4
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.85
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "10"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep16_batch256_eplb256_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep16_batch256_eplb256_mtp1.yaml
new file mode 100644
index 00000000..dc176b2d
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep16_batch256_eplb256_mtp1.yaml
@@ -0,0 +1,167 @@
+name: "glm5_nvfp4_ISL1K_OSL1K_ctx2dep4_gen1dep16_batch256_eplb256_mtp1"
+
+# ctx: 2 prefill workers, TP4/EP4
+# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=256
+# EPLB: num_slots=256
+# concurrency: 4301
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 2
+  prefill_workers: 2
+  gpus_per_prefill: 4
+
+  decode_workers: 1
+  decode_nodes: 4
+  gpus_per_decode: 16
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 1
+
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 256
+      max_num_tokens: 512
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+          - 136
+          - 144
+          - 152
+          - 160
+          - 168
+          - 176
+          - 184
+          - 192
+          - 200
+          - 208
+          - 216
+          - 224
+          - 232
+          - 240
+          - 248
+          - 256
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+        load_balancer:
+          layer_updates_per_iter: 1
+          num_slots: 256
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.7
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 1
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "4301"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx3dep4_gen1dep32_batch128_eplb288_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx3dep4_gen1dep32_batch128_eplb288_mtp1.yaml
new file mode 100644
index 00000000..a7a1c790
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx3dep4_gen1dep32_batch128_eplb288_mtp1.yaml
@@ -0,0 +1,151 @@
+name: "glm5_nvfp4_ISL1K_OSL1K_ctx3dep4_gen1dep32_batch128_eplb288_mtp1"
+
+# ctx: 3 prefill workers, TP4/EP4
+# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=128
+# EPLB: num_slots=288
+# concurrency: 4301
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 3
+  prefill_workers: 3
+  gpus_per_prefill: 4
+
+  decode_workers: 1
+  decode_nodes: 8
+  gpus_per_decode: 32
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 1
+
+    decode:
+      tensor_parallel_size: 32
+      moe_expert_parallel_size: 32
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 128
+      max_num_tokens: 256
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+        load_balancer:
+          layer_updates_per_iter: 1
+          num_slots: 288
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 1
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "4301"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch32_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch32_eplb0_mtp0.yaml
new file mode 100644
index 00000000..7412a109
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch32_eplb0_mtp0.yaml
@@ -0,0 +1,129 @@
+name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep32_batch32_eplb0_mtp0"
+
+# ctx: 1 prefill worker, TP4/EP4
+# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=32
+# concurrency: 1229
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+
+  decode_workers: 1
+  decode_nodes: 8
+  gpus_per_decode: 32
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 32
+      moe_expert_parallel_size: 32
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 32
+      max_num_tokens: 32
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.7
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "1229"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml
new file mode 100644
index 00000000..e969c07d
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml
@@ -0,0 +1,142 @@
+name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0"
+
+# ctx: 1 prefill worker, TP4/EP4
+# gen: 4 decode workers, TP8/EP8, enable_attention_dp=false, max_batch=128
+# Merged concurrencies: batch1(4), batch32(180), batch64(360), batch128(616)
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+
+  decode_workers: 4
+  decode_nodes: 8
+  gpus_per_decode: 8
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      allreduce_strategy: MNNVL
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 128
+      max_num_tokens: 128
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "4x180x360x616"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml
new file mode 100644
index 00000000..fb583747
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml
@@ -0,0 +1,126 @@
+name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0"
+
+# ctx: 1 prefill worker, TP4/EP4
+# gen: 5 decode workers, TP4/EP4, enable_attention_dp=false, max_batch=8
+# Merged concurrencies: batch1(5), batch2(15), batch4(30), batch8(50)
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+
+  decode_workers: 5
+  decode_nodes: 5
+  gpus_per_decode: 4
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 8
+      max_num_tokens: 8
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "5x15x30x50"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch128_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch128_eplb0_mtp0.yaml
new file mode 100644
index 00000000..e057ce05
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch128_eplb0_mtp0.yaml
@@ -0,0 +1,141 @@
+name: "glm5_nvfp4_ISL1K_OSL1K_ctx2dep4_gen1dep16_batch128_eplb0_mtp0"
+
+# ctx: 2 prefill workers, TP4/EP4
+# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=128
+# concurrency: 2253
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 2
+  prefill_workers: 2
+  gpus_per_prefill: 4
+
+  decode_workers: 1
+  decode_nodes: 4
+  gpus_per_decode: 16
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 128
+      max_num_tokens: 128
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.75
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "2253"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch512_eplb256_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch512_eplb256_mtp0.yaml
new file mode 100644
index 00000000..d221dde2
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch512_eplb256_mtp0.yaml
@@ -0,0 +1,193 @@
+name: "glm5_nvfp4_ISL1K_OSL1K_ctx2dep4_gen1dep16_batch512_eplb256_mtp0"
+
+# ctx: 2 prefill workers, TP4/EP4
+# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=512
+# EPLB: num_slots=256
+# concurrency: 8192
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 2
+  prefill_workers: 2
+  gpus_per_prefill: 4
+
+  decode_workers: 1
+  decode_nodes: 4
+  gpus_per_decode: 16
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 512
+      max_num_tokens: 512
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+          - 136
+          - 144
+          - 152
+          - 160
+          - 168
+          - 176
+          - 184
+          - 192
+          - 200
+          - 208
+          - 216
+          - 224
+          - 232
+          - 240
+          - 248
+          - 256
+          - 264
+          - 272
+          - 280
+          - 288
+          - 296
+          - 304
+          - 312
+          - 320
+          - 328
+          - 336
+          - 344
+          - 352
+          - 360
+          - 368
+          - 376
+          - 384
+          - 392
+          - 400
+          - 408
+          - 416
+          - 424
+          - 432
+          - 440
+          - 448
+          - 456
+          - 464
+          - 472
+          - 480
+          - 488
+          - 496
+          - 504
+          - 512
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+        load_balancer:
+          layer_updates_per_iter: 1
+          num_slots: 256
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.75
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "8192"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch64_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch64_eplb0_mtp0.yaml
new file mode 100644
index 00000000..bbad79c1
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch64_eplb0_mtp0.yaml
@@ -0,0 +1,133 @@
+name: "glm5_nvfp4_ISL1K_OSL1K_ctx2dep4_gen1dep32_batch64_eplb0_mtp0"
+
+# ctx: 2 prefill workers, TP4/EP4
+# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=64
+# concurrency: 2253
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 2
+  prefill_workers: 2
+  gpus_per_prefill: 4
+
+  decode_workers: 1
+  decode_nodes: 8
+  gpus_per_decode: 32
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 32
+      moe_expert_parallel_size: 32
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 64
+      max_num_tokens: 64
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.7
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "2253"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx4dep4_gen1dep32_batch256_eplb288_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx4dep4_gen1dep32_batch256_eplb288_mtp0.yaml
new file mode 100644
index 00000000..26d2d29e
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx4dep4_gen1dep32_batch256_eplb288_mtp0.yaml
@@ -0,0 +1,161 @@
+name: "glm5_nvfp4_ISL1K_OSL1K_ctx4dep4_gen1dep32_batch256_eplb288_mtp0"
+
+# ctx: 4 prefill workers, TP4/EP4
+# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=256
+# EPLB: num_slots=288
+# concurrency: 8192
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 4
+  prefill_workers: 4
+  gpus_per_prefill: 4
+
+  decode_workers: 1
+  decode_nodes: 8
+  gpus_per_decode: 32
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 32
+      moe_expert_parallel_size: 32
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 256
+      max_num_tokens: 256
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+          - 136
+          - 144
+          - 152
+          - 160
+          - 168
+          - 176
+          - 184
+          - 192
+          - 200
+          - 208
+          - 216
+          - 224
+          - 232
+          - 240
+          - 248
+          - 256
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+        load_balancer:
+          layer_updates_per_iter: 1
+          num_slots: 288
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.7
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "8192"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx10dep4_gen1dep16_batch64_eplb0_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx10dep4_gen1dep16_batch64_eplb0_mtp1.yaml
new file mode 100644
index 00000000..420192c2
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx10dep4_gen1dep16_batch64_eplb0_mtp1.yaml
@@ -0,0 +1,139 @@
+name: "glm5_nvfp4_ISL8K_OSL1K_ctx10dep4_gen1dep16_batch64_eplb0_mtp1"
+
+# ctx: 10 prefill workers, TP4/EP4
+# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=64
+# concurrency: 1229
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 10
+  prefill_workers: 10
+  gpus_per_prefill: 4
+
+  decode_workers: 1
+  decode_nodes: 4
+  gpus_per_decode: 16
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 1
+
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 64
+      max_num_tokens: 128
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.7
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 1
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "1229"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen2tep8_batch16_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen2tep8_batch16_eplb0_mtp3.yaml
new file mode 100644
index 00000000..da3186e5
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen2tep8_batch16_eplb0_mtp3.yaml
@@ -0,0 +1,133 @@
+name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen2tep8_batch16_eplb0_mtp3"
+
+# ctx: 1 prefill worker, TP4/EP4
+# gen: 2 decode workers, TP8/EP8, enable_attention_dp=false, max_batch=16; concurrency: 46
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+
+  decode_workers: 2
+  decode_nodes: 4
+  gpus_per_decode: 8
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+    decode:
+      allreduce_strategy: MNNVL
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 64
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "46"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch8_allconc_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch8_allconc_eplb0_mtp3.yaml
new file mode 100644
index 00000000..fb94a549
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch8_allconc_eplb0_mtp3.yaml
@@ -0,0 +1,133 @@
+name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen4tep8_batch8_allconc_eplb0_mtp3"
+
+# ctx: 1 prefill worker, TP4/EP4
+# gen: 4 decode workers, TP8/EP8, enable_attention_dp=false, max_batch=8
+# concurrencies: 4 (batch1), 48 (batch8); written as "4x48" in benchmark.concurrencies
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+
+  decode_workers: 4
+  decode_nodes: 8
+  gpus_per_decode: 8
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+    decode:
+      allreduce_strategy: MNNVL
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 8
+      max_num_tokens: 32
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "4x48"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml
new file mode 100644
index 00000000..0a13cce4
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml
@@ -0,0 +1,130 @@
+name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen5tep4_batch1_eplb0_mtp3"
+
+# ctx: 1 prefill worker, TP4/EP4
+# gen: 5 decode workers, TP4/EP4, enable_attention_dp=false, max_batch=1, concurrency: 5
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+
+  decode_workers: 5
+  decode_nodes: 5
+  gpus_per_decode: 4
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+    decode:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 1
+      max_num_tokens: 4
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.85
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "5"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx3dep4_gen1dep32_batch4_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx3dep4_gen1dep32_batch4_eplb0_mtp3.yaml
new file mode 100644
index 00000000..440a4f73
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx3dep4_gen1dep32_batch4_eplb0_mtp3.yaml
@@ -0,0 +1,130 @@
+name: "glm5_nvfp4_ISL8K_OSL1K_ctx3dep4_gen1dep32_batch4_eplb0_mtp3"
+
+# ctx: 3 prefill workers, TP4/EP4
+# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=4, concurrency: 167
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 3
+  prefill_workers: 3
+  gpus_per_prefill: 4
+
+  decode_workers: 1
+  decode_nodes: 8
+  gpus_per_decode: 32
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+    decode:
+      tensor_parallel_size: 32
+      moe_expert_parallel_size: 32
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 4
+      max_num_tokens: 16
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "167"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep32_batch8_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep32_batch8_eplb0_mtp3.yaml
new file mode 100644
index 00000000..492f1b4c
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep32_batch8_eplb0_mtp3.yaml
@@ -0,0 +1,132 @@
+name: "glm5_nvfp4_ISL8K_OSL1K_ctx5dep4_gen1dep32_batch8_eplb0_mtp3"
+
+# ctx: 5 prefill workers, TP4/EP4
+# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=8
+# concurrency: 333
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 5
+  prefill_workers: 5
+  gpus_per_prefill: 4
+
+  decode_workers: 1
+  decode_nodes: 8
+  gpus_per_decode: 32
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+    decode:
+      tensor_parallel_size: 32
+      moe_expert_parallel_size: 32
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 8
+      max_num_tokens: 32
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "333"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx7dep4_gen1dep16_batch32_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx7dep4_gen1dep16_batch32_eplb0_mtp2.yaml
new file mode 100644
index 00000000..d22fbcf1
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx7dep4_gen1dep16_batch32_eplb0_mtp2.yaml
@@ -0,0 +1,135 @@
+name: "glm5_nvfp4_ISL8K_OSL1K_ctx7dep4_gen1dep16_batch32_eplb0_mtp2"
+
+# ctx: 7 prefill workers, TP4/EP4
+# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=32
+# concurrency: 615
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 7
+  prefill_workers: 7
+  gpus_per_prefill: 4
+
+  decode_workers: 1
+  decode_nodes: 4
+  gpus_per_decode: 16
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 2
+
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 32
+      max_num_tokens: 96
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.7
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 2
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "615"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx7dep4_gen1dep8_batch128_eplb0_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx7dep4_gen1dep8_batch128_eplb0_mtp1.yaml
new file mode 100644
index 00000000..804e89b5
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx7dep4_gen1dep8_batch128_eplb0_mtp1.yaml
@@ -0,0 +1,147 @@
+name: "glm5_nvfp4_ISL8K_OSL1K_ctx7dep4_gen1dep8_batch128_eplb0_mtp1"
+
+# ctx: 7 prefill workers, TP4/EP4
+# gen: 1 decode worker, TP8/EP8, enable_attention_dp=true, max_batch=128
+# concurrency: 1076
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 7
+  prefill_workers: 7
+  gpus_per_prefill: 4
+
+  decode_workers: 1
+  decode_nodes: 2
+  gpus_per_decode: 8
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 1
+
+    decode:
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 128
+      max_num_tokens: 256
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 1
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "1076"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx10dep4_gen1dep16_batch128_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx10dep4_gen1dep16_batch128_eplb0_mtp0.yaml
new file mode 100644
index 00000000..0fa8566d
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx10dep4_gen1dep16_batch128_eplb0_mtp0.yaml
@@ -0,0 +1,141 @@
+name: "glm5_nvfp4_ISL8K_OSL1K_ctx10dep4_gen1dep16_batch128_eplb0_mtp0"
+
+# ctx: 10 prefill workers, TP4/EP4
+# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=128
+# concurrency: 2253
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 10
+  prefill_workers: 10
+  gpus_per_prefill: 4
+
+  decode_workers: 1
+  decode_nodes: 4
+  gpus_per_decode: 16
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 128
+      max_num_tokens: 128
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "2253"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen2tep8_batch32_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen2tep8_batch32_eplb0_mtp0.yaml
new file mode 100644
index 00000000..478f6203
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen2tep8_batch32_eplb0_mtp0.yaml
@@ -0,0 +1,130 @@
+name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen2tep8_batch32_eplb0_mtp0"
+
+# ctx: 1 prefill worker, TP4/EP4
+# gen: 2 decode workers, TP8/EP8, enable_attention_dp=false, max_batch=32
+# concurrency: 84
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+
+  decode_workers: 2
+  decode_nodes: 4
+  gpus_per_decode: 8
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      allreduce_strategy: MNNVL
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 32
+      max_num_tokens: 32
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "84"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen3tep4_batch32_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen3tep4_batch32_eplb0_mtp0.yaml
new file mode 100644
index 00000000..462401b6
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen3tep4_batch32_eplb0_mtp0.yaml
@@ -0,0 +1,129 @@
+name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen3tep4_batch32_eplb0_mtp0"
+
+# ctx: 1 prefill worker, TP4/EP4
+# gen: 3 decode workers, TP4/EP4, enable_attention_dp=false, max_batch=32
+# concurrency: 117
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+
+  decode_workers: 3
+  decode_nodes: 3
+  gpus_per_decode: 4
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 32
+      max_num_tokens: 32
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "117"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml
new file mode 100644
index 00000000..90e62af3
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml
@@ -0,0 +1,126 @@
+name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0"
+
+# ctx: 1 prefill worker, TP4/EP4
+# gen: 5 decode workers, TP4/EP4, enable_attention_dp=false, max_batch=8
+# Merged concurrencies: 5 (batch1), 10 (batch2), 25 (batch4), 50 (batch8)
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+
+  decode_workers: 5
+  decode_nodes: 5
+  gpus_per_decode: 4
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 8
+      max_num_tokens: 8
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "5x10x25x50"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep32_batch16_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep32_batch16_eplb0_mtp0.yaml
new file mode 100644
index 00000000..7a6ece31
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep32_batch16_eplb0_mtp0.yaml
@@ -0,0 +1,127 @@
+name: "glm5_nvfp4_ISL8K_OSL1K_ctx5dep4_gen1dep32_batch16_eplb0_mtp0"
+
+# ctx: 5 prefill workers, TP4/EP4
+# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=16
+# concurrency: 615
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 5
+  prefill_workers: 5
+  gpus_per_prefill: 4
+
+  decode_workers: 1
+  decode_nodes: 8
+  gpus_per_decode: 32
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 32
+      moe_expert_parallel_size: 32
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.75
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "615"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx8dep4_gen1dep32_batch32_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx8dep4_gen1dep32_batch32_eplb0_mtp0.yaml
new file mode 100644
index 00000000..7e34b6d9
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx8dep4_gen1dep32_batch32_eplb0_mtp0.yaml
@@ -0,0 +1,129 @@
+name: "glm5_nvfp4_ISL8K_OSL1K_ctx8dep4_gen1dep32_batch32_eplb0_mtp0"
+
+# ctx: 8 prefill workers, TP4/EP4
+# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=32
+# concurrency: 1229
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb200"
+
+  prefill_nodes: 8
+  prefill_workers: 8
+  gpus_per_prefill: 4
+
+  decode_workers: 1
+  decode_nodes: 8
+  gpus_per_decode: 32
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 32
+      moe_expert_parallel_size: 32
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 32
+      max_num_tokens: 32
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.75
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "1229"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen1dep32_batch8_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen1dep32_batch8_eplb0_mtp3.yaml
new file mode 100644
index 00000000..80aacc6a
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen1dep32_batch8_eplb0_mtp3.yaml
@@ -0,0 +1,132 @@
+name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep2_gen1dep32_batch8_eplb0_mtp3"
+
+# ctx: 1 prefill worker, TP2/EP2
+# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=8
+# concurrency: 333
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 2
+
+  decode_workers: 1
+  decode_nodes: 8
+  gpus_per_decode: 32
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+    decode:
+      tensor_parallel_size: 32
+      moe_expert_parallel_size: 32
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 8
+      max_num_tokens: 32
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "333"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch16_allconc_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch16_allconc_eplb0_mtp3.yaml
new file mode 100644
index 00000000..648ec949
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch16_allconc_eplb0_mtp3.yaml
@@ -0,0 +1,134 @@
+name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep2_gen4tep8_batch16_allconc_eplb0_mtp3"
+
+# ctx: 1 prefill worker, TP2/EP2
+# gen: 4 decode workers, TP8/EP8, enable_attention_dp=false, max_batch=16
+# concurrencies: 24 (batch4), 44 (batch8), 92 (batch16)
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 2
+
+  decode_workers: 4
+  decode_nodes: 8
+  gpus_per_decode: 8
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+    decode:
+      allreduce_strategy: MNNVL
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 64
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "24x44x92"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch32_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch32_eplb0_mtp2.yaml
new file mode 100644
index 00000000..823624ac
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch32_eplb0_mtp2.yaml
@@ -0,0 +1,136 @@
+name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep2_gen4tep8_batch32_eplb0_mtp2"
+
+# ctx: 1 prefill worker, TP2/EP2
+# gen: 4 decode workers, TP8/EP8, enable_attention_dp=false, max_batch=32
+# concurrency: 180
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 2
+
+  decode_workers: 4
+  decode_nodes: 8
+  gpus_per_decode: 8
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 2
+
+    decode:
+      allreduce_strategy: MNNVL
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 32
+      max_num_tokens: 96
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 2
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "180"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml
new file mode 100644
index 00000000..64b61b9f
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml
@@ -0,0 +1,131 @@
+name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep2_gen5tep4_batch1_eplb0_mtp3"
+
+# ctx: 1 prefill worker, TP2/EP2
+# gen: 5 decode workers, TP4/EP4, enable_attention_dp=false, max_batch=1
+# concurrency: 10
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 2
+
+  decode_workers: 5
+  decode_nodes: 5
+  gpus_per_decode: 4
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+    decode:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 1
+      max_num_tokens: 4
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.85
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "10"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep16_batch64_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep16_batch64_eplb0_mtp2.yaml
new file mode 100644
index 00000000..66d211aa
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep16_batch64_eplb0_mtp2.yaml
@@ -0,0 +1,139 @@
+name: "glm5_nvfp4_ISL1K_OSL1K_ctx2dep2_gen1dep16_batch64_eplb0_mtp2"
+
+# ctx: 2 prefill workers, TP2/EP2
+# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=64
+# concurrency: 1229
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 1
+  prefill_workers: 2
+  gpus_per_prefill: 2
+
+  decode_workers: 1
+  decode_nodes: 4
+  gpus_per_decode: 16
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 2
+
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 64
+      max_num_tokens: 192
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.7
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 2
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "1229"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep32_batch16_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep32_batch16_eplb0_mtp3.yaml
new file mode 100644
index 00000000..fe754372
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep32_batch16_eplb0_mtp3.yaml
@@ -0,0 +1,133 @@
+name: "glm5_nvfp4_ISL1K_OSL1K_ctx2dep2_gen1dep32_batch16_eplb0_mtp3"
+
+# ctx: 2 prefill workers, TP2/EP2
+# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=16
+# concurrency: 666
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 1
+  prefill_workers: 2
+  gpus_per_prefill: 2
+
+  decode_workers: 1
+  decode_nodes: 8
+  gpus_per_decode: 32
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+    decode:
+      tensor_parallel_size: 32
+      moe_expert_parallel_size: 32
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 64
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "666"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx3dep2_gen1dep32_batch32_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx3dep2_gen1dep32_batch32_eplb0_mtp2.yaml
new file mode 100644
index 00000000..70821f3e
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx3dep2_gen1dep32_batch32_eplb0_mtp2.yaml
@@ -0,0 +1,135 @@
+name: "glm5_nvfp4_ISL1K_OSL1K_ctx3dep2_gen1dep32_batch32_eplb0_mtp2"
+
+# ctx: 3 prefill workers, TP2/EP2
+# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=32
+# concurrency: 1229
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 2
+  prefill_workers: 3
+  gpus_per_prefill: 2
+
+  decode_workers: 1
+  decode_nodes: 8
+  gpus_per_decode: 32
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 2
+
+    decode:
+      tensor_parallel_size: 32
+      moe_expert_parallel_size: 32
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 32
+      max_num_tokens: 96
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 2
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "1229"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx4dep2_gen1dep16_batch256_eplb256_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx4dep2_gen1dep16_batch256_eplb256_mtp1.yaml
new file mode 100644
index 00000000..bf3183b7
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx4dep2_gen1dep16_batch256_eplb256_mtp1.yaml
@@ -0,0 +1,166 @@
+name: "glm5_nvfp4_ISL1K_OSL1K_ctx4dep2_gen1dep16_batch256_eplb256_mtp1"
+
+# ctx: 4 prefill workers, TP2/EP2, max_batch=16
+# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=256, EPLB: num_slots=256
+# concurrency: 4301
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 2
+  prefill_workers: 4
+  gpus_per_prefill: 2
+
+  decode_workers: 1
+  decode_nodes: 4
+  gpus_per_decode: 16
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 1
+
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 256
+      max_num_tokens: 512
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+          - 136
+          - 144
+          - 152
+          - 160
+          - 168
+          - 176
+          - 184
+          - 192
+          - 200
+          - 208
+          - 216
+          - 224
+          - 232
+          - 240
+          - 248
+          - 256
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+        load_balancer:
+          layer_updates_per_iter: 1
+          num_slots: 256
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.7
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 1
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "4301"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx5dep2_gen2dep8_batch512_eplb0_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx5dep2_gen2dep8_batch512_eplb0_mtp1.yaml
new file mode 100644
index 00000000..1d9f4f10
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx5dep2_gen2dep8_batch512_eplb0_mtp1.yaml
@@ -0,0 +1,195 @@
+name: "glm5_nvfp4_ISL1K_OSL1K_ctx5dep2_gen2dep8_batch512_eplb0_mtp1"
+
+# ctx: 5 prefill workers, TP2/EP2
+# gen: 2 decode workers, TP8/EP8, enable_attention_dp=true, max_batch=512
+# concurrency: 8602
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 3
+  prefill_workers: 5
+  gpus_per_prefill: 2
+
+  decode_workers: 2
+  decode_nodes: 4
+  gpus_per_decode: 8
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 1
+
+    decode:
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 512
+      max_num_tokens: 1024
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+          - 136
+          - 144
+          - 152
+          - 160
+          - 168
+          - 176
+          - 184
+          - 192
+          - 200
+          - 208
+          - 216
+          - 224
+          - 232
+          - 240
+          - 248
+          - 256
+          - 264
+          - 272
+          - 280
+          - 288
+          - 296
+          - 304
+          - 312
+          - 320
+          - 328
+          - 336
+          - 344
+          - 352
+          - 360
+          - 368
+          - 376
+          - 384
+          - 392
+          - 400
+          - 408
+          - 416
+          - 424
+          - 432
+          - 440
+          - 448
+          - 456
+          - 464
+          - 472
+          - 480
+          - 488
+          - 496
+          - 504
+          - 512
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 1
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "8602"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx6dep2_gen1dep32_batch128_eplb288_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx6dep2_gen1dep32_batch128_eplb288_mtp1.yaml
new file mode 100644
index 00000000..44b81b3c
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx6dep2_gen1dep32_batch128_eplb288_mtp1.yaml
@@ -0,0 +1,150 @@
+name: "glm5_nvfp4_ISL1K_OSL1K_ctx6dep2_gen1dep32_batch128_eplb288_mtp1"
+
+# ctx: 6 prefill workers, TP2/EP2, max_batch=16
+# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=128, EPLB: num_slots=288
+# concurrency: 4301
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 3
+  prefill_workers: 6
+  gpus_per_prefill: 2
+
+  decode_workers: 1
+  decode_nodes: 8
+  gpus_per_decode: 32
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 1
+
+    decode:
+      tensor_parallel_size: 32
+      moe_expert_parallel_size: 32
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 128
+      max_num_tokens: 256
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+        load_balancer:
+          layer_updates_per_iter: 1
+          num_slots: 288
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 1
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "4301"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen1dep32_batch16_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen1dep32_batch16_eplb0_mtp0.yaml
new file mode 100644
index 00000000..0410623b
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen1dep32_batch16_eplb0_mtp0.yaml
@@ -0,0 +1,127 @@
+name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep2_gen1dep32_batch16_eplb0_mtp0"
+
+# ctx: 1 prefill worker, TP2/EP2
+# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=16
+# concurrency: 615
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 2
+
+  decode_workers: 1
+  decode_nodes: 8
+  gpus_per_decode: 32
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 32
+      moe_expert_parallel_size: 32
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.7
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "615"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen4tep8_batch64_allconc_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen4tep8_batch64_allconc_eplb0_mtp0.yaml
new file mode 100644
index 00000000..d967e3b2
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen4tep8_batch64_allconc_eplb0_mtp0.yaml
@@ -0,0 +1,134 @@
+name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep2_gen4tep8_batch64_allconc_eplb0_mtp0"
+
+# ctx: 1 prefill worker, TP2/EP2
+# gen: 4 decode workers, TP8/EP8, enable_attention_dp=false, max_batch=64
+# Merged concurrencies: batch16(84), batch32(180), batch64(336)
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 2
+
+  decode_workers: 4
+  decode_nodes: 8
+  gpus_per_decode: 8
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      allreduce_strategy: MNNVL
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 64
+      max_num_tokens: 64
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "84x180x336"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml
new file mode 100644
index 00000000..d9f9ea2f
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml
@@ -0,0 +1,125 @@
+name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0"
+
+# ctx: 1 prefill worker, TP2/EP2
+# gen: 5 decode workers, TP4/EP4, enable_attention_dp=false, max_batch=4
+# Merged concurrencies: batch1(5), batch2(10), batch4(25)
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 2
+
+  decode_workers: 5
+  decode_nodes: 5
+  gpus_per_decode: 4
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 4
+      max_num_tokens: 4
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "5x10x25"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx2dep2_gen1dep32_batch32_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx2dep2_gen1dep32_batch32_eplb0_mtp0.yaml
new file mode 100644
index 00000000..26ddd7b1
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx2dep2_gen1dep32_batch32_eplb0_mtp0.yaml
@@ -0,0 +1,129 @@
+name: "glm5_nvfp4_ISL1K_OSL1K_ctx2dep2_gen1dep32_batch32_eplb0_mtp0"
+
+# ctx: 2 prefill workers, TP2/EP2, max_batch=16
+# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=32
+# concurrency: 1229
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 1
+  prefill_workers: 2
+  gpus_per_prefill: 2
+
+  decode_workers: 1
+  decode_nodes: 8
+  gpus_per_decode: 32
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 32
+      moe_expert_parallel_size: 32
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 32
+      max_num_tokens: 32
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.7
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "1229"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx3dep2_gen1dep32_batch64_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx3dep2_gen1dep32_batch64_eplb0_mtp0.yaml
new file mode 100644
index 00000000..081e96da
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx3dep2_gen1dep32_batch64_eplb0_mtp0.yaml
@@ -0,0 +1,133 @@
+name: "glm5_nvfp4_ISL1K_OSL1K_ctx3dep2_gen1dep32_batch64_eplb0_mtp0"
+
+# ctx: 3 prefill workers, TP2/EP2, max_batch=16
+# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=64
+# concurrency: 2253
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 2
+  prefill_workers: 3
+  gpus_per_prefill: 2
+
+  decode_workers: 1
+  decode_nodes: 8
+  gpus_per_decode: 32
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 32
+      moe_expert_parallel_size: 32
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 64
+      max_num_tokens: 64
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.7
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "2253"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep16_batch512_eplb256_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep16_batch512_eplb256_mtp0.yaml
new file mode 100644
index 00000000..dbca4fd5
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep16_batch512_eplb256_mtp0.yaml
@@ -0,0 +1,191 @@
+name: "glm5_nvfp4_ISL1K_OSL1K_ctx4dep2_gen1dep16_batch512_eplb256_mtp0"
+
+# ctx: 4 prefill workers, TP2/EP2
+# gen: 1 decode worker, TP16/EP16, EPLB: num_slots=256, max_batch=512, concurrency: 8192
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+  
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 2
+  prefill_workers: 4
+  gpus_per_prefill: 2
+
+  decode_workers: 1
+  decode_nodes: 4
+  gpus_per_decode: 16
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 512
+      max_num_tokens: 512
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+          - 136
+          - 144
+          - 152
+          - 160
+          - 168
+          - 176
+          - 184
+          - 192
+          - 200
+          - 208
+          - 216
+          - 224
+          - 232
+          - 240
+          - 248
+          - 256
+          - 264
+          - 272
+          - 280
+          - 288
+          - 296
+          - 304
+          - 312
+          - 320
+          - 328
+          - 336
+          - 344
+          - 352
+          - 360
+          - 368
+          - 376
+          - 384
+          - 392
+          - 400
+          - 408
+          - 416
+          - 424
+          - 432
+          - 440
+          - 448
+          - 456
+          - 464
+          - 472
+          - 480
+          - 488
+          - 496
+          - 504
+          - 512
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+        load_balancer:
+          layer_updates_per_iter: 1
+          num_slots: 256
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.75
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "8192"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep32_batch128_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep32_batch128_eplb0_mtp0.yaml
new file mode 100644
index 00000000..1c8d2d78
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep32_batch128_eplb0_mtp0.yaml
@@ -0,0 +1,141 @@
+name: "glm5_nvfp4_ISL1K_OSL1K_ctx4dep2_gen1dep32_batch128_eplb0_mtp0"
+
+# ctx: 4 prefill workers, TP2/EP2, max_batch=16
+# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=128
+# concurrency: 4301
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 2
+  prefill_workers: 4
+  gpus_per_prefill: 2
+
+  decode_workers: 1
+  decode_nodes: 8
+  gpus_per_decode: 32
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 32
+      moe_expert_parallel_size: 32
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 128
+      max_num_tokens: 128
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.7
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "4301"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx6dep2_gen1dep32_batch256_eplb288_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx6dep2_gen1dep32_batch256_eplb288_mtp0.yaml
new file mode 100644
index 00000000..0d6870ff
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx6dep2_gen1dep32_batch256_eplb288_mtp0.yaml
@@ -0,0 +1,160 @@
+name: "glm5_nvfp4_ISL1K_OSL1K_ctx6dep2_gen1dep32_batch256_eplb288_mtp0"
+
+# ctx: 6 prefill workers, TP2/EP2, max_batch=16
+# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, EPLB: num_slots=288, max_batch=256
+# concurrency: 8192
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 3
+  prefill_workers: 6
+  gpus_per_prefill: 2
+
+  decode_workers: 1
+  decode_nodes: 8
+  gpus_per_decode: 32
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 32
+      moe_expert_parallel_size: 32
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 256
+      max_num_tokens: 256
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+          - 136
+          - 144
+          - 152
+          - 160
+          - 168
+          - 176
+          - 184
+          - 192
+          - 200
+          - 208
+          - 216
+          - 224
+          - 232
+          - 240
+          - 248
+          - 256
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+        load_balancer:
+          layer_updates_per_iter: 1
+          num_slots: 288
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.7
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "8192"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx12dep2_gen1dep16_batch32_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx12dep2_gen1dep16_batch32_eplb0_mtp2.yaml
new file mode 100644
index 00000000..8940ea72
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx12dep2_gen1dep16_batch32_eplb0_mtp2.yaml
@@ -0,0 +1,135 @@
+name: "glm5_nvfp4_ISL8K_OSL1K_ctx12dep2_gen1dep16_batch32_eplb0_mtp2"
+
+# ctx: 12 prefill workers, TP2/EP2
+# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=32
+# concurrency: 666
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 6
+  prefill_workers: 12
+  gpus_per_prefill: 2
+
+  decode_workers: 1
+  decode_nodes: 4
+  gpus_per_decode: 16
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 2
+
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 32
+      max_num_tokens: 96
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.7
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 2
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "666"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx13dep2_gen1dep8_batch128_eplb0_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx13dep2_gen1dep8_batch128_eplb0_mtp1.yaml
new file mode 100644
index 00000000..29eba0b3
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx13dep2_gen1dep8_batch128_eplb0_mtp1.yaml
@@ -0,0 +1,147 @@
+name: "glm5_nvfp4_ISL8K_OSL1K_ctx13dep2_gen1dep8_batch128_eplb0_mtp1"
+
+# ctx: 13 prefill workers, TP2/EP2
+# gen: 1 decode worker, TP8/EP8, enable_attention_dp=true, max_batch=128
+# concurrency: 1076
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 7
+  prefill_workers: 13
+  gpus_per_prefill: 2
+
+  decode_workers: 1
+  decode_nodes: 2
+  gpus_per_decode: 8
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 1
+
+    decode:
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 128
+      max_num_tokens: 256
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 1
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "1076"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx15dep2_gen1dep32_batch16_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx15dep2_gen1dep32_batch16_eplb0_mtp3.yaml
new file mode 100644
index 00000000..f8fcdac9
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx15dep2_gen1dep32_batch16_eplb0_mtp3.yaml
@@ -0,0 +1,133 @@
+name: "glm5_nvfp4_ISL8K_OSL1K_ctx15dep2_gen1dep32_batch16_eplb0_mtp3"
+
+# ctx: 15 prefill workers, TP2/EP2
+# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=16
+# concurrency: 666
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 8
+  prefill_workers: 15
+  gpus_per_prefill: 2
+
+  decode_workers: 1
+  decode_nodes: 8
+  gpus_per_decode: 32
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+    decode:
+      tensor_parallel_size: 32
+      moe_expert_parallel_size: 32
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 64
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "666"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx18dep2_gen1dep16_batch64_eplb0_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx18dep2_gen1dep16_batch64_eplb0_mtp1.yaml
new file mode 100644
index 00000000..775fa68f
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx18dep2_gen1dep16_batch64_eplb0_mtp1.yaml
@@ -0,0 +1,139 @@
+name: "glm5_nvfp4_ISL8K_OSL1K_ctx18dep2_gen1dep16_batch64_eplb0_mtp1"
+
+# ctx: 18 prefill workers, TP2/EP2
+# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=64
+# concurrency: 1229
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 9
+  prefill_workers: 18
+  gpus_per_prefill: 2
+
+  decode_workers: 1
+  decode_nodes: 4
+  gpus_per_decode: 16
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 1
+
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 64
+      max_num_tokens: 128
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.7
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 1
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "1229"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen1tep8_batch16_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen1tep8_batch16_eplb0_mtp3.yaml
new file mode 100644
index 00000000..c457cce0
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen1tep8_batch16_eplb0_mtp3.yaml
@@ -0,0 +1,134 @@
+name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep2_gen1tep8_batch16_eplb0_mtp3"
+
+# ctx: 1 prefill worker, TP2/EP2
+# gen: 1 decode worker, TP8/EP8 (MNNVL), max_batch=16
+# concurrency: 24
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+  
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 2
+
+  decode_workers: 1
+  decode_nodes: 2
+  gpus_per_decode: 8
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+    decode:
+      allreduce_strategy: MNNVL
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 64
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "24"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen2tep8_batch8_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen2tep8_batch8_eplb0_mtp3.yaml
new file mode 100644
index 00000000..517cf361
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen2tep8_batch8_eplb0_mtp3.yaml
@@ -0,0 +1,133 @@
+name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep2_gen2tep8_batch8_eplb0_mtp3"
+
+# ctx: 1 prefill worker, TP2/EP2
+# gen: 2 decode workers, TP8/EP8 (MNNVL), max_batch=8
+# concurrency: 22
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 2
+
+  decode_workers: 2
+  decode_nodes: 4
+  gpus_per_decode: 8
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+    decode:
+      allreduce_strategy: MNNVL
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 8
+      max_num_tokens: 32
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "22"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen4tep8_batch4_allconc_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen4tep8_batch4_allconc_eplb0_mtp3.yaml
new file mode 100644
index 00000000..20599c3f
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen4tep8_batch4_allconc_eplb0_mtp3.yaml
@@ -0,0 +1,132 @@
+name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep2_gen4tep8_batch4_allconc_eplb0_mtp3"
+
+# ctx: 1 prefill worker, TP2/EP2
+# gen: 4 decode workers, TP8/EP8 (MNNVL), max_batch=4
+# concurrencies: 4 (batch1), 24 (batch4)
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+  
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 2
+
+  decode_workers: 4
+  decode_nodes: 8
+  gpus_per_decode: 8
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+    decode:
+      allreduce_strategy: MNNVL
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 4
+      max_num_tokens: 16
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "4x24"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml
new file mode 100644
index 00000000..0037f722
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml
@@ -0,0 +1,131 @@
+name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep2_gen5tep4_batch1_eplb0_mtp3"
+
+# ctx: 1 prefill worker, TP2/EP2
+# gen: 5 decode workers, TP4/EP4, max_batch=1
+# concurrency: 5
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 2
+
+  decode_workers: 5
+  decode_nodes: 5
+  gpus_per_decode: 4
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+    decode:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 1
+      max_num_tokens: 4
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.85
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "5"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx5dep2_gen1dep32_batch4_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx5dep2_gen1dep32_batch4_eplb0_mtp3.yaml
new file mode 100644
index 00000000..6e233408
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx5dep2_gen1dep32_batch4_eplb0_mtp3.yaml
@@ -0,0 +1,131 @@
+name: "glm5_nvfp4_ISL8K_OSL1K_ctx5dep2_gen1dep32_batch4_eplb0_mtp3"
+
+# ctx: 5 prefill workers, TP2/EP2
+# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, enable_lm_head_tp_in_adp=true, max_batch=4
+# concurrency: 180
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 3
+  prefill_workers: 5
+  gpus_per_prefill: 2
+
+  decode_workers: 1
+  decode_nodes: 8
+  gpus_per_decode: 32
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+    decode:
+      tensor_parallel_size: 32
+      moe_expert_parallel_size: 32
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 4
+      max_num_tokens: 16
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "180"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx9dep2_gen1dep32_batch8_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx9dep2_gen1dep32_batch8_eplb0_mtp3.yaml
new file mode 100644
index 00000000..bd1cb583
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx9dep2_gen1dep32_batch8_eplb0_mtp3.yaml
@@ -0,0 +1,132 @@
+name: "glm5_nvfp4_ISL8K_OSL1K_ctx9dep2_gen1dep32_batch8_eplb0_mtp3"
+
+# ctx: 9 prefill workers, TP2/EP2
+# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, enable_lm_head_tp_in_adp=true, max_batch=8
+# concurrency: 333
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 5
+  prefill_workers: 9
+  gpus_per_prefill: 2
+
+  decode_workers: 1
+  decode_nodes: 8
+  gpus_per_decode: 32
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+    decode:
+      tensor_parallel_size: 32
+      moe_expert_parallel_size: 32
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 8
+      max_num_tokens: 32
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "333"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx12dep2_gen1dep16_batch64_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx12dep2_gen1dep16_batch64_eplb0_mtp0.yaml
new file mode 100644
index 00000000..611aebb6
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx12dep2_gen1dep16_batch64_eplb0_mtp0.yaml
@@ -0,0 +1,133 @@
+name: "glm5_nvfp4_ISL8K_OSL1K_ctx12dep2_gen1dep16_batch64_eplb0_mtp0"
+
+# ctx: 12 prefill workers, TP2/EP2
+# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=64
+# concurrency: 1127
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 6
+  prefill_workers: 12
+  gpus_per_prefill: 2
+
+  decode_workers: 1
+  decode_nodes: 4
+  gpus_per_decode: 16
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 64
+      max_num_tokens: 64
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "1127"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx15dep2_gen1dep32_batch32_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx15dep2_gen1dep32_batch32_eplb0_mtp0.yaml
new file mode 100644
index 00000000..831e703d
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx15dep2_gen1dep32_batch32_eplb0_mtp0.yaml
@@ -0,0 +1,129 @@
+name: "glm5_nvfp4_ISL8K_OSL1K_ctx15dep2_gen1dep32_batch32_eplb0_mtp0"
+
+# ctx: 15 prefill workers, TP2/EP2
+# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=32
+# concurrency: 1229
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 8
+  prefill_workers: 15
+  gpus_per_prefill: 2
+
+  decode_workers: 1
+  decode_nodes: 8
+  gpus_per_decode: 32
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 32
+      moe_expert_parallel_size: 32
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 32
+      max_num_tokens: 32
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.75
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "1229"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen2tep8_batch16_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen2tep8_batch16_eplb0_mtp0.yaml
new file mode 100644
index 00000000..8ff2f420
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen2tep8_batch16_eplb0_mtp0.yaml
@@ -0,0 +1,127 @@
+name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep2_gen2tep8_batch16_eplb0_mtp0"
+
+# ctx: 1 prefill worker, TP2/EP2
+# gen: 2 decode workers, TP8/EP8 (MNNVL), max_batch=16
+# concurrency: 42
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+  
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 2
+
+  decode_workers: 2
+  decode_nodes: 4
+  gpus_per_decode: 8
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      allreduce_strategy: MNNVL
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "42"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen4tep8_batch1_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen4tep8_batch1_eplb0_mtp0.yaml
new file mode 100644
index 00000000..cc8faa11
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen4tep8_batch1_eplb0_mtp0.yaml
@@ -0,0 +1,126 @@
+name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep2_gen4tep8_batch1_eplb0_mtp0"
+
+# ctx: 1 prefill worker, TP2/EP2
+# gen: 4 decode workers, TP8/EP8 (MNNVL), max_batch=1
+# concurrency: 4
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 2
+
+  decode_workers: 4
+  decode_nodes: 8
+  gpus_per_decode: 8
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      allreduce_strategy: MNNVL
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 1
+      max_num_tokens: 1
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "4"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml
new file mode 100644
index 00000000..06d02024
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml
@@ -0,0 +1,125 @@
+name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0"
+
+# ctx: 1 prefill worker, TP2/EP2
+# gen: 5 decode workers, TP4/EP4, max_batch=4
+# concurrencies: 5 (batch1), 10 (batch2), 25 (batch4) — merged as 5x10x25
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 2
+
+  decode_workers: 5
+  decode_nodes: 5
+  gpus_per_decode: 4
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 4
+      max_num_tokens: 4
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "5x10x25"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx20dep2_gen1dep16_batch128_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx20dep2_gen1dep16_batch128_eplb0_mtp0.yaml
new file mode 100644
index 00000000..ead937c9
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx20dep2_gen1dep16_batch128_eplb0_mtp0.yaml
@@ -0,0 +1,141 @@
+name: "glm5_nvfp4_ISL8K_OSL1K_ctx20dep2_gen1dep16_batch128_eplb0_mtp0"
+
+# ctx: 20 prefill workers, TP2/EP2
+# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=128
+# concurrency: 2151
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+  
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 10
+  prefill_workers: 20
+  gpus_per_prefill: 2
+
+  decode_workers: 1
+  decode_nodes: 4
+  gpus_per_decode: 16
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 128
+      max_num_tokens: 128
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "2151"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx2dep2_gen3tep8_batch32_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx2dep2_gen3tep8_batch32_eplb0_mtp0.yaml
new file mode 100644
index 00000000..e06ea268
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx2dep2_gen3tep8_batch32_eplb0_mtp0.yaml
@@ -0,0 +1,130 @@
+name: "glm5_nvfp4_ISL8K_OSL1K_ctx2dep2_gen3tep8_batch32_eplb0_mtp0"
+
+# ctx: 2 prefill workers, TP2/EP2
+# gen: 3 decode workers, TP8/EP8 (MNNVL), max_batch=32
+# concurrency: 117
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 1
+  prefill_workers: 2
+  gpus_per_prefill: 2
+
+  decode_workers: 3
+  decode_nodes: 6
+  gpus_per_decode: 8
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      allreduce_strategy: MNNVL
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 32
+      max_num_tokens: 32
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "117"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx4dep2_gen3tep8_batch64_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx4dep2_gen3tep8_batch64_eplb0_mtp0.yaml
new file mode 100644
index 00000000..f4b3cc09
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx4dep2_gen3tep8_batch64_eplb0_mtp0.yaml
@@ -0,0 +1,134 @@
+name: "glm5_nvfp4_ISL8K_OSL1K_ctx4dep2_gen3tep8_batch64_eplb0_mtp0"
+
+# ctx: 4 prefill workers, TP2/EP2
+# gen: 3 decode workers, TP8/EP8 (MNNVL), max_batch=64
+# concurrency: 231
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 2
+  prefill_workers: 4
+  gpus_per_prefill: 2
+
+  decode_workers: 3
+  decode_nodes: 6
+  gpus_per_decode: 8
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      allreduce_strategy: MNNVL
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 64
+      max_num_tokens: 64
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "231"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx9dep2_gen1dep32_batch16_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx9dep2_gen1dep32_batch16_eplb0_mtp0.yaml
new file mode 100644
index 00000000..75f56785
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx9dep2_gen1dep32_batch16_eplb0_mtp0.yaml
@@ -0,0 +1,127 @@
+name: "glm5_nvfp4_ISL8K_OSL1K_ctx9dep2_gen1dep32_batch16_eplb0_mtp0"
+
+# ctx: 9 prefill workers, TP2/EP2
+# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=16
+# concurrency: 615
+
+model:
+  path: "nvidia/GLM5-NVFP4"
+  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb300"
+
+  prefill_nodes: 5
+  prefill_workers: 9
+  gpus_per_prefill: 2
+
+  decode_workers: 1
+  decode_nodes: 8
+  gpus_per_decode: 32
+
+  gpus_per_node: 4
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  decode_environment:
+    ENROOT_ALLOW_DEV: "yes"
+    MIMALLOC_PURGE_DELAY: "0"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+
+    decode:
+      tensor_parallel_size: 32
+      moe_expert_parallel_size: 32
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.75
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+          - cutlass
+          - cublaslt
+          - cutedsl
+          - cuda_core
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "615"
+  req_rate: "inf"
+  custom_tokenizer: "glm_moe_dsa"
+  use_chat_template: false
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+dynamo:
+  install: false

From b85ec1d7d269ced342f090c4bcecad797b289cbd Mon Sep 17 00:00:00 2001
From: Richard Huo <rihuo@nvidia.com>
Date: Mon, 27 Apr 2026 09:25:17 -0700
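Subject: [PATCH 08/14] fix: add chat template to the glm5 tokenizer

_load_glm_moe_dsa_tokenizer now sets tok.chat_template from the
chat_template.jinja file under the tokenizer path, when it exists.

sample_random_requests is split into separate chat-template and plain
paths. With use_chat_template the random prefix (prefix_len) is no
longer prepended; the user content is decoded from
int(input_lens[i] * 1.5) token ids and truncated to input_lens[i]
tokens after re-encoding. The lower bound used when sampling input and
output lengths is now 1 when the requested length is not greater
than 1.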
---
 .../scripts/sa-bench/backend_request_func.py  |  9 +++-
 .../scripts/sa-bench/benchmark_serving.py     | 48 +++++++++++--------
 2 files changed, 36 insertions(+), 21 deletions(-)

diff --git a/src/srtctl/benchmarks/scripts/sa-bench/backend_request_func.py b/src/srtctl/benchmarks/scripts/sa-bench/backend_request_func.py
index 87f3f9ef..6590cc2a 100644
--- a/src/srtctl/benchmarks/scripts/sa-bench/backend_request_func.py
+++ b/src/srtctl/benchmarks/scripts/sa-bench/backend_request_func.py
@@ -603,7 +603,14 @@ def _load_glm_moe_dsa_tokenizer(pretrained_model_name_or_path: str) -> "PreTrain
         if "extra_special_tokens" in config:
             init_kwargs["additional_special_tokens"] = config["extra_special_tokens"]
 
-    return PreTrainedTokenizerFast(tokenizer_object=rust_tok, **init_kwargs)
+    tok = PreTrainedTokenizerFast(tokenizer_object=rust_tok, **init_kwargs)
+
+    jinja_path = path / "chat_template.jinja"
+    if jinja_path.exists():
+        tok.chat_template = jinja_path.read_text(encoding="utf-8")
+        print(f"[sa-bench] Loaded chat template from {jinja_path}", flush=True)
+
+    return tok
 
 
 def get_tokenizer(
diff --git a/src/srtctl/benchmarks/scripts/sa-bench/benchmark_serving.py b/src/srtctl/benchmarks/scripts/sa-bench/benchmark_serving.py
index a5ea6490..a2d6251b 100644
--- a/src/srtctl/benchmarks/scripts/sa-bench/benchmark_serving.py
+++ b/src/srtctl/benchmarks/scripts/sa-bench/benchmark_serving.py
@@ -341,44 +341,52 @@ def sample_random_requests(
     tokenizer: PreTrainedTokenizerBase,
     use_chat_template: bool = False,
 ) -> list[tuple[str, int, int]]:
-    prefix_token_ids = np.random.randint(0, tokenizer.vocab_size, size=prefix_len).tolist()
     if use_chat_template:
-        chat_template_dummy = tokenizer.apply_chat_template(
-            [{"role": "user", "content": "a"}],
-            add_generation_prompt=True,
-            tokenize=False,
-        )
-        tokenized_chat_template_dummy = tokenizer.encode(chat_template_dummy, add_special_tokens=False)
-        chat_template_len = len(tokenized_chat_template_dummy) - 1
+        chat_template_len = len(tokenizer.encode(
+            tokenizer.apply_chat_template(
+                [{"role": "user", "content": "a"}],
+                add_generation_prompt=True,
+                tokenize=False,
+            ), add_special_tokens=False,
+        )) - 1
         input_len = input_len - chat_template_len
 
     input_lens = np.random.randint(
-        int(input_len * range_ratio),
+        int(input_len * range_ratio) if input_len > 1 else 1,
         input_len + 1,
         size=num_prompts,
     )
     output_lens = np.random.randint(
-        int(output_len * range_ratio),
+        int(output_len * range_ratio) if output_len > 1 else 1,
         output_len + 1,
         size=num_prompts,
     )
     offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
     input_requests = []
-    for i in range(num_prompts):
-        prompt = tokenizer.decode(
-            prefix_token_ids + [(offsets[i] + i + j) % tokenizer.vocab_size for j in range(input_lens[i])]
-        )
-        re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[: (prefix_len + input_lens[i])]
-        prompt = tokenizer.decode(re_encoded_sequence)
-        if use_chat_template:
+
+    if use_chat_template:
+        for i in range(num_prompts):
+            origin_text = tokenizer.decode(
+                [(offsets[i] + i + j) % tokenizer.vocab_size for j in range(int(input_lens[i] * 1.5))]
+            )
+            re_encoded_sequence = tokenizer.encode(origin_text, add_special_tokens=False)[: input_lens[i]]
+            prompt_text = tokenizer.decode(re_encoded_sequence)
             prompt = tokenizer.apply_chat_template(
-                [{"role": "user", "content": prompt}],
+                [{"role": "user", "content": prompt_text}],
                 add_generation_prompt=True,
                 tokenize=False,
             )
             input_lens[i] += chat_template_len
-
-        input_requests.append((prompt, int(prefix_len + input_lens[i]), int(output_lens[i]), None))
+            input_requests.append((prompt, int(input_lens[i]), int(output_lens[i]), None))
+    else:
+        prefix_token_ids = np.random.randint(0, tokenizer.vocab_size, size=prefix_len).tolist()
+        for i in range(num_prompts):
+            prompt = tokenizer.decode(
+                prefix_token_ids + [(offsets[i] + i + j) % tokenizer.vocab_size for j in range(input_lens[i])]
+            )
+            re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[: (prefix_len + input_lens[i])]
+            prompt = tokenizer.decode(re_encoded_sequence)
+            input_requests.append((prompt, int(prefix_len + input_lens[i]), int(output_lens[i]), None))
 
     return input_requests
 

From 4e698b33c6e6c29e7144eaa4067eab47ab4d000e Mon Sep 17 00:00:00 2001
From: Yeswanth koti <yeswanthk@nvidia.com>
Date: Tue, 28 Apr 2026 19:44:43 -0400
Subject: [PATCH 09/14] Add GLM5 GB200 NVFP4 Apr-09 disagg recipes. (#61)

* Add GLM5 GB200 NVFP4 Apr-09 disagg recipes.

Include the updated 1K/1K and 8K/1K STP and MTP TensorRT-LLM Dynamo configs so submission testing can run on the latest GB200 parameter set.

* Keep only Apr-09 GB200 configs and align YAML quoting.

Remove legacy GB200 trtllm_dynamo recipes inherited from the submission base branch, and normalize concurrencies/custom_tokenizer fields to double-quoted style for consistency with existing GB300 recipes.

* fix: enable chat template and 16x rounds for GB200 GLM5 configs

Update GB200 GLM5 trtllm_dynamo recipes to set use_chat_template=true and num_prompts_mult=16 so sa-bench runs align with current submission benchmarking methodology.
---
 ..._gen1dep32_batch16_allconc_eplb0_mtp3.yaml |  85 +++-----
 ...4_gen4tep8_batch32_allconc_eplb0_mtp3.yaml |  91 ++++-----
 .../ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml  |  77 +++----
 ...2dep4_gen1dep16_batch256_eplb256_mtp1.yaml | 144 ++++++-------
 ...tx2dep4_gen1dep32_batch32_eplb0_mtp2.yaml} | 103 ++++------
 ...3dep4_gen1dep32_batch128_eplb288_mtp1.yaml | 112 +++++-----
 ...tx3dep4_gen1dep32_batch64_eplb0_mtp1.yaml} | 106 +++++-----
 ...ctx1dep4_gen1dep32_batch32_eplb0_mtp0.yaml |  87 ++++----
 ...ctx1dep4_gen1dep8_batch512_eplb0_mtp0.yaml | 172 ++++++++++++++++
 ..._gen4tep8_batch128_allconc_eplb0_mtp0.yaml | 115 +++++------
 ...4_gen5tep4_batch4_allconc_eplb0_mtp0.yaml} |  86 +++-----
 ...2dep4_gen1dep16_batch512_eplb256_mtp0.yaml | 193 ------------------
 ...ctx2dep4_gen1dep32_batch64_eplb0_mtp0.yaml |  95 ++++-----
 ...x3dep4_gen1dep32_batch128_eplb0_mtp0.yaml} | 127 +++++-------
 ...4dep4_gen1dep32_batch256_eplb288_mtp0.yaml | 144 ++++++-------
 ...tx10dep4_gen1dep16_batch64_eplb0_mtp1.yaml |  95 ++++-----
 .../ctx1dep4_gen2tep8_batch16_eplb0_mtp3.yaml |  84 +++-----
 .../ctx1dep4_gen4tep8_batch1_eplb0_mtp3.yaml} | 101 ++++-----
 ... ctx1dep4_gen4tep8_batch8_eplb0_mtp3.yaml} |  85 +++-----
 ... ctx1dep4_gen5tep4_batch2_eplb0_mtp3.yaml} |  83 +++-----
 .../ctx2dep4_gen3tep8_batch32_eplb0_mtp3.yaml | 119 +++++++++++
 ...tx5dep4_gen1dep16_batch16_eplb0_mtp3.yaml} | 113 ++++------
 .../ctx5dep4_gen1dep32_batch8_eplb0_mtp3.yaml |  81 +++-----
 ...ctx7dep4_gen1dep16_batch32_eplb0_mtp2.yaml |  87 ++++----
 ...ctx7dep4_gen1dep8_batch128_eplb0_mtp1.yaml | 113 +++++-----
 ...11dep4_gen1dep16_batch128_eplb0_mtp0.yaml} | 115 +++++------
 .../ctx1dep4_gen2tep8_batch32_eplb0_mtp0.yaml |  91 ++++-----
 .../ctx1dep4_gen4tep8_batch16_eplb0_mtp0.yaml | 111 ++++++++++
 .../ctx1dep4_gen4tep8_batch1_eplb0_mtp0.yaml  | 107 ++++++++++
 ... ctx1dep4_gen5tep4_batch1_eplb0_mtp0.yaml} |  84 +++-----
 ... ctx1dep4_gen5tep4_batch2_eplb0_mtp0.yaml} |  92 ++++-----
 .../ctx1dep4_gen5tep4_batch4_eplb0_mtp0.yaml  | 108 ++++++++++
 .../ctx2dep4_gen3tep8_batch64_eplb0_mtp0.yaml | 117 +++++++++++
 ...tx4dep4_gen1dep16_batch32_eplb0_mtp0.yaml} | 103 ++++------
 ...ctx5dep4_gen1dep32_batch16_eplb0_mtp0.yaml |  83 +++-----
 ...ctx7dep4_gen1dep16_batch64_eplb0_mtp0.yaml | 116 +++++++++++
 36 files changed, 1996 insertions(+), 1829 deletions(-)
 rename recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/{ctx1dep4_gen1dep16_batch32_eplb0_mtp2.yaml => ctx2dep4_gen1dep32_batch32_eplb0_mtp2.yaml} (60%)
 rename recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/{ISL8K_OSL1K/MTP/ctx3dep4_gen1dep32_batch4_eplb0_mtp3.yaml => ISL1K_OSL1K/MTP/ctx3dep4_gen1dep32_batch64_eplb0_mtp1.yaml} (60%)
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch512_eplb0_mtp0.yaml
 rename recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/{ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml => ctx1dep4_gen5tep4_batch4_allconc_eplb0_mtp0.yaml} (61%)
 delete mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch512_eplb256_mtp0.yaml
 rename recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/{ctx2dep4_gen1dep16_batch128_eplb0_mtp0.yaml => ctx3dep4_gen1dep32_batch128_eplb0_mtp0.yaml} (54%)
 rename recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/{ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch16_eplb0_mtp2.yaml => ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch1_eplb0_mtp3.yaml} (59%)
 rename recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/{ctx1dep4_gen4tep8_batch8_allconc_eplb0_mtp3.yaml => ctx1dep4_gen4tep8_batch8_eplb0_mtp3.yaml} (67%)
 rename recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/{ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml => ctx1dep4_gen5tep4_batch2_eplb0_mtp3.yaml} (66%)
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx2dep4_gen3tep8_batch32_eplb0_mtp3.yaml
 rename recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/{ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch64_eplb0_mtp1.yaml => ISL8K_OSL1K/MTP/ctx5dep4_gen1dep16_batch16_eplb0_mtp3.yaml} (56%)
 rename recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/{ctx10dep4_gen1dep16_batch128_eplb0_mtp0.yaml => ctx11dep4_gen1dep16_batch128_eplb0_mtp0.yaml} (59%)
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch16_eplb0_mtp0.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_eplb0_mtp0.yaml
 rename recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/{ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml => ctx1dep4_gen5tep4_batch1_eplb0_mtp0.yaml} (61%)
 rename recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/{ctx1dep4_gen3tep4_batch32_eplb0_mtp0.yaml => ctx1dep4_gen5tep4_batch2_eplb0_mtp0.yaml} (60%)
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch4_eplb0_mtp0.yaml
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen3tep8_batch64_eplb0_mtp0.yaml
 rename recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/{ctx8dep4_gen1dep32_batch32_eplb0_mtp0.yaml => ctx4dep4_gen1dep16_batch32_eplb0_mtp0.yaml} (57%)
 create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch64_eplb0_mtp0.yaml

diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch16_allconc_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch16_allconc_eplb0_mtp3.yaml
index 68af65ee..ea57cd6e 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch16_allconc_eplb0_mtp3.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch16_allconc_eplb0_mtp3.yaml
@@ -1,48 +1,35 @@
-name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep32_batch16_allconc_eplb0_mtp3"
-
-# ctx: 1 prefill worker, TP4/EP4
-# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=16
-# concurrencies: 333 (batch8), 666 (batch16)
-
+name: glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep32_batch16_allconc_eplb0_mtp3
 model:
-  path: "nvidia/GLM5-NVFP4"
-  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
-  precision: "fp4"
-
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
 resources:
-  gpu_type: "gb200"
-
+  gpu_type: gb200
   prefill_nodes: 1
   prefill_workers: 1
   gpus_per_prefill: 4
-
   decode_workers: 1
   decode_nodes: 8
   gpus_per_decode: 32
-
   gpus_per_node: 4
-
 backend:
   type: trtllm
-
   prefill_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   decode_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   trtllm_config:
     prefill:
       tensor_parallel_size: 4
@@ -69,7 +56,6 @@ backend:
       speculative_config:
         decoding_type: MTP
         num_nextn_predict_layers: 3
-
     decode:
       tensor_parallel_size: 32
       moe_expert_parallel_size: 32
@@ -87,11 +73,11 @@ backend:
       cuda_graph_config:
         enable_padding: true
         batch_sizes:
-          - 1
-          - 2
-          - 4
-          - 8
-          - 16
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
       moe_config:
         backend: CUTEDSL
         use_low_precision_moe_combine: true
@@ -104,30 +90,27 @@ backend:
         max_tokens_in_buffer: 16384
       nvfp4_gemm_config:
         allowed_backends:
-          - cutlass
-          - cublaslt
-          - cutedsl
-          - cuda_core
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
       speculative_config:
         decoding_type: MTP
         num_nextn_predict_layers: 3
-
 benchmark:
-  type: "sa-bench"
+  type: sa-bench
   isl: 1024
   osl: 1024
-  concurrencies: "333x666"
-  req_rate: "inf"
+  concurrencies: "333x615"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
-
 frontend:
-  type: "dynamo"
+  type: dynamo
   enable_multiple_frontends: false
-
 health_check:
   max_attempts: 360
   interval_seconds: 10
-
 dynamo:
   install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch32_allconc_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch32_allconc_eplb0_mtp3.yaml
index da187faf..f0eb3d82 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch32_allconc_eplb0_mtp3.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch32_allconc_eplb0_mtp3.yaml
@@ -1,48 +1,35 @@
-name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen4tep8_batch32_allconc_eplb0_mtp3"
-
-# ctx: 1 prefill worker, TP4/EP4
-# gen: 4 decode workers, TP8/EP8, enable_attention_dp=false, max_batch=32
-# concurrencies: 8 (batch1), 44 (batch8), 192 (batch32)
-
+name: glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen4tep8_batch32_allconc_eplb0_mtp3
 model:
-  path: "nvidia/GLM5-NVFP4"
-  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
-  precision: "fp4"
-
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
 resources:
-  gpu_type: "gb200"
-
+  gpu_type: gb200
   prefill_nodes: 1
   prefill_workers: 1
   gpus_per_prefill: 4
-
   decode_workers: 4
   decode_nodes: 8
   gpus_per_decode: 8
-
   gpus_per_node: 4
-
 backend:
   type: trtllm
-
   prefill_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   decode_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   trtllm_config:
     prefill:
       tensor_parallel_size: 4
@@ -69,9 +56,7 @@ backend:
       speculative_config:
         decoding_type: MTP
         num_nextn_predict_layers: 3
-
     decode:
-      allreduce_strategy: MNNVL
       tensor_parallel_size: 8
       moe_expert_parallel_size: 8
       pipeline_parallel_size: 1
@@ -88,13 +73,13 @@ backend:
       cuda_graph_config:
         enable_padding: true
         batch_sizes:
-          - 1
-          - 2
-          - 4
-          - 8
-          - 16
-          - 24
-          - 32
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32
       moe_config:
         backend: TRTLLM
         use_low_precision_moe_combine: true
@@ -107,30 +92,28 @@ backend:
         max_tokens_in_buffer: 16384
       nvfp4_gemm_config:
         allowed_backends:
-          - cutlass
-          - cublaslt
-          - cutedsl
-          - cuda_core
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
       speculative_config:
         decoding_type: MTP
         num_nextn_predict_layers: 3
-
+      allreduce_strategy: MNNVL
 benchmark:
-  type: "sa-bench"
+  type: sa-bench
   isl: 1024
   osl: 1024
-  concurrencies: "8x44x192"
-  req_rate: "inf"
+  concurrencies: "24x36x96x192"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
-
 frontend:
-  type: "dynamo"
+  type: dynamo
   enable_multiple_frontends: false
-
 health_check:
   max_attempts: 360
   interval_seconds: 10
-
 dynamo:
   install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml
index a6121cd0..3f5f7d0e 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml
@@ -1,48 +1,35 @@
-name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen5tep4_batch1_eplb0_mtp3"
-
-# ctx: 1 prefill worker, TP4/EP4
-# gen: 5 decode workers, TP4/EP4, enable_attention_dp=false, max_batch=1
-# concurrency: 10
-
+name: glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen5tep4_batch1_eplb0_mtp3
 model:
-  path: "nvidia/GLM5-NVFP4"
-  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
-  precision: "fp4"
-
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
 resources:
-  gpu_type: "gb200"
-
+  gpu_type: gb200
   prefill_nodes: 1
   prefill_workers: 1
   gpus_per_prefill: 4
-
   decode_workers: 5
   decode_nodes: 5
   gpus_per_decode: 4
-
   gpus_per_node: 4
-
 backend:
   type: trtllm
-
   prefill_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   decode_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   trtllm_config:
     prefill:
       tensor_parallel_size: 4
@@ -69,7 +56,6 @@ backend:
       speculative_config:
         decoding_type: MTP
         num_nextn_predict_layers: 3
-
     decode:
       tensor_parallel_size: 4
       moe_expert_parallel_size: 4
@@ -87,9 +73,7 @@ backend:
       cuda_graph_config:
         enable_padding: true
         batch_sizes:
-          - 1
-          - 2
-          - 4
+        - 1
       moe_config:
         backend: TRTLLM
         use_low_precision_moe_combine: true
@@ -102,30 +86,27 @@ backend:
         max_tokens_in_buffer: 16384
       nvfp4_gemm_config:
         allowed_backends:
-          - cutlass
-          - cublaslt
-          - cutedsl
-          - cuda_core
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
       speculative_config:
         decoding_type: MTP
         num_nextn_predict_layers: 3
-
 benchmark:
-  type: "sa-bench"
+  type: sa-bench
   isl: 1024
   osl: 1024
   concurrencies: "10"
-  req_rate: "inf"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
-
 frontend:
-  type: "dynamo"
+  type: dynamo
   enable_multiple_frontends: false
-
 health_check:
   max_attempts: 360
   interval_seconds: 10
-
 dynamo:
   install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep16_batch256_eplb256_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep16_batch256_eplb256_mtp1.yaml
index dc176b2d..c65ccf78 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep16_batch256_eplb256_mtp1.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep16_batch256_eplb256_mtp1.yaml
@@ -1,49 +1,35 @@
-name: "glm5_nvfp4_ISL1K_OSL1K_ctx2dep4_gen1dep16_batch256_eplb256_mtp1"
-
-# ctx: 2 prefill workers, TP4/EP4
-# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=256
-# EPLB: num_slots=256
-# concurrency: 4301
-
+name: glm5_nvfp4_ISL1K_OSL1K_ctx2dep4_gen1dep16_batch256_eplb256_mtp1
 model:
-  path: "nvidia/GLM5-NVFP4"
-  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
-  precision: "fp4"
-
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
 resources:
-  gpu_type: "gb200"
-
+  gpu_type: gb200
   prefill_nodes: 2
   prefill_workers: 2
   gpus_per_prefill: 4
-
   decode_workers: 1
   decode_nodes: 4
   gpus_per_decode: 16
-
   gpus_per_node: 4
-
 backend:
   type: trtllm
-
   prefill_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   decode_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   trtllm_config:
     prefill:
       tensor_parallel_size: 4
@@ -70,7 +56,6 @@ backend:
       speculative_config:
         decoding_type: MTP
         num_nextn_predict_layers: 1
-
     decode:
       tensor_parallel_size: 16
       moe_expert_parallel_size: 16
@@ -88,41 +73,41 @@ backend:
       cuda_graph_config:
         enable_padding: true
         batch_sizes:
-          - 1
-          - 2
-          - 4
-          - 8
-          - 16
-          - 24
-          - 32
-          - 40
-          - 48
-          - 56
-          - 64
-          - 72
-          - 80
-          - 88
-          - 96
-          - 104
-          - 112
-          - 120
-          - 128
-          - 136
-          - 144
-          - 152
-          - 160
-          - 168
-          - 176
-          - 184
-          - 192
-          - 200
-          - 208
-          - 216
-          - 224
-          - 232
-          - 240
-          - 248
-          - 256
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32
+        - 40
+        - 48
+        - 56
+        - 64
+        - 72
+        - 80
+        - 88
+        - 96
+        - 104
+        - 112
+        - 120
+        - 128
+        - 136
+        - 144
+        - 152
+        - 160
+        - 168
+        - 176
+        - 184
+        - 192
+        - 200
+        - 208
+        - 216
+        - 224
+        - 232
+        - 240
+        - 248
+        - 256
       moe_config:
         backend: CUTEDSL
         use_low_precision_moe_combine: true
@@ -138,30 +123,27 @@ backend:
         max_tokens_in_buffer: 16384
       nvfp4_gemm_config:
         allowed_backends:
-          - cutlass
-          - cublaslt
-          - cutedsl
-          - cuda_core
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
       speculative_config:
         decoding_type: MTP
         num_nextn_predict_layers: 1
-
 benchmark:
-  type: "sa-bench"
+  type: sa-bench
   isl: 1024
   osl: 1024
   concurrencies: "4301"
-  req_rate: "inf"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
-
 frontend:
-  type: "dynamo"
+  type: dynamo
   enable_multiple_frontends: false
-
 health_check:
   max_attempts: 360
   interval_seconds: 10
-
 dynamo:
   install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch32_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep32_batch32_eplb0_mtp2.yaml
similarity index 60%
rename from recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch32_eplb0_mtp2.yaml
rename to recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep32_batch32_eplb0_mtp2.yaml
index 21edc148..a2c2bbe5 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch32_eplb0_mtp2.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep32_batch32_eplb0_mtp2.yaml
@@ -1,48 +1,35 @@
-name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep16_batch32_eplb0_mtp2"
-
-# ctx: 1 prefill worker, TP4/EP4
-# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=32
-# concurrency: 666
-
+name: glm5_nvfp4_ISL1K_OSL1K_ctx2dep4_gen1dep32_batch32_eplb0_mtp2
 model:
-  path: "nvidia/GLM5-NVFP4"
-  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
-  precision: "fp4"
-
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
 resources:
-  gpu_type: "gb200"
-
-  prefill_nodes: 1
-  prefill_workers: 1
+  gpu_type: gb200
+  prefill_nodes: 2
+  prefill_workers: 2
   gpus_per_prefill: 4
-
   decode_workers: 1
-  decode_nodes: 4
-  gpus_per_decode: 16
-
+  decode_nodes: 8
+  gpus_per_decode: 32
   gpus_per_node: 4
-
 backend:
   type: trtllm
-
   prefill_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   decode_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   trtllm_config:
     prefill:
       tensor_parallel_size: 4
@@ -69,10 +56,9 @@ backend:
       speculative_config:
         decoding_type: MTP
         num_nextn_predict_layers: 2
-
     decode:
-      tensor_parallel_size: 16
-      moe_expert_parallel_size: 16
+      tensor_parallel_size: 32
+      moe_expert_parallel_size: 32
       pipeline_parallel_size: 1
       enable_attention_dp: true
       enable_lm_head_tp_in_adp: true
@@ -87,49 +73,46 @@ backend:
       cuda_graph_config:
         enable_padding: true
         batch_sizes:
-          - 1
-          - 2
-          - 4
-          - 8
-          - 16
-          - 24
-          - 32
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32
       moe_config:
         backend: CUTEDSL
         use_low_precision_moe_combine: true
       kv_cache_config:
         dtype: fp8
         enable_block_reuse: false
-        free_gpu_memory_fraction: 0.7
+        free_gpu_memory_fraction: 0.6
       cache_transceiver_config:
         backend: UCX
         max_tokens_in_buffer: 16384
       nvfp4_gemm_config:
         allowed_backends:
-          - cutlass
-          - cublaslt
-          - cutedsl
-          - cuda_core
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
       speculative_config:
         decoding_type: MTP
         num_nextn_predict_layers: 2
-
 benchmark:
-  type: "sa-bench"
+  type: sa-bench
   isl: 1024
   osl: 1024
-  concurrencies: "666"
-  req_rate: "inf"
+  concurrencies: "1229"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
-
 frontend:
-  type: "dynamo"
+  type: dynamo
   enable_multiple_frontends: false
-
 health_check:
   max_attempts: 360
   interval_seconds: 10
-
 dynamo:
   install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx3dep4_gen1dep32_batch128_eplb288_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx3dep4_gen1dep32_batch128_eplb288_mtp1.yaml
index a7a1c790..9a180818 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx3dep4_gen1dep32_batch128_eplb288_mtp1.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx3dep4_gen1dep32_batch128_eplb288_mtp1.yaml
@@ -1,49 +1,35 @@
-name: "glm5_nvfp4_ISL1K_OSL1K_ctx3dep4_gen1dep32_batch128_eplb288_mtp1"
-
-# ctx: 3 prefill workers, TP4/EP4
-# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=128
-# EPLB: num_slots=288
-# concurrency: 4301
-
+name: glm5_nvfp4_ISL1K_OSL1K_ctx3dep4_gen1dep32_batch128_eplb288_mtp1
 model:
-  path: "nvidia/GLM5-NVFP4"
-  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
-  precision: "fp4"
-
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
 resources:
-  gpu_type: "gb200"
-
+  gpu_type: gb200
   prefill_nodes: 3
   prefill_workers: 3
   gpus_per_prefill: 4
-
   decode_workers: 1
   decode_nodes: 8
   gpus_per_decode: 32
-
   gpus_per_node: 4
-
 backend:
   type: trtllm
-
   prefill_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   decode_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   trtllm_config:
     prefill:
       tensor_parallel_size: 4
@@ -70,7 +56,6 @@ backend:
       speculative_config:
         decoding_type: MTP
         num_nextn_predict_layers: 1
-
     decode:
       tensor_parallel_size: 32
       moe_expert_parallel_size: 32
@@ -88,25 +73,25 @@ backend:
       cuda_graph_config:
         enable_padding: true
         batch_sizes:
-          - 1
-          - 2
-          - 4
-          - 8
-          - 16
-          - 24
-          - 32
-          - 40
-          - 48
-          - 56
-          - 64
-          - 72
-          - 80
-          - 88
-          - 96
-          - 104
-          - 112
-          - 120
-          - 128
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32
+        - 40
+        - 48
+        - 56
+        - 64
+        - 72
+        - 80
+        - 88
+        - 96
+        - 104
+        - 112
+        - 120
+        - 128
       moe_config:
         backend: CUTEDSL
         use_low_precision_moe_combine: true
@@ -122,30 +107,27 @@ backend:
         max_tokens_in_buffer: 16384
       nvfp4_gemm_config:
         allowed_backends:
-          - cutlass
-          - cublaslt
-          - cutedsl
-          - cuda_core
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
       speculative_config:
         decoding_type: MTP
         num_nextn_predict_layers: 1
-
 benchmark:
-  type: "sa-bench"
+  type: sa-bench
   isl: 1024
   osl: 1024
   concurrencies: "4301"
-  req_rate: "inf"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
-
 frontend:
-  type: "dynamo"
+  type: dynamo
   enable_multiple_frontends: false
-
 health_check:
   max_attempts: 360
   interval_seconds: 10
-
 dynamo:
   install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx3dep4_gen1dep32_batch4_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx3dep4_gen1dep32_batch64_eplb0_mtp1.yaml
similarity index 60%
rename from recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx3dep4_gen1dep32_batch4_eplb0_mtp3.yaml
rename to recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx3dep4_gen1dep32_batch64_eplb0_mtp1.yaml
index 440a4f73..05cf4ff8 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx3dep4_gen1dep32_batch4_eplb0_mtp3.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx3dep4_gen1dep32_batch64_eplb0_mtp1.yaml
@@ -1,47 +1,35 @@
-name: "glm5_nvfp4_ISL8K_OSL1K_ctx3dep4_gen1dep32_batch4_eplb0_mtp3"
-
-# ctx: 3 prefill workers, TP4/EP4
-# gen: 1 decode worker, TP32/EP32, max_batch=4, concurrency: 167
-
+name: glm5_nvfp4_ISL1K_OSL1K_ctx3dep4_gen1dep32_batch64_eplb0_mtp1
 model:
-  path: "nvidia/GLM5-NVFP4"
-  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
-  precision: "fp4"
-
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
 resources:
-  gpu_type: "gb200"
-
+  gpu_type: gb200
   prefill_nodes: 3
   prefill_workers: 3
   gpus_per_prefill: 4
-
   decode_workers: 1
   decode_nodes: 8
   gpus_per_decode: 32
-
   gpus_per_node: 4
-
 backend:
   type: trtllm
-
   prefill_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   decode_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   trtllm_config:
     prefill:
       tensor_parallel_size: 4
@@ -51,9 +39,9 @@ backend:
       disable_overlap_scheduler: true
       trust_remote_code: true
       custom_tokenizer: "glm_moe_dsa"
-      max_batch_size: 2
-      max_num_tokens: 16640
-      max_seq_len: 8232
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
       print_iter_log: true
       cuda_graph_config: null
       moe_config:
@@ -67,8 +55,7 @@ backend:
         max_tokens_in_buffer: 16384
       speculative_config:
         decoding_type: MTP
-        num_nextn_predict_layers: 3
-
+        num_nextn_predict_layers: 1
     decode:
       tensor_parallel_size: 32
       moe_expert_parallel_size: 32
@@ -77,18 +64,26 @@ backend:
       enable_lm_head_tp_in_adp: true
       trust_remote_code: true
       custom_tokenizer: "glm_moe_dsa"
-      max_batch_size: 4
-      max_num_tokens: 16
-      max_seq_len: 9256
+      max_batch_size: 64
+      max_num_tokens: 128
+      max_seq_len: 2088
       print_iter_log: true
       stream_interval: 100
       num_postprocess_workers: 4
       cuda_graph_config:
         enable_padding: true
         batch_sizes:
-          - 1
-          - 2
-          - 4
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32
+        - 40
+        - 48
+        - 56
+        - 64
       moe_config:
         backend: CUTEDSL
         use_low_precision_moe_combine: true
@@ -101,30 +96,27 @@ backend:
         max_tokens_in_buffer: 16384
       nvfp4_gemm_config:
         allowed_backends:
-          - cutlass
-          - cublaslt
-          - cutedsl
-          - cuda_core
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
       speculative_config:
         decoding_type: MTP
-        num_nextn_predict_layers: 3
-
+        num_nextn_predict_layers: 1
 benchmark:
-  type: "sa-bench"
-  isl: 8192
+  type: sa-bench
+  isl: 1024
   osl: 1024
-  concurrencies: "167"
-  req_rate: "inf"
+  concurrencies: "2151"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
-
 frontend:
-  type: "dynamo"
+  type: dynamo
   enable_multiple_frontends: false
-
 health_check:
   max_attempts: 360
   interval_seconds: 10
-
 dynamo:
   install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch32_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch32_eplb0_mtp0.yaml
index 7412a109..27dc86c3 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch32_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch32_eplb0_mtp0.yaml
@@ -1,48 +1,35 @@
-name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep32_batch32_eplb0_mtp0"
-
-# ctx: 1 prefill worker, TP4/EP4
-# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=32
-# concurrency: 1229
-
+name: glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep32_batch32_eplb0_mtp0
 model:
-  path: "nvidia/GLM5-NVFP4"
-  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
-  precision: "fp4"
-
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
 resources:
-  gpu_type: "gb200"
-
+  gpu_type: gb200
   prefill_nodes: 1
   prefill_workers: 1
   gpus_per_prefill: 4
-
   decode_workers: 1
   decode_nodes: 8
   gpus_per_decode: 32
-
   gpus_per_node: 4
-
 backend:
   type: trtllm
-
   prefill_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   decode_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   trtllm_config:
     prefill:
       tensor_parallel_size: 4
@@ -66,7 +53,6 @@ backend:
       cache_transceiver_config:
         backend: UCX
         max_tokens_in_buffer: 16384
-
     decode:
       tensor_parallel_size: 32
       moe_expert_parallel_size: 32
@@ -84,13 +70,13 @@ backend:
       cuda_graph_config:
         enable_padding: true
         batch_sizes:
-          - 1
-          - 2
-          - 4
-          - 8
-          - 16
-          - 24
-          - 32
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32
       moe_config:
         backend: CUTEDSL
         use_low_precision_moe_combine: true
@@ -103,27 +89,24 @@ backend:
         max_tokens_in_buffer: 16384
       nvfp4_gemm_config:
         allowed_backends:
-          - cutlass
-          - cublaslt
-          - cutedsl
-          - cuda_core
-
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
 benchmark:
-  type: "sa-bench"
+  type: sa-bench
   isl: 1024
   osl: 1024
   concurrencies: "1229"
-  req_rate: "inf"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
-
 frontend:
-  type: "dynamo"
+  type: dynamo
   enable_multiple_frontends: false
-
 health_check:
   max_attempts: 360
   interval_seconds: 10
-
 dynamo:
   install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch512_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch512_eplb0_mtp0.yaml
new file mode 100644
index 00000000..ddf38c05
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch512_eplb0_mtp0.yaml
@@ -0,0 +1,172 @@
+name: glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep8_batch512_eplb0_mtp0
+model:
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
+resources:
+  gpu_type: gb200
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+  decode_workers: 1
+  decode_nodes: 2
+  gpus_per_decode: 8
+  gpus_per_node: 4
+backend:
+  type: trtllm
+  prefill_environment:
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+  decode_environment:
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16384
+      max_seq_len: 1064
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+    decode:
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 512
+      max_num_tokens: 512
+      max_seq_len: 2088
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32
+        - 40
+        - 48
+        - 56
+        - 64
+        - 72
+        - 80
+        - 88
+        - 96
+        - 104
+        - 112
+        - 120
+        - 128
+        - 136
+        - 144
+        - 152
+        - 160
+        - 168
+        - 176
+        - 184
+        - 192
+        - 200
+        - 208
+        - 216
+        - 224
+        - 232
+        - 240
+        - 248
+        - 256
+        - 264
+        - 272
+        - 280
+        - 288
+        - 296
+        - 304
+        - 312
+        - 320
+        - 328
+        - 336
+        - 344
+        - 352
+        - 360
+        - 368
+        - 376
+        - 384
+        - 392
+        - 400
+        - 408
+        - 416
+        - 424
+        - 432
+        - 440
+        - 448
+        - 456
+        - 464
+        - 472
+        - 480
+        - 488
+        - 496
+        - 504
+        - 512
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
+benchmark:
+  type: sa-bench
+  isl: 1024
+  osl: 1024
+  concurrencies: "4301"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
+  custom_tokenizer: "glm_moe_dsa"
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml
index e969c07d..7d26a743 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml
@@ -1,48 +1,35 @@
-name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0"
-
-# ctx: 1 prefill worker, TP4/EP4
-# gen: 4 decode workers, TP8/EP8, enable_attention_dp=false, max_batch=128
-# Merged concurrencies: batch1(4), batch32(180), batch64(360), batch128(616)
-
+name: glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0
 model:
-  path: "nvidia/GLM5-NVFP4"
-  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
-  precision: "fp4"
-
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
 resources:
-  gpu_type: "gb200"
-
+  gpu_type: gb200
   prefill_nodes: 1
   prefill_workers: 1
   gpus_per_prefill: 4
-
   decode_workers: 4
   decode_nodes: 8
   gpus_per_decode: 8
-
   gpus_per_node: 4
-
 backend:
   type: trtllm
-
   prefill_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   decode_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   trtllm_config:
     prefill:
       tensor_parallel_size: 4
@@ -66,9 +53,7 @@ backend:
       cache_transceiver_config:
         backend: UCX
         max_tokens_in_buffer: 16384
-
     decode:
-      allreduce_strategy: MNNVL
       tensor_parallel_size: 8
       moe_expert_parallel_size: 8
       pipeline_parallel_size: 1
@@ -85,25 +70,25 @@ backend:
       cuda_graph_config:
         enable_padding: true
         batch_sizes:
-          - 1
-          - 2
-          - 4
-          - 8
-          - 16
-          - 24
-          - 32
-          - 40
-          - 48
-          - 56
-          - 64
-          - 72
-          - 80
-          - 88
-          - 96
-          - 104
-          - 112
-          - 120
-          - 128
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32
+        - 40
+        - 48
+        - 56
+        - 64
+        - 72
+        - 80
+        - 88
+        - 96
+        - 104
+        - 112
+        - 120
+        - 128
       moe_config:
         backend: TRTLLM
         use_low_precision_moe_combine: true
@@ -116,27 +101,25 @@ backend:
         max_tokens_in_buffer: 16384
       nvfp4_gemm_config:
         allowed_backends:
-          - cutlass
-          - cublaslt
-          - cutedsl
-          - cuda_core
-
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
+      allreduce_strategy: MNNVL
 benchmark:
-  type: "sa-bench"
+  type: sa-bench
   isl: 1024
   osl: 1024
-  concurrencies: "4x180x360x616"
-  req_rate: "inf"
+  concurrencies: "84x168x336x616"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
-
 frontend:
-  type: "dynamo"
+  type: dynamo
   enable_multiple_frontends: false
-
 health_check:
   max_attempts: 360
   interval_seconds: 10
-
 dynamo:
   install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch4_allconc_eplb0_mtp0.yaml
similarity index 61%
rename from recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml
rename to recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch4_allconc_eplb0_mtp0.yaml
index fb583747..0a19b8b4 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch4_allconc_eplb0_mtp0.yaml
@@ -1,48 +1,35 @@
-name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0"
-
-# ctx: 1 prefill worker, TP4/EP4
-# gen: 5 decode workers, TP4/EP4, enable_attention_dp=false, max_batch=8
-# Merged concurrencies: batch1(5), batch2(15), batch4(30), batch8(50)
-
+name: glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen5tep4_batch4_allconc_eplb0_mtp0
 model:
-  path: "nvidia/GLM5-NVFP4"
-  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
-  precision: "fp4"
-
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
 resources:
-  gpu_type: "gb200"
-
+  gpu_type: gb200
   prefill_nodes: 1
   prefill_workers: 1
   gpus_per_prefill: 4
-
   decode_workers: 5
   decode_nodes: 5
   gpus_per_decode: 4
-
   gpus_per_node: 4
-
 backend:
   type: trtllm
-
   prefill_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   decode_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   trtllm_config:
     prefill:
       tensor_parallel_size: 4
@@ -66,7 +53,6 @@ backend:
       cache_transceiver_config:
         backend: UCX
         max_tokens_in_buffer: 16384
-
     decode:
       tensor_parallel_size: 4
       moe_expert_parallel_size: 4
@@ -75,8 +61,8 @@ backend:
       enable_lm_head_tp_in_adp: false
       trust_remote_code: true
       custom_tokenizer: "glm_moe_dsa"
-      max_batch_size: 8
-      max_num_tokens: 8
+      max_batch_size: 4
+      max_num_tokens: 4
       max_seq_len: 2088
       print_iter_log: true
       stream_interval: 100
@@ -84,10 +70,9 @@ backend:
       cuda_graph_config:
         enable_padding: true
         batch_sizes:
-          - 1
-          - 2
-          - 4
-          - 8
+        - 1
+        - 2
+        - 4
       moe_config:
         backend: TRTLLM
         use_low_precision_moe_combine: true
@@ -100,27 +85,24 @@ backend:
         max_tokens_in_buffer: 16384
       nvfp4_gemm_config:
         allowed_backends:
-          - cutlass
-          - cublaslt
-          - cutedsl
-          - cuda_core
-
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
 benchmark:
-  type: "sa-bench"
+  type: sa-bench
   isl: 1024
   osl: 1024
-  concurrencies: "5x15x30x50"
-  req_rate: "inf"
+  concurrencies: "5x15x25"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
-
 frontend:
-  type: "dynamo"
+  type: dynamo
   enable_multiple_frontends: false
-
 health_check:
   max_attempts: 360
   interval_seconds: 10
-
 dynamo:
   install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch512_eplb256_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch512_eplb256_mtp0.yaml
deleted file mode 100644
index d221dde2..00000000
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch512_eplb256_mtp0.yaml
+++ /dev/null
@@ -1,193 +0,0 @@
-name: "glm5_nvfp4_ISL1K_OSL1K_ctx2dep4_gen1dep16_batch512_eplb256_mtp0"
-
-# ctx: 2 prefill workers, TP4/EP4
-# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=512
-# EPLB: num_slots=256
-# concurrency: 8192
-
-model:
-  path: "nvidia/GLM5-NVFP4"
-  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
-  precision: "fp4"
-
-resources:
-  gpu_type: "gb200"
-
-  prefill_nodes: 2
-  prefill_workers: 2
-  gpus_per_prefill: 4
-
-  decode_workers: 1
-  decode_nodes: 4
-  gpus_per_decode: 16
-
-  gpus_per_node: 4
-
-backend:
-  type: trtllm
-
-  prefill_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
-  decode_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
-  trtllm_config:
-    prefill:
-      tensor_parallel_size: 4
-      moe_expert_parallel_size: 4
-      pipeline_parallel_size: 1
-      enable_attention_dp: true
-      disable_overlap_scheduler: true
-      trust_remote_code: true
-      custom_tokenizer: "glm_moe_dsa"
-      max_batch_size: 16
-      max_num_tokens: 16384
-      max_seq_len: 1064
-      print_iter_log: true
-      cuda_graph_config: null
-      moe_config:
-        backend: CUTEDSL
-      kv_cache_config:
-        dtype: fp8
-        enable_block_reuse: false
-        free_gpu_memory_fraction: 0.6
-      cache_transceiver_config:
-        backend: UCX
-        max_tokens_in_buffer: 16384
-
-    decode:
-      tensor_parallel_size: 16
-      moe_expert_parallel_size: 16
-      pipeline_parallel_size: 1
-      enable_attention_dp: true
-      enable_lm_head_tp_in_adp: false
-      trust_remote_code: true
-      custom_tokenizer: "glm_moe_dsa"
-      max_batch_size: 512
-      max_num_tokens: 512
-      max_seq_len: 2088
-      print_iter_log: true
-      stream_interval: 100
-      num_postprocess_workers: 4
-      cuda_graph_config:
-        enable_padding: true
-        batch_sizes:
-          - 1
-          - 2
-          - 4
-          - 8
-          - 16
-          - 24
-          - 32
-          - 40
-          - 48
-          - 56
-          - 64
-          - 72
-          - 80
-          - 88
-          - 96
-          - 104
-          - 112
-          - 120
-          - 128
-          - 136
-          - 144
-          - 152
-          - 160
-          - 168
-          - 176
-          - 184
-          - 192
-          - 200
-          - 208
-          - 216
-          - 224
-          - 232
-          - 240
-          - 248
-          - 256
-          - 264
-          - 272
-          - 280
-          - 288
-          - 296
-          - 304
-          - 312
-          - 320
-          - 328
-          - 336
-          - 344
-          - 352
-          - 360
-          - 368
-          - 376
-          - 384
-          - 392
-          - 400
-          - 408
-          - 416
-          - 424
-          - 432
-          - 440
-          - 448
-          - 456
-          - 464
-          - 472
-          - 480
-          - 488
-          - 496
-          - 504
-          - 512
-      moe_config:
-        backend: CUTEDSL
-        use_low_precision_moe_combine: true
-        load_balancer:
-          layer_updates_per_iter: 1
-          num_slots: 256
-      kv_cache_config:
-        dtype: fp8
-        enable_block_reuse: false
-        free_gpu_memory_fraction: 0.75
-      cache_transceiver_config:
-        backend: UCX
-        max_tokens_in_buffer: 16384
-      nvfp4_gemm_config:
-        allowed_backends:
-          - cutlass
-          - cublaslt
-          - cutedsl
-          - cuda_core
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "8192"
-  req_rate: "inf"
-  custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
-
-frontend:
-  type: "dynamo"
-  enable_multiple_frontends: false
-
-health_check:
-  max_attempts: 360
-  interval_seconds: 10
-
-dynamo:
-  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch64_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch64_eplb0_mtp0.yaml
index bbad79c1..cfecb846 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch64_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch64_eplb0_mtp0.yaml
@@ -1,48 +1,35 @@
-name: "glm5_nvfp4_ISL1K_OSL1K_ctx2dep4_gen1dep32_batch64_eplb0_mtp0"
-
-# ctx: 2 prefill workers, TP4/EP4
-# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=64
-# concurrency: 2253
-
+name: glm5_nvfp4_ISL1K_OSL1K_ctx2dep4_gen1dep32_batch64_eplb0_mtp0
 model:
-  path: "nvidia/GLM5-NVFP4"
-  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
-  precision: "fp4"
-
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
 resources:
-  gpu_type: "gb200"
-
+  gpu_type: gb200
   prefill_nodes: 2
   prefill_workers: 2
   gpus_per_prefill: 4
-
   decode_workers: 1
   decode_nodes: 8
   gpus_per_decode: 32
-
   gpus_per_node: 4
-
 backend:
   type: trtllm
-
   prefill_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   decode_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   trtllm_config:
     prefill:
       tensor_parallel_size: 4
@@ -66,7 +53,6 @@ backend:
       cache_transceiver_config:
         backend: UCX
         max_tokens_in_buffer: 16384
-
     decode:
       tensor_parallel_size: 32
       moe_expert_parallel_size: 32
@@ -84,17 +70,17 @@ backend:
       cuda_graph_config:
         enable_padding: true
         batch_sizes:
-          - 1
-          - 2
-          - 4
-          - 8
-          - 16
-          - 24
-          - 32
-          - 40
-          - 48
-          - 56
-          - 64
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32
+        - 40
+        - 48
+        - 56
+        - 64
       moe_config:
         backend: CUTEDSL
         use_low_precision_moe_combine: true
@@ -107,27 +93,24 @@ backend:
         max_tokens_in_buffer: 16384
       nvfp4_gemm_config:
         allowed_backends:
-          - cutlass
-          - cublaslt
-          - cutedsl
-          - cuda_core
-
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
 benchmark:
-  type: "sa-bench"
+  type: sa-bench
   isl: 1024
   osl: 1024
   concurrencies: "2253"
-  req_rate: "inf"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
-
 frontend:
-  type: "dynamo"
+  type: dynamo
   enable_multiple_frontends: false
-
 health_check:
   max_attempts: 360
   interval_seconds: 10
-
 dynamo:
   install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch128_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx3dep4_gen1dep32_batch128_eplb0_mtp0.yaml
similarity index 54%
rename from recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch128_eplb0_mtp0.yaml
rename to recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx3dep4_gen1dep32_batch128_eplb0_mtp0.yaml
index e057ce05..7430fdb3 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch128_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx3dep4_gen1dep32_batch128_eplb0_mtp0.yaml
@@ -1,48 +1,35 @@
-name: "glm5_nvfp4_ISL1K_OSL1K_ctx2dep4_gen1dep16_batch128_eplb0_mtp0"
-
-# ctx: 2 prefill workers, TP4/EP4
-# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=128
-# concurrency: 2253
-
+name: glm5_nvfp4_ISL1K_OSL1K_ctx3dep4_gen1dep32_batch128_eplb0_mtp0
 model:
-  path: "nvidia/GLM5-NVFP4"
-  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
-  precision: "fp4"
-
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
 resources:
-  gpu_type: "gb200"
-
-  prefill_nodes: 2
-  prefill_workers: 2
+  gpu_type: gb200
+  prefill_nodes: 3
+  prefill_workers: 3
   gpus_per_prefill: 4
-
   decode_workers: 1
-  decode_nodes: 4
-  gpus_per_decode: 16
-
+  decode_nodes: 8
+  gpus_per_decode: 32
   gpus_per_node: 4
-
 backend:
   type: trtllm
-
   prefill_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   decode_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   trtllm_config:
     prefill:
       tensor_parallel_size: 4
@@ -66,10 +53,9 @@ backend:
       cache_transceiver_config:
         backend: UCX
         max_tokens_in_buffer: 16384
-
     decode:
-      tensor_parallel_size: 16
-      moe_expert_parallel_size: 16
+      tensor_parallel_size: 32
+      moe_expert_parallel_size: 32
       pipeline_parallel_size: 1
       enable_attention_dp: true
       enable_lm_head_tp_in_adp: false
@@ -84,58 +70,55 @@ backend:
       cuda_graph_config:
         enable_padding: true
         batch_sizes:
-          - 1
-          - 2
-          - 4
-          - 8
-          - 16
-          - 24
-          - 32
-          - 40
-          - 48
-          - 56
-          - 64
-          - 72
-          - 80
-          - 88
-          - 96
-          - 104
-          - 112
-          - 120
-          - 128
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32
+        - 40
+        - 48
+        - 56
+        - 64
+        - 72
+        - 80
+        - 88
+        - 96
+        - 104
+        - 112
+        - 120
+        - 128
       moe_config:
         backend: CUTEDSL
         use_low_precision_moe_combine: true
       kv_cache_config:
         dtype: fp8
         enable_block_reuse: false
-        free_gpu_memory_fraction: 0.75
+        free_gpu_memory_fraction: 0.7
       cache_transceiver_config:
         backend: UCX
         max_tokens_in_buffer: 16384
       nvfp4_gemm_config:
         allowed_backends:
-          - cutlass
-          - cublaslt
-          - cutedsl
-          - cuda_core
-
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
 benchmark:
-  type: "sa-bench"
+  type: sa-bench
   isl: 1024
   osl: 1024
-  concurrencies: "2253"
-  req_rate: "inf"
+  concurrencies: "4301"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
-
 frontend:
-  type: "dynamo"
+  type: dynamo
   enable_multiple_frontends: false
-
 health_check:
   max_attempts: 360
   interval_seconds: 10
-
 dynamo:
   install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx4dep4_gen1dep32_batch256_eplb288_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx4dep4_gen1dep32_batch256_eplb288_mtp0.yaml
index 26d2d29e..4d136e16 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx4dep4_gen1dep32_batch256_eplb288_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx4dep4_gen1dep32_batch256_eplb288_mtp0.yaml
@@ -1,49 +1,35 @@
-name: "glm5_nvfp4_ISL1K_OSL1K_ctx4dep4_gen1dep32_batch256_eplb288_mtp0"
-
-# ctx: 4 prefill workers, TP4/EP4
-# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=256
-# EPLB: num_slots=288
-# concurrency: 8192
-
+name: glm5_nvfp4_ISL1K_OSL1K_ctx4dep4_gen1dep32_batch256_eplb288_mtp0
 model:
-  path: "nvidia/GLM5-NVFP4"
-  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
-  precision: "fp4"
-
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
 resources:
-  gpu_type: "gb200"
-
+  gpu_type: gb200
   prefill_nodes: 4
   prefill_workers: 4
   gpus_per_prefill: 4
-
   decode_workers: 1
   decode_nodes: 8
   gpus_per_decode: 32
-
   gpus_per_node: 4
-
 backend:
   type: trtllm
-
   prefill_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   decode_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   trtllm_config:
     prefill:
       tensor_parallel_size: 4
@@ -67,7 +53,6 @@ backend:
       cache_transceiver_config:
         backend: UCX
         max_tokens_in_buffer: 16384
-
     decode:
       tensor_parallel_size: 32
       moe_expert_parallel_size: 32
@@ -85,41 +70,41 @@ backend:
       cuda_graph_config:
         enable_padding: true
         batch_sizes:
-          - 1
-          - 2
-          - 4
-          - 8
-          - 16
-          - 24
-          - 32
-          - 40
-          - 48
-          - 56
-          - 64
-          - 72
-          - 80
-          - 88
-          - 96
-          - 104
-          - 112
-          - 120
-          - 128
-          - 136
-          - 144
-          - 152
-          - 160
-          - 168
-          - 176
-          - 184
-          - 192
-          - 200
-          - 208
-          - 216
-          - 224
-          - 232
-          - 240
-          - 248
-          - 256
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32
+        - 40
+        - 48
+        - 56
+        - 64
+        - 72
+        - 80
+        - 88
+        - 96
+        - 104
+        - 112
+        - 120
+        - 128
+        - 136
+        - 144
+        - 152
+        - 160
+        - 168
+        - 176
+        - 184
+        - 192
+        - 200
+        - 208
+        - 216
+        - 224
+        - 232
+        - 240
+        - 248
+        - 256
       moe_config:
         backend: CUTEDSL
         use_low_precision_moe_combine: true
@@ -135,27 +120,24 @@ backend:
         max_tokens_in_buffer: 16384
       nvfp4_gemm_config:
         allowed_backends:
-          - cutlass
-          - cublaslt
-          - cutedsl
-          - cuda_core
-
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
 benchmark:
-  type: "sa-bench"
+  type: sa-bench
   isl: 1024
   osl: 1024
   concurrencies: "8192"
-  req_rate: "inf"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
-
 frontend:
-  type: "dynamo"
+  type: dynamo
   enable_multiple_frontends: false
-
 health_check:
   max_attempts: 360
   interval_seconds: 10
-
 dynamo:
   install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx10dep4_gen1dep16_batch64_eplb0_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx10dep4_gen1dep16_batch64_eplb0_mtp1.yaml
index 420192c2..55ccb8ce 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx10dep4_gen1dep16_batch64_eplb0_mtp1.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx10dep4_gen1dep16_batch64_eplb0_mtp1.yaml
@@ -1,48 +1,35 @@
-name: "glm5_nvfp4_ISL8K_OSL1K_ctx10dep4_gen1dep16_batch64_eplb0_mtp1"
-
-# ctx: 10 prefill workers, TP4/EP4
-# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=64
-# concurrency: 1229
-
+name: glm5_nvfp4_ISL8K_OSL1K_ctx10dep4_gen1dep16_batch64_eplb0_mtp1
 model:
-  path: "nvidia/GLM5-NVFP4"
-  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
-  precision: "fp4"
-
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
 resources:
-  gpu_type: "gb200"
-
+  gpu_type: gb200
   prefill_nodes: 10
   prefill_workers: 10
   gpus_per_prefill: 4
-
   decode_workers: 1
   decode_nodes: 4
   gpus_per_decode: 16
-
   gpus_per_node: 4
-
 backend:
   type: trtllm
-
   prefill_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   decode_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   trtllm_config:
     prefill:
       tensor_parallel_size: 4
@@ -69,7 +56,6 @@ backend:
       speculative_config:
         decoding_type: MTP
         num_nextn_predict_layers: 1
-
     decode:
       tensor_parallel_size: 16
       moe_expert_parallel_size: 16
@@ -87,17 +73,17 @@ backend:
       cuda_graph_config:
         enable_padding: true
         batch_sizes:
-          - 1
-          - 2
-          - 4
-          - 8
-          - 16
-          - 24
-          - 32
-          - 40
-          - 48
-          - 56
-          - 64
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32
+        - 40
+        - 48
+        - 56
+        - 64
       moe_config:
         backend: CUTEDSL
         use_low_precision_moe_combine: true
@@ -110,30 +96,27 @@ backend:
         max_tokens_in_buffer: 16384
       nvfp4_gemm_config:
         allowed_backends:
-          - cutlass
-          - cublaslt
-          - cutedsl
-          - cuda_core
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
       speculative_config:
         decoding_type: MTP
         num_nextn_predict_layers: 1
-
 benchmark:
-  type: "sa-bench"
+  type: sa-bench
   isl: 8192
   osl: 1024
   concurrencies: "1229"
-  req_rate: "inf"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
-
 frontend:
-  type: "dynamo"
+  type: dynamo
   enable_multiple_frontends: false
-
 health_check:
   max_attempts: 360
   interval_seconds: 10
-
 dynamo:
   install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen2tep8_batch16_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen2tep8_batch16_eplb0_mtp3.yaml
index da3186e5..e585b7d7 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen2tep8_batch16_eplb0_mtp3.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen2tep8_batch16_eplb0_mtp3.yaml
@@ -1,47 +1,35 @@
-name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen2tep8_batch16_eplb0_mtp3"
-
-# ctx: 1 prefill worker, TP4/EP4
-# gen: 2 decode workers, TP8/EP8, max_batch=16, concurrency: 46
-
+name: glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen2tep8_batch16_eplb0_mtp3
 model:
-  path: "nvidia/GLM5-NVFP4"
-  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
-  precision: "fp4"
-
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
 resources:
-  gpu_type: "gb200"
-
+  gpu_type: gb200
   prefill_nodes: 1
   prefill_workers: 1
   gpus_per_prefill: 4
-
   decode_workers: 2
   decode_nodes: 4
   gpus_per_decode: 8
-
   gpus_per_node: 4
-
 backend:
   type: trtllm
-
   prefill_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   decode_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   trtllm_config:
     prefill:
       tensor_parallel_size: 4
@@ -68,9 +56,7 @@ backend:
       speculative_config:
         decoding_type: MTP
         num_nextn_predict_layers: 3
-
     decode:
-      allreduce_strategy: MNNVL
       tensor_parallel_size: 8
       moe_expert_parallel_size: 8
       pipeline_parallel_size: 1
@@ -87,11 +73,11 @@ backend:
       cuda_graph_config:
         enable_padding: true
         batch_sizes:
-          - 1
-          - 2
-          - 4
-          - 8
-          - 16
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
       moe_config:
         backend: TRTLLM
         use_low_precision_moe_combine: true
@@ -104,30 +90,28 @@ backend:
         max_tokens_in_buffer: 16384
       nvfp4_gemm_config:
         allowed_backends:
-          - cutlass
-          - cublaslt
-          - cutedsl
-          - cuda_core
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
       speculative_config:
         decoding_type: MTP
         num_nextn_predict_layers: 3
-
+      allreduce_strategy: MNNVL
 benchmark:
-  type: "sa-bench"
+  type: sa-bench
   isl: 8192
   osl: 1024
   concurrencies: "46"
-  req_rate: "inf"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
-
 frontend:
-  type: "dynamo"
+  type: dynamo
   enable_multiple_frontends: false
-
 health_check:
   max_attempts: 360
   interval_seconds: 10
-
 dynamo:
   install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch16_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch1_eplb0_mtp3.yaml
similarity index 59%
rename from recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch16_eplb0_mtp2.yaml
rename to recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch1_eplb0_mtp3.yaml
index d6d3dcf1..9d93a18c 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch16_eplb0_mtp2.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch1_eplb0_mtp3.yaml
@@ -1,48 +1,35 @@
-name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen4tep8_batch16_eplb0_mtp2"
-
-# ctx: 1 prefill worker, TP4/EP4
-# gen: 4 decode workers, TP8/EP8, enable_attention_dp=false, max_batch=16
-# concurrency: 96
-
+name: glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen4tep8_batch1_eplb0_mtp3
 model:
-  path: "nvidia/GLM5-NVFP4"
-  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
-  precision: "fp4"
-
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
 resources:
-  gpu_type: "gb200"
-
+  gpu_type: gb200
   prefill_nodes: 1
   prefill_workers: 1
   gpus_per_prefill: 4
-
   decode_workers: 4
   decode_nodes: 8
   gpus_per_decode: 8
-
   gpus_per_node: 4
-
 backend:
   type: trtllm
-
   prefill_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   decode_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   trtllm_config:
     prefill:
       tensor_parallel_size: 4
@@ -52,9 +39,9 @@ backend:
       disable_overlap_scheduler: true
       trust_remote_code: true
       custom_tokenizer: "glm_moe_dsa"
-      max_batch_size: 16
-      max_num_tokens: 16384
-      max_seq_len: 1064
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
       print_iter_log: true
       cuda_graph_config: null
       moe_config:
@@ -68,10 +55,8 @@ backend:
         max_tokens_in_buffer: 16384
       speculative_config:
         decoding_type: MTP
-        num_nextn_predict_layers: 2
-
+        num_nextn_predict_layers: 3
     decode:
-      allreduce_strategy: MNNVL
       tensor_parallel_size: 8
       moe_expert_parallel_size: 8
       pipeline_parallel_size: 1
@@ -79,20 +64,16 @@ backend:
       enable_lm_head_tp_in_adp: false
       trust_remote_code: true
       custom_tokenizer: "glm_moe_dsa"
-      max_batch_size: 16
-      max_num_tokens: 48
-      max_seq_len: 2088
+      max_batch_size: 1
+      max_num_tokens: 4
+      max_seq_len: 9256
       print_iter_log: true
       stream_interval: 100
       num_postprocess_workers: 4
       cuda_graph_config:
         enable_padding: true
         batch_sizes:
-          - 1
-          - 2
-          - 4
-          - 8
-          - 16
+        - 1
       moe_config:
         backend: TRTLLM
         use_low_precision_moe_combine: true
@@ -105,30 +86,28 @@ backend:
         max_tokens_in_buffer: 16384
       nvfp4_gemm_config:
         allowed_backends:
-          - cutlass
-          - cublaslt
-          - cutedsl
-          - cuda_core
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
       speculative_config:
         decoding_type: MTP
-        num_nextn_predict_layers: 2
-
+        num_nextn_predict_layers: 3
+      allreduce_strategy: MNNVL
 benchmark:
-  type: "sa-bench"
-  isl: 1024
+  type: sa-bench
+  isl: 8192
   osl: 1024
-  concurrencies: "96"
-  req_rate: "inf"
+  concurrencies: "8"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
-
 frontend:
-  type: "dynamo"
+  type: dynamo
   enable_multiple_frontends: false
-
 health_check:
   max_attempts: 360
   interval_seconds: 10
-
 dynamo:
   install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch8_allconc_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch8_eplb0_mtp3.yaml
similarity index 67%
rename from recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch8_allconc_eplb0_mtp3.yaml
rename to recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch8_eplb0_mtp3.yaml
index fb94a549..7326e4bd 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch8_allconc_eplb0_mtp3.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch8_eplb0_mtp3.yaml
@@ -1,48 +1,35 @@
-name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen4tep8_batch8_allconc_eplb0_mtp3"
-
-# ctx: 1 prefill worker, TP4/EP4
-# gen: 4 decode workers, TP8/EP8, max_batch=8
-# concurrencies: 4 (batch1), 48 (batch8)
-
+name: glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen4tep8_batch8_eplb0_mtp3
 model:
-  path: "nvidia/GLM5-NVFP4"
-  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
-  precision: "fp4"
-
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
 resources:
-  gpu_type: "gb200"
-
+  gpu_type: gb200
   prefill_nodes: 1
   prefill_workers: 1
   gpus_per_prefill: 4
-
   decode_workers: 4
   decode_nodes: 8
   gpus_per_decode: 8
-
   gpus_per_node: 4
-
 backend:
   type: trtllm
-
   prefill_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   decode_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   trtllm_config:
     prefill:
       tensor_parallel_size: 4
@@ -69,9 +56,7 @@ backend:
       speculative_config:
         decoding_type: MTP
         num_nextn_predict_layers: 3
-
     decode:
-      allreduce_strategy: MNNVL
       tensor_parallel_size: 8
       moe_expert_parallel_size: 8
       pipeline_parallel_size: 1
@@ -88,10 +73,10 @@ backend:
       cuda_graph_config:
         enable_padding: true
         batch_sizes:
-          - 1
-          - 2
-          - 4
-          - 8
+        - 1
+        - 2
+        - 4
+        - 8
       moe_config:
         backend: TRTLLM
         use_low_precision_moe_combine: true
@@ -104,30 +89,28 @@ backend:
         max_tokens_in_buffer: 16384
       nvfp4_gemm_config:
         allowed_backends:
-          - cutlass
-          - cublaslt
-          - cutedsl
-          - cuda_core
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
       speculative_config:
         decoding_type: MTP
         num_nextn_predict_layers: 3
-
+      allreduce_strategy: MNNVL
 benchmark:
-  type: "sa-bench"
+  type: sa-bench
   isl: 8192
   osl: 1024
-  concurrencies: "4x48"
-  req_rate: "inf"
+  concurrencies: "48"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
-
 frontend:
-  type: "dynamo"
+  type: dynamo
   enable_multiple_frontends: false
-
 health_check:
   max_attempts: 360
   interval_seconds: 10
-
 dynamo:
   install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch2_eplb0_mtp3.yaml
similarity index 66%
rename from recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml
rename to recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch2_eplb0_mtp3.yaml
index 0a13cce4..8d33decc 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch2_eplb0_mtp3.yaml
@@ -1,47 +1,35 @@
-name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen5tep4_batch1_eplb0_mtp3"
-
-# ctx: 1 prefill worker, TP4/EP4
-# gen: 5 decode workers, TP4/EP4, max_batch=1, concurrency: 5
-
+name: glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen5tep4_batch2_eplb0_mtp3
 model:
-  path: "nvidia/GLM5-NVFP4"
-  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
-  precision: "fp4"
-
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
 resources:
-  gpu_type: "gb200"
-
+  gpu_type: gb200
   prefill_nodes: 1
   prefill_workers: 1
   gpus_per_prefill: 4
-
   decode_workers: 5
   decode_nodes: 5
   gpus_per_decode: 4
-
   gpus_per_node: 4
-
 backend:
   type: trtllm
-
   prefill_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   decode_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   trtllm_config:
     prefill:
       tensor_parallel_size: 4
@@ -68,7 +56,6 @@ backend:
       speculative_config:
         decoding_type: MTP
         num_nextn_predict_layers: 3
-
     decode:
       tensor_parallel_size: 4
       moe_expert_parallel_size: 4
@@ -77,8 +64,8 @@ backend:
       enable_lm_head_tp_in_adp: false
       trust_remote_code: true
       custom_tokenizer: "glm_moe_dsa"
-      max_batch_size: 1
-      max_num_tokens: 4
+      max_batch_size: 2
+      max_num_tokens: 8
       max_seq_len: 9256
       print_iter_log: true
       stream_interval: 100
@@ -86,9 +73,8 @@ backend:
       cuda_graph_config:
         enable_padding: true
         batch_sizes:
-          - 1
-          - 2
-          - 4
+        - 1
+        - 2
       moe_config:
         backend: TRTLLM
         use_low_precision_moe_combine: true
@@ -101,30 +87,27 @@ backend:
         max_tokens_in_buffer: 16384
       nvfp4_gemm_config:
         allowed_backends:
-          - cutlass
-          - cublaslt
-          - cutedsl
-          - cuda_core
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
       speculative_config:
         decoding_type: MTP
         num_nextn_predict_layers: 3
-
 benchmark:
-  type: "sa-bench"
+  type: sa-bench
   isl: 8192
   osl: 1024
-  concurrencies: "5"
-  req_rate: "inf"
+  concurrencies: "15"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
-
 frontend:
-  type: "dynamo"
+  type: dynamo
   enable_multiple_frontends: false
-
 health_check:
   max_attempts: 360
   interval_seconds: 10
-
 dynamo:
   install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx2dep4_gen3tep8_batch32_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx2dep4_gen3tep8_batch32_eplb0_mtp3.yaml
new file mode 100644
index 00000000..b27d7ddd
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx2dep4_gen3tep8_batch32_eplb0_mtp3.yaml
@@ -0,0 +1,119 @@
+name: glm5_nvfp4_ISL8K_OSL1K_ctx2dep4_gen3tep8_batch32_eplb0_mtp3
+model:
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
+resources:
+  gpu_type: gb200
+  prefill_nodes: 2
+  prefill_workers: 2
+  gpus_per_prefill: 4
+  decode_workers: 3
+  decode_nodes: 6
+  gpus_per_decode: 8
+  gpus_per_node: 4
+backend:
+  type: trtllm
+  prefill_environment:
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+  decode_environment:
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+    decode:
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 32
+      max_num_tokens: 128
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+      allreduce_strategy: MNNVL
+benchmark:
+  type: sa-bench
+  isl: 8192
+  osl: 1024
+  concurrencies: "144"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
+  custom_tokenizer: "glm_moe_dsa"
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch64_eplb0_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep16_batch16_eplb0_mtp3.yaml
similarity index 56%
rename from recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch64_eplb0_mtp1.yaml
rename to recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep16_batch16_eplb0_mtp3.yaml
index ebcd45d1..9f1be846 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch64_eplb0_mtp1.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep16_batch16_eplb0_mtp3.yaml
@@ -1,48 +1,35 @@
-name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep16_batch64_eplb0_mtp1"
-
-# ctx: 1 prefill worker, TP4/EP4
-# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=64
-# concurrency: 1229
-
+name: glm5_nvfp4_ISL8K_OSL1K_ctx5dep4_gen1dep16_batch16_eplb0_mtp3
 model:
-  path: "nvidia/GLM5-NVFP4"
-  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
-  precision: "fp4"
-
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
 resources:
-  gpu_type: "gb200"
-
-  prefill_nodes: 1
-  prefill_workers: 1
+  gpu_type: gb200
+  prefill_nodes: 5
+  prefill_workers: 5
   gpus_per_prefill: 4
-
   decode_workers: 1
   decode_nodes: 4
   gpus_per_decode: 16
-
   gpus_per_node: 4
-
 backend:
   type: trtllm
-
   prefill_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   decode_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   trtllm_config:
     prefill:
       tensor_parallel_size: 4
@@ -52,9 +39,9 @@ backend:
       disable_overlap_scheduler: true
       trust_remote_code: true
       custom_tokenizer: "glm_moe_dsa"
-      max_batch_size: 16
-      max_num_tokens: 16384
-      max_seq_len: 1064
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
       print_iter_log: true
       cuda_graph_config: null
       moe_config:
@@ -68,8 +55,7 @@ backend:
         max_tokens_in_buffer: 16384
       speculative_config:
         decoding_type: MTP
-        num_nextn_predict_layers: 1
-
+        num_nextn_predict_layers: 3
     decode:
       tensor_parallel_size: 16
       moe_expert_parallel_size: 16
@@ -78,26 +64,20 @@ backend:
       enable_lm_head_tp_in_adp: true
       trust_remote_code: true
       custom_tokenizer: "glm_moe_dsa"
-      max_batch_size: 64
-      max_num_tokens: 128
-      max_seq_len: 2088
+      max_batch_size: 16
+      max_num_tokens: 64
+      max_seq_len: 9256
       print_iter_log: true
       stream_interval: 100
       num_postprocess_workers: 4
       cuda_graph_config:
         enable_padding: true
         batch_sizes:
-          - 1
-          - 2
-          - 4
-          - 8
-          - 16
-          - 24
-          - 32
-          - 40
-          - 48
-          - 56
-          - 64
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
       moe_config:
         backend: CUTEDSL
         use_low_precision_moe_combine: true
@@ -110,30 +90,27 @@ backend:
         max_tokens_in_buffer: 16384
       nvfp4_gemm_config:
         allowed_backends:
-          - cutlass
-          - cublaslt
-          - cutedsl
-          - cuda_core
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
       speculative_config:
         decoding_type: MTP
-        num_nextn_predict_layers: 1
-
+        num_nextn_predict_layers: 3
 benchmark:
-  type: "sa-bench"
-  isl: 1024
+  type: sa-bench
+  isl: 8192
   osl: 1024
-  concurrencies: "1229"
-  req_rate: "inf"
+  concurrencies: "333"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
-
 frontend:
-  type: "dynamo"
+  type: dynamo
   enable_multiple_frontends: false
-
 health_check:
   max_attempts: 360
   interval_seconds: 10
-
 dynamo:
   install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep32_batch8_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep32_batch8_eplb0_mtp3.yaml
index 492f1b4c..54842280 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep32_batch8_eplb0_mtp3.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep32_batch8_eplb0_mtp3.yaml
@@ -1,48 +1,35 @@
-name: "glm5_nvfp4_ISL8K_OSL1K_ctx5dep4_gen1dep32_batch8_eplb0_mtp3"
-
-# ctx: 5 prefill workers, TP4/EP4
-# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=8
-# concurrency: 333
-
+name: glm5_nvfp4_ISL8K_OSL1K_ctx5dep4_gen1dep32_batch8_eplb0_mtp3
 model:
-  path: "nvidia/GLM5-NVFP4"
-  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
-  precision: "fp4"
-
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
 resources:
-  gpu_type: "gb200"
-
+  gpu_type: gb200
   prefill_nodes: 5
   prefill_workers: 5
   gpus_per_prefill: 4
-
   decode_workers: 1
   decode_nodes: 8
   gpus_per_decode: 32
-
   gpus_per_node: 4
-
 backend:
   type: trtllm
-
   prefill_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   decode_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   trtllm_config:
     prefill:
       tensor_parallel_size: 4
@@ -69,7 +56,6 @@ backend:
       speculative_config:
         decoding_type: MTP
         num_nextn_predict_layers: 3
-
     decode:
       tensor_parallel_size: 32
       moe_expert_parallel_size: 32
@@ -87,10 +73,10 @@ backend:
       cuda_graph_config:
         enable_padding: true
         batch_sizes:
-          - 1
-          - 2
-          - 4
-          - 8
+        - 1
+        - 2
+        - 4
+        - 8
       moe_config:
         backend: CUTEDSL
         use_low_precision_moe_combine: true
@@ -103,30 +89,27 @@ backend:
         max_tokens_in_buffer: 16384
       nvfp4_gemm_config:
         allowed_backends:
-          - cutlass
-          - cublaslt
-          - cutedsl
-          - cuda_core
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
       speculative_config:
         decoding_type: MTP
         num_nextn_predict_layers: 3
-
 benchmark:
-  type: "sa-bench"
+  type: sa-bench
   isl: 8192
   osl: 1024
   concurrencies: "333"
-  req_rate: "inf"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
-
 frontend:
-  type: "dynamo"
+  type: dynamo
   enable_multiple_frontends: false
-
 health_check:
   max_attempts: 360
   interval_seconds: 10
-
 dynamo:
   install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx7dep4_gen1dep16_batch32_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx7dep4_gen1dep16_batch32_eplb0_mtp2.yaml
index d22fbcf1..ab957385 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx7dep4_gen1dep16_batch32_eplb0_mtp2.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx7dep4_gen1dep16_batch32_eplb0_mtp2.yaml
@@ -1,48 +1,35 @@
-name: "glm5_nvfp4_ISL8K_OSL1K_ctx7dep4_gen1dep16_batch32_eplb0_mtp2"
-
-# ctx: 7 prefill workers, TP4/EP4
-# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=32
-# concurrency: 615
-
+name: glm5_nvfp4_ISL8K_OSL1K_ctx7dep4_gen1dep16_batch32_eplb0_mtp2
 model:
-  path: "nvidia/GLM5-NVFP4"
-  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
-  precision: "fp4"
-
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
 resources:
-  gpu_type: "gb200"
-
+  gpu_type: gb200
   prefill_nodes: 7
   prefill_workers: 7
   gpus_per_prefill: 4
-
   decode_workers: 1
   decode_nodes: 4
   gpus_per_decode: 16
-
   gpus_per_node: 4
-
 backend:
   type: trtllm
-
   prefill_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   decode_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   trtllm_config:
     prefill:
       tensor_parallel_size: 4
@@ -69,7 +56,6 @@ backend:
       speculative_config:
         decoding_type: MTP
         num_nextn_predict_layers: 2
-
     decode:
       tensor_parallel_size: 16
       moe_expert_parallel_size: 16
@@ -87,13 +73,13 @@ backend:
       cuda_graph_config:
         enable_padding: true
         batch_sizes:
-          - 1
-          - 2
-          - 4
-          - 8
-          - 16
-          - 24
-          - 32
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32
       moe_config:
         backend: CUTEDSL
         use_low_precision_moe_combine: true
@@ -106,30 +92,27 @@ backend:
         max_tokens_in_buffer: 16384
       nvfp4_gemm_config:
         allowed_backends:
-          - cutlass
-          - cublaslt
-          - cutedsl
-          - cuda_core
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
       speculative_config:
         decoding_type: MTP
         num_nextn_predict_layers: 2
-
 benchmark:
-  type: "sa-bench"
+  type: sa-bench
   isl: 8192
   osl: 1024
   concurrencies: "615"
-  req_rate: "inf"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
-
 frontend:
-  type: "dynamo"
+  type: dynamo
   enable_multiple_frontends: false
-
 health_check:
   max_attempts: 360
   interval_seconds: 10
-
 dynamo:
   install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx7dep4_gen1dep8_batch128_eplb0_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx7dep4_gen1dep8_batch128_eplb0_mtp1.yaml
index 804e89b5..9182158a 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx7dep4_gen1dep8_batch128_eplb0_mtp1.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx7dep4_gen1dep8_batch128_eplb0_mtp1.yaml
@@ -1,48 +1,35 @@
-name: "glm5_nvfp4_ISL8K_OSL1K_ctx7dep4_gen1dep8_batch128_eplb0_mtp1"
-
-# ctx: 7 prefill workers, TP4/EP4
-# gen: 1 decode worker, TP8/EP8, enable_attention_dp=true, max_batch=128
-# concurrency: 1076
-
+name: glm5_nvfp4_ISL8K_OSL1K_ctx7dep4_gen1dep8_batch128_eplb0_mtp1
 model:
-  path: "nvidia/GLM5-NVFP4"
-  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
-  precision: "fp4"
-
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
 resources:
-  gpu_type: "gb200"
-
+  gpu_type: gb200
   prefill_nodes: 7
   prefill_workers: 7
   gpus_per_prefill: 4
-
   decode_workers: 1
   decode_nodes: 2
   gpus_per_decode: 8
-
   gpus_per_node: 4
-
 backend:
   type: trtllm
-
   prefill_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   decode_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   trtllm_config:
     prefill:
       tensor_parallel_size: 4
@@ -69,7 +56,6 @@ backend:
       speculative_config:
         decoding_type: MTP
         num_nextn_predict_layers: 1
-
     decode:
       tensor_parallel_size: 8
       moe_expert_parallel_size: 8
@@ -87,25 +73,25 @@ backend:
       cuda_graph_config:
         enable_padding: true
         batch_sizes:
-          - 1
-          - 2
-          - 4
-          - 8
-          - 16
-          - 24
-          - 32
-          - 40
-          - 48
-          - 56
-          - 64
-          - 72
-          - 80
-          - 88
-          - 96
-          - 104
-          - 112
-          - 120
-          - 128
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32
+        - 40
+        - 48
+        - 56
+        - 64
+        - 72
+        - 80
+        - 88
+        - 96
+        - 104
+        - 112
+        - 120
+        - 128
       moe_config:
         backend: CUTEDSL
         use_low_precision_moe_combine: true
@@ -118,30 +104,27 @@ backend:
         max_tokens_in_buffer: 16384
       nvfp4_gemm_config:
         allowed_backends:
-          - cutlass
-          - cublaslt
-          - cutedsl
-          - cuda_core
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
       speculative_config:
         decoding_type: MTP
         num_nextn_predict_layers: 1
-
 benchmark:
-  type: "sa-bench"
+  type: sa-bench
   isl: 8192
   osl: 1024
-  concurrencies: "1076"
-  req_rate: "inf"
+  concurrencies: "1127"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
-
 frontend:
-  type: "dynamo"
+  type: dynamo
   enable_multiple_frontends: false
-
 health_check:
   max_attempts: 360
   interval_seconds: 10
-
 dynamo:
   install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx10dep4_gen1dep16_batch128_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx11dep4_gen1dep16_batch128_eplb0_mtp0.yaml
similarity index 59%
rename from recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx10dep4_gen1dep16_batch128_eplb0_mtp0.yaml
rename to recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx11dep4_gen1dep16_batch128_eplb0_mtp0.yaml
index 0fa8566d..ca299465 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx10dep4_gen1dep16_batch128_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx11dep4_gen1dep16_batch128_eplb0_mtp0.yaml
@@ -1,48 +1,35 @@
-name: "glm5_nvfp4_ISL8K_OSL1K_ctx10dep4_gen1dep16_batch128_eplb0_mtp0"
-
-# ctx: 10 prefill workers, TP4/EP4
-# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=128
-# concurrency: 2253
-
+name: glm5_nvfp4_ISL8K_OSL1K_ctx11dep4_gen1dep16_batch128_eplb0_mtp0
 model:
-  path: "nvidia/GLM5-NVFP4"
-  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
-  precision: "fp4"
-
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
 resources:
-  gpu_type: "gb200"
-
-  prefill_nodes: 10
-  prefill_workers: 10
+  gpu_type: gb200
+  prefill_nodes: 11
+  prefill_workers: 11
   gpus_per_prefill: 4
-
   decode_workers: 1
   decode_nodes: 4
   gpus_per_decode: 16
-
   gpus_per_node: 4
-
 backend:
   type: trtllm
-
   prefill_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   decode_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   trtllm_config:
     prefill:
       tensor_parallel_size: 4
@@ -66,7 +53,6 @@ backend:
       cache_transceiver_config:
         backend: UCX
         max_tokens_in_buffer: 16384
-
     decode:
       tensor_parallel_size: 16
       moe_expert_parallel_size: 16
@@ -84,25 +70,25 @@ backend:
       cuda_graph_config:
         enable_padding: true
         batch_sizes:
-          - 1
-          - 2
-          - 4
-          - 8
-          - 16
-          - 24
-          - 32
-          - 40
-          - 48
-          - 56
-          - 64
-          - 72
-          - 80
-          - 88
-          - 96
-          - 104
-          - 112
-          - 120
-          - 128
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32
+        - 40
+        - 48
+        - 56
+        - 64
+        - 72
+        - 80
+        - 88
+        - 96
+        - 104
+        - 112
+        - 120
+        - 128
       moe_config:
         backend: CUTEDSL
         use_low_precision_moe_combine: true
@@ -115,27 +101,24 @@ backend:
         max_tokens_in_buffer: 16384
       nvfp4_gemm_config:
         allowed_backends:
-          - cutlass
-          - cublaslt
-          - cutedsl
-          - cuda_core
-
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
 benchmark:
-  type: "sa-bench"
+  type: sa-bench
   isl: 8192
   osl: 1024
   concurrencies: "2253"
-  req_rate: "inf"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
-
 frontend:
-  type: "dynamo"
+  type: dynamo
   enable_multiple_frontends: false
-
 health_check:
   max_attempts: 360
   interval_seconds: 10
-
 dynamo:
   install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen2tep8_batch32_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen2tep8_batch32_eplb0_mtp0.yaml
index 478f6203..e857d27e 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen2tep8_batch32_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen2tep8_batch32_eplb0_mtp0.yaml
@@ -1,48 +1,35 @@
-name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen2tep8_batch32_eplb0_mtp0"
-
-# ctx: 1 prefill worker, TP4/EP4
-# gen: 2 decode workers, TP8/EP8, enable_attention_dp=false, max_batch=32
-# concurrency: 84
-
+name: glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen2tep8_batch32_eplb0_mtp0
 model:
-  path: "nvidia/GLM5-NVFP4"
-  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
-  precision: "fp4"
-
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
 resources:
-  gpu_type: "gb200"
-
+  gpu_type: gb200
   prefill_nodes: 1
   prefill_workers: 1
   gpus_per_prefill: 4
-
   decode_workers: 2
   decode_nodes: 4
   gpus_per_decode: 8
-
   gpus_per_node: 4
-
 backend:
   type: trtllm
-
   prefill_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   decode_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   trtllm_config:
     prefill:
       tensor_parallel_size: 4
@@ -66,9 +53,7 @@ backend:
       cache_transceiver_config:
         backend: UCX
         max_tokens_in_buffer: 16384
-
     decode:
-      allreduce_strategy: MNNVL
       tensor_parallel_size: 8
       moe_expert_parallel_size: 8
       pipeline_parallel_size: 1
@@ -85,13 +70,13 @@ backend:
       cuda_graph_config:
         enable_padding: true
         batch_sizes:
-          - 1
-          - 2
-          - 4
-          - 8
-          - 16
-          - 24
-          - 32
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32
       moe_config:
         backend: TRTLLM
         use_low_precision_moe_combine: true
@@ -104,27 +89,25 @@ backend:
         max_tokens_in_buffer: 16384
       nvfp4_gemm_config:
         allowed_backends:
-          - cutlass
-          - cublaslt
-          - cutedsl
-          - cuda_core
-
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
+      allreduce_strategy: MNNVL
 benchmark:
-  type: "sa-bench"
+  type: sa-bench
   isl: 8192
   osl: 1024
-  concurrencies: "84"
-  req_rate: "inf"
+  concurrencies: "78"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
-
 frontend:
-  type: "dynamo"
+  type: dynamo
   enable_multiple_frontends: false
-
 health_check:
   max_attempts: 360
   interval_seconds: 10
-
 dynamo:
   install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch16_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch16_eplb0_mtp0.yaml
new file mode 100644
index 00000000..6281c402
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch16_eplb0_mtp0.yaml
@@ -0,0 +1,111 @@
+name: glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen4tep8_batch16_eplb0_mtp0
+model:
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
+resources:
+  gpu_type: gb200
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+  decode_workers: 4
+  decode_nodes: 8
+  gpus_per_decode: 8
+  gpus_per_node: 4
+backend:
+  type: trtllm
+  prefill_environment:
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+  decode_environment:
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+    decode:
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 16
+      max_num_tokens: 16
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
+      allreduce_strategy: MNNVL
+benchmark:
+  type: sa-bench
+  isl: 8192
+  osl: 1024
+  concurrencies: "84"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
+  custom_tokenizer: "glm_moe_dsa"
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_eplb0_mtp0.yaml
new file mode 100644
index 00000000..7d9d1002
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_eplb0_mtp0.yaml
@@ -0,0 +1,107 @@
+name: glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen4tep8_batch1_eplb0_mtp0
+model:
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
+resources:
+  gpu_type: gb200
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+  decode_workers: 4
+  decode_nodes: 8
+  gpus_per_decode: 8
+  gpus_per_node: 4
+backend:
+  type: trtllm
+  prefill_environment:
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+  decode_environment:
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+    decode:
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 1
+      max_num_tokens: 1
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
+      allreduce_strategy: MNNVL
+benchmark:
+  type: sa-bench
+  isl: 8192
+  osl: 1024
+  concurrencies: "4"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
+  custom_tokenizer: "glm_moe_dsa"
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch1_eplb0_mtp0.yaml
similarity index 61%
rename from recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml
rename to recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch1_eplb0_mtp0.yaml
index 90e62af3..f2527276 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch1_eplb0_mtp0.yaml
@@ -1,48 +1,35 @@
-name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0"
-
-# ctx: 1 prefill worker, TP4/EP4
-# gen: 5 decode workers, TP4/EP4, enable_attention_dp=false, max_batch=8
-# Merged concurrencies: batch1(5), batch2(10), batch4(25), batch8(50)
-
+name: glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen5tep4_batch1_eplb0_mtp0
 model:
-  path: "nvidia/GLM5-NVFP4"
-  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
-  precision: "fp4"
-
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
 resources:
-  gpu_type: "gb200"
-
+  gpu_type: gb200
   prefill_nodes: 1
   prefill_workers: 1
   gpus_per_prefill: 4
-
   decode_workers: 5
   decode_nodes: 5
   gpus_per_decode: 4
-
   gpus_per_node: 4
-
 backend:
   type: trtllm
-
   prefill_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   decode_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   trtllm_config:
     prefill:
       tensor_parallel_size: 4
@@ -66,7 +53,6 @@ backend:
       cache_transceiver_config:
         backend: UCX
         max_tokens_in_buffer: 16384
-
     decode:
       tensor_parallel_size: 4
       moe_expert_parallel_size: 4
@@ -75,8 +61,8 @@ backend:
       enable_lm_head_tp_in_adp: false
       trust_remote_code: true
       custom_tokenizer: "glm_moe_dsa"
-      max_batch_size: 8
-      max_num_tokens: 8
+      max_batch_size: 1
+      max_num_tokens: 1
       max_seq_len: 9256
       print_iter_log: true
       stream_interval: 100
@@ -84,10 +70,7 @@ backend:
       cuda_graph_config:
         enable_padding: true
         batch_sizes:
-          - 1
-          - 2
-          - 4
-          - 8
+        - 1
       moe_config:
         backend: TRTLLM
         use_low_precision_moe_combine: true
@@ -100,27 +83,24 @@ backend:
         max_tokens_in_buffer: 16384
       nvfp4_gemm_config:
         allowed_backends:
-          - cutlass
-          - cublaslt
-          - cutedsl
-          - cuda_core
-
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
 benchmark:
-  type: "sa-bench"
+  type: sa-bench
   isl: 8192
   osl: 1024
-  concurrencies: "5x10x25x50"
-  req_rate: "inf"
+  concurrencies: "5"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
-
 frontend:
-  type: "dynamo"
+  type: dynamo
   enable_multiple_frontends: false
-
 health_check:
   max_attempts: 360
   interval_seconds: 10
-
 dynamo:
   install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen3tep4_batch32_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch2_eplb0_mtp0.yaml
similarity index 60%
rename from recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen3tep4_batch32_eplb0_mtp0.yaml
rename to recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch2_eplb0_mtp0.yaml
index 462401b6..b2217d07 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen3tep4_batch32_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch2_eplb0_mtp0.yaml
@@ -1,48 +1,35 @@
-name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen3tep4_batch32_eplb0_mtp0"
-
-# ctx: 1 prefill worker, TP4/EP4
-# gen: 3 decode workers, TP4/EP4, enable_attention_dp=false, max_batch=32
-# concurrency: 117
-
+name: glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen5tep4_batch2_eplb0_mtp0
 model:
-  path: "nvidia/GLM5-NVFP4"
-  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
-  precision: "fp4"
-
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
 resources:
-  gpu_type: "gb200"
-
+  gpu_type: gb200
   prefill_nodes: 1
   prefill_workers: 1
   gpus_per_prefill: 4
-
-  decode_workers: 3
-  decode_nodes: 3
+  decode_workers: 5
+  decode_nodes: 5
   gpus_per_decode: 4
-
   gpus_per_node: 4
-
 backend:
   type: trtllm
-
   prefill_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   decode_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   trtllm_config:
     prefill:
       tensor_parallel_size: 4
@@ -66,7 +53,6 @@ backend:
       cache_transceiver_config:
         backend: UCX
         max_tokens_in_buffer: 16384
-
     decode:
       tensor_parallel_size: 4
       moe_expert_parallel_size: 4
@@ -75,8 +61,8 @@ backend:
       enable_lm_head_tp_in_adp: false
       trust_remote_code: true
       custom_tokenizer: "glm_moe_dsa"
-      max_batch_size: 32
-      max_num_tokens: 32
+      max_batch_size: 2
+      max_num_tokens: 2
       max_seq_len: 9256
       print_iter_log: true
       stream_interval: 100
@@ -84,13 +70,8 @@ backend:
       cuda_graph_config:
         enable_padding: true
         batch_sizes:
-          - 1
-          - 2
-          - 4
-          - 8
-          - 16
-          - 24
-          - 32
+        - 1
+        - 2
       moe_config:
         backend: TRTLLM
         use_low_precision_moe_combine: true
@@ -103,27 +84,24 @@ backend:
         max_tokens_in_buffer: 16384
       nvfp4_gemm_config:
         allowed_backends:
-          - cutlass
-          - cublaslt
-          - cutedsl
-          - cuda_core
-
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
 benchmark:
-  type: "sa-bench"
+  type: sa-bench
   isl: 8192
   osl: 1024
-  concurrencies: "117"
-  req_rate: "inf"
+  concurrencies: "10"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
-
 frontend:
-  type: "dynamo"
+  type: dynamo
   enable_multiple_frontends: false
-
 health_check:
   max_attempts: 360
   interval_seconds: 10
-
 dynamo:
   install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch4_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch4_eplb0_mtp0.yaml
new file mode 100644
index 00000000..9ddc2efd
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch4_eplb0_mtp0.yaml
@@ -0,0 +1,108 @@
+name: glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen5tep4_batch4_eplb0_mtp0
+model:
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
+resources:
+  gpu_type: gb200
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+  decode_workers: 5
+  decode_nodes: 5
+  gpus_per_decode: 4
+  gpus_per_node: 4
+backend:
+  type: trtllm
+  prefill_environment:
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+  decode_environment:
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+    decode:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 4
+      max_num_tokens: 4
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
+benchmark:
+  type: sa-bench
+  isl: 8192
+  osl: 1024
+  concurrencies: "25"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
+  custom_tokenizer: "glm_moe_dsa"
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen3tep8_batch64_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen3tep8_batch64_eplb0_mtp0.yaml
new file mode 100644
index 00000000..f5abf5a0
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen3tep8_batch64_eplb0_mtp0.yaml
@@ -0,0 +1,117 @@
+name: glm5_nvfp4_ISL8K_OSL1K_ctx2dep4_gen3tep8_batch64_eplb0_mtp0
+model:
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
+resources:
+  gpu_type: gb200
+  prefill_nodes: 2
+  prefill_workers: 2
+  gpus_per_prefill: 4
+  decode_workers: 3
+  decode_nodes: 6
+  gpus_per_decode: 8
+  gpus_per_node: 4
+backend:
+  type: trtllm
+  prefill_environment:
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+  decode_environment:
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+    decode:
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 64
+      max_num_tokens: 64
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32
+        - 40
+        - 48
+        - 56
+        - 64
+      moe_config:
+        backend: TRTLLM
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
+      allreduce_strategy: MNNVL
+benchmark:
+  type: sa-bench
+  isl: 8192
+  osl: 1024
+  concurrencies: "231"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
+  custom_tokenizer: "glm_moe_dsa"
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+dynamo:
+  install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx8dep4_gen1dep32_batch32_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx4dep4_gen1dep16_batch32_eplb0_mtp0.yaml
similarity index 57%
rename from recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx8dep4_gen1dep32_batch32_eplb0_mtp0.yaml
rename to recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx4dep4_gen1dep16_batch32_eplb0_mtp0.yaml
index 7e34b6d9..0a62b740 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx8dep4_gen1dep32_batch32_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx4dep4_gen1dep16_batch32_eplb0_mtp0.yaml
@@ -1,48 +1,35 @@
-name: "glm5_nvfp4_ISL8K_OSL1K_ctx8dep4_gen1dep32_batch32_eplb0_mtp0"
-
-# ctx: 8 prefill workers, TP4/EP4
-# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=32
-# concurrency: 1229
-
+name: glm5_nvfp4_ISL8K_OSL1K_ctx4dep4_gen1dep16_batch32_eplb0_mtp0
 model:
-  path: "nvidia/GLM5-NVFP4"
-  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
-  precision: "fp4"
-
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
 resources:
-  gpu_type: "gb200"
-
-  prefill_nodes: 8
-  prefill_workers: 8
+  gpu_type: gb200
+  prefill_nodes: 4
+  prefill_workers: 4
   gpus_per_prefill: 4
-
   decode_workers: 1
-  decode_nodes: 8
-  gpus_per_decode: 32
-
+  decode_nodes: 4
+  gpus_per_decode: 16
   gpus_per_node: 4
-
 backend:
   type: trtllm
-
   prefill_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   decode_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   trtllm_config:
     prefill:
       tensor_parallel_size: 4
@@ -66,10 +53,9 @@ backend:
       cache_transceiver_config:
         backend: UCX
         max_tokens_in_buffer: 16384
-
     decode:
-      tensor_parallel_size: 32
-      moe_expert_parallel_size: 32
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
       pipeline_parallel_size: 1
       enable_attention_dp: true
       enable_lm_head_tp_in_adp: false
@@ -84,46 +70,43 @@ backend:
       cuda_graph_config:
         enable_padding: true
         batch_sizes:
-          - 1
-          - 2
-          - 4
-          - 8
-          - 16
-          - 24
-          - 32
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32
       moe_config:
         backend: CUTEDSL
         use_low_precision_moe_combine: true
       kv_cache_config:
         dtype: fp8
         enable_block_reuse: false
-        free_gpu_memory_fraction: 0.75
+        free_gpu_memory_fraction: 0.8
       cache_transceiver_config:
         backend: UCX
         max_tokens_in_buffer: 16384
       nvfp4_gemm_config:
         allowed_backends:
-          - cutlass
-          - cublaslt
-          - cutedsl
-          - cuda_core
-
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
 benchmark:
-  type: "sa-bench"
+  type: sa-bench
   isl: 8192
   osl: 1024
-  concurrencies: "1229"
-  req_rate: "inf"
+  concurrencies: "564"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
-
 frontend:
-  type: "dynamo"
+  type: dynamo
   enable_multiple_frontends: false
-
 health_check:
   max_attempts: 360
   interval_seconds: 10
-
 dynamo:
   install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep32_batch16_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep32_batch16_eplb0_mtp0.yaml
index 7a6ece31..7da85327 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep32_batch16_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep32_batch16_eplb0_mtp0.yaml
@@ -1,48 +1,35 @@
-name: "glm5_nvfp4_ISL8K_OSL1K_ctx5dep4_gen1dep32_batch16_eplb0_mtp0"
-
-# ctx: 5 prefill workers, TP4/EP4
-# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=16
-# concurrency: 615
-
+name: glm5_nvfp4_ISL8K_OSL1K_ctx5dep4_gen1dep32_batch16_eplb0_mtp0
 model:
-  path: "nvidia/GLM5-NVFP4"
-  container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
-  precision: "fp4"
-
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
 resources:
-  gpu_type: "gb200"
-
+  gpu_type: gb200
   prefill_nodes: 5
   prefill_workers: 5
   gpus_per_prefill: 4
-
   decode_workers: 1
   decode_nodes: 8
   gpus_per_decode: 32
-
   gpus_per_node: 4
-
 backend:
   type: trtllm
-
   prefill_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   decode_environment:
-    ENROOT_ALLOW_DEV: "yes"
-    MIMALLOC_PURGE_DELAY: "0"
-    NCCL_GRAPH_MIXING_SUPPORT: "0"
-    TLLM_LOG_LEVEL: "INFO"
-    TRTLLM_ENABLE_PDL: "1"
-    TRTLLM_SERVER_DISABLE_GC: "1"
-    TRTLLM_WORKER_DISABLE_GC: "1"
-
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
   trtllm_config:
     prefill:
       tensor_parallel_size: 4
@@ -66,7 +53,6 @@ backend:
       cache_transceiver_config:
         backend: UCX
         max_tokens_in_buffer: 16384
-
     decode:
       tensor_parallel_size: 32
       moe_expert_parallel_size: 32
@@ -84,11 +70,11 @@ backend:
       cuda_graph_config:
         enable_padding: true
         batch_sizes:
-          - 1
-          - 2
-          - 4
-          - 8
-          - 16
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
       moe_config:
         backend: CUTEDSL
         use_low_precision_moe_combine: true
@@ -101,27 +87,24 @@ backend:
         max_tokens_in_buffer: 16384
       nvfp4_gemm_config:
         allowed_backends:
-          - cutlass
-          - cublaslt
-          - cutedsl
-          - cuda_core
-
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
 benchmark:
-  type: "sa-bench"
+  type: sa-bench
   isl: 8192
   osl: 1024
   concurrencies: "615"
-  req_rate: "inf"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
-
 frontend:
-  type: "dynamo"
+  type: dynamo
   enable_multiple_frontends: false
-
 health_check:
   max_attempts: 360
   interval_seconds: 10
-
 dynamo:
   install: false
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch64_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch64_eplb0_mtp0.yaml
new file mode 100644
index 00000000..e4a4b431
--- /dev/null
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch64_eplb0_mtp0.yaml
@@ -0,0 +1,116 @@
+name: glm5_nvfp4_ISL8K_OSL1K_ctx7dep4_gen1dep16_batch64_eplb0_mtp0
+model:
+  path: nvidia/GLM5-NVFP4
+  container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3
+  precision: fp4
+resources:
+  gpu_type: gb200
+  prefill_nodes: 7
+  prefill_workers: 7
+  gpus_per_prefill: 4
+  decode_workers: 1
+  decode_nodes: 4
+  gpus_per_decode: 16
+  gpus_per_node: 4
+backend:
+  type: trtllm
+  prefill_environment:
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+  decode_environment:
+    ENROOT_ALLOW_DEV: 'yes'
+    MIMALLOC_PURGE_DELAY: '0'
+    NCCL_GRAPH_MIXING_SUPPORT: '0'
+    TLLM_LOG_LEVEL: INFO
+    TRTLLM_ENABLE_PDL: '1'
+    TRTLLM_SERVER_DISABLE_GC: '1'
+    TRTLLM_WORKER_DISABLE_GC: '1'
+  trtllm_config:
+    prefill:
+      tensor_parallel_size: 4
+      moe_expert_parallel_size: 4
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      disable_overlap_scheduler: true
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      print_iter_log: true
+      cuda_graph_config: null
+      moe_config:
+        backend: CUTEDSL
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+    decode:
+      tensor_parallel_size: 16
+      moe_expert_parallel_size: 16
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      trust_remote_code: true
+      custom_tokenizer: "glm_moe_dsa"
+      max_batch_size: 64
+      max_num_tokens: 64
+      max_seq_len: 9256
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32
+        - 40
+        - 48
+        - 56
+        - 64
+      moe_config:
+        backend: CUTEDSL
+        use_low_precision_moe_combine: true
+      kv_cache_config:
+        dtype: fp8
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      nvfp4_gemm_config:
+        allowed_backends:
+        - cutlass
+        - cublaslt
+        - cutedsl
+        - cuda_core
+benchmark:
+  type: sa-bench
+  isl: 8192
+  osl: 1024
+  concurrencies: "1127"
+  req_rate: inf
+  num_prompts_mult: 16
+  use_chat_template: true
+  custom_tokenizer: "glm_moe_dsa"
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+dynamo:
+  install: false

From c88c68ea6b819d52357d2fed3055c6b0178badfb Mon Sep 17 00:00:00 2001
From: Yeswanth koti <yeswanthk@nvidia.com>
Date: Tue, 28 Apr 2026 19:45:19 -0400
Subject: [PATCH 10/14] fix: align glm5 gb300 sa-bench rounds with submission
 baselines (#113)

Set GLM5 GB300 trtllm_dynamo recipes to use chat template and num_prompts_mult=16 so throughput runs match TRTLLM multi-round methodology, while keeping warmup fixed at 2x.
---
 .../ISL1K_OSL1K/MTP/ctx1dep2_gen1dep32_batch8_eplb0_mtp3.yaml | 3 ++-
 .../MTP/ctx1dep2_gen4tep8_batch16_allconc_eplb0_mtp3.yaml     | 3 ++-
 .../ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch32_eplb0_mtp2.yaml | 3 ++-
 .../ISL1K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml  | 3 ++-
 .../MTP/ctx2dep2_gen1dep16_batch64_eplb0_mtp2.yaml            | 3 ++-
 .../MTP/ctx2dep2_gen1dep32_batch16_eplb0_mtp3.yaml            | 3 ++-
 .../MTP/ctx3dep2_gen1dep32_batch32_eplb0_mtp2.yaml            | 3 ++-
 .../MTP/ctx4dep2_gen1dep16_batch256_eplb256_mtp1.yaml         | 3 ++-
 .../MTP/ctx5dep2_gen2dep8_batch512_eplb0_mtp1.yaml            | 3 ++-
 .../MTP/ctx6dep2_gen1dep32_batch128_eplb288_mtp1.yaml         | 3 ++-
 .../STP/ctx1dep2_gen1dep32_batch16_eplb0_mtp0.yaml            | 3 ++-
 .../STP/ctx1dep2_gen4tep8_batch64_allconc_eplb0_mtp0.yaml     | 3 ++-
 .../STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml      | 3 ++-
 .../STP/ctx2dep2_gen1dep32_batch32_eplb0_mtp0.yaml            | 3 ++-
 .../STP/ctx3dep2_gen1dep32_batch64_eplb0_mtp0.yaml            | 3 ++-
 .../STP/ctx4dep2_gen1dep16_batch512_eplb256_mtp0.yaml         | 3 ++-
 .../STP/ctx4dep2_gen1dep32_batch128_eplb0_mtp0.yaml           | 3 ++-
 .../STP/ctx6dep2_gen1dep32_batch256_eplb288_mtp0.yaml         | 3 ++-
 .../MTP/ctx12dep2_gen1dep16_batch32_eplb0_mtp2.yaml           | 3 ++-
 .../MTP/ctx13dep2_gen1dep8_batch128_eplb0_mtp1.yaml           | 3 ++-
 .../MTP/ctx15dep2_gen1dep32_batch16_eplb0_mtp3.yaml           | 3 ++-
 .../MTP/ctx18dep2_gen1dep16_batch64_eplb0_mtp1.yaml           | 3 ++-
 .../ISL8K_OSL1K/MTP/ctx1dep2_gen1tep8_batch16_eplb0_mtp3.yaml | 3 ++-
 .../ISL8K_OSL1K/MTP/ctx1dep2_gen2tep8_batch8_eplb0_mtp3.yaml  | 3 ++-
 .../MTP/ctx1dep2_gen4tep8_batch4_allconc_eplb0_mtp3.yaml      | 3 ++-
 .../ISL8K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml  | 3 ++-
 .../ISL8K_OSL1K/MTP/ctx5dep2_gen1dep32_batch4_eplb0_mtp3.yaml | 3 ++-
 .../ISL8K_OSL1K/MTP/ctx9dep2_gen1dep32_batch8_eplb0_mtp3.yaml | 3 ++-
 .../STP/ctx12dep2_gen1dep16_batch64_eplb0_mtp0.yaml           | 3 ++-
 .../STP/ctx15dep2_gen1dep32_batch32_eplb0_mtp0.yaml           | 3 ++-
 .../ISL8K_OSL1K/STP/ctx1dep2_gen2tep8_batch16_eplb0_mtp0.yaml | 3 ++-
 .../ISL8K_OSL1K/STP/ctx1dep2_gen4tep8_batch1_eplb0_mtp0.yaml  | 3 ++-
 .../STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml      | 3 ++-
 .../STP/ctx20dep2_gen1dep16_batch128_eplb0_mtp0.yaml          | 3 ++-
 .../ISL8K_OSL1K/STP/ctx2dep2_gen3tep8_batch32_eplb0_mtp0.yaml | 3 ++-
 .../ISL8K_OSL1K/STP/ctx4dep2_gen3tep8_batch64_eplb0_mtp0.yaml | 3 ++-
 .../STP/ctx9dep2_gen1dep32_batch16_eplb0_mtp0.yaml            | 3 ++-
 src/srtctl/benchmarks/scripts/sa-bench/bench.sh               | 4 ++--
 38 files changed, 76 insertions(+), 39 deletions(-)

diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen1dep32_batch8_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen1dep32_batch8_eplb0_mtp3.yaml
index 80aacc6a..0b2b3771 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen1dep32_batch8_eplb0_mtp3.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen1dep32_batch8_eplb0_mtp3.yaml
@@ -117,8 +117,9 @@ benchmark:
   osl: 1024
   concurrencies: "333"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch16_allconc_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch16_allconc_eplb0_mtp3.yaml
index 648ec949..3a2447d2 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch16_allconc_eplb0_mtp3.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch16_allconc_eplb0_mtp3.yaml
@@ -119,8 +119,9 @@ benchmark:
   osl: 1024
   concurrencies: "24x44x92"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch32_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch32_eplb0_mtp2.yaml
index 823624ac..671ba92a 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch32_eplb0_mtp2.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch32_eplb0_mtp2.yaml
@@ -121,8 +121,9 @@ benchmark:
   osl: 1024
   concurrencies: "180"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml
index 64b61b9f..d916e313 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml
@@ -116,8 +116,9 @@ benchmark:
   osl: 1024
   concurrencies: "10"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep16_batch64_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep16_batch64_eplb0_mtp2.yaml
index 66d211aa..821b1a1d 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep16_batch64_eplb0_mtp2.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep16_batch64_eplb0_mtp2.yaml
@@ -124,8 +124,9 @@ benchmark:
   osl: 1024
   concurrencies: "1229"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep32_batch16_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep32_batch16_eplb0_mtp3.yaml
index fe754372..cd7d2abc 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep32_batch16_eplb0_mtp3.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep32_batch16_eplb0_mtp3.yaml
@@ -118,8 +118,9 @@ benchmark:
   osl: 1024
   concurrencies: "666"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx3dep2_gen1dep32_batch32_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx3dep2_gen1dep32_batch32_eplb0_mtp2.yaml
index 70821f3e..7fff09e1 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx3dep2_gen1dep32_batch32_eplb0_mtp2.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx3dep2_gen1dep32_batch32_eplb0_mtp2.yaml
@@ -120,8 +120,9 @@ benchmark:
   osl: 1024
   concurrencies: "1229"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx4dep2_gen1dep16_batch256_eplb256_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx4dep2_gen1dep16_batch256_eplb256_mtp1.yaml
index bf3183b7..36dd9e05 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx4dep2_gen1dep16_batch256_eplb256_mtp1.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx4dep2_gen1dep16_batch256_eplb256_mtp1.yaml
@@ -151,8 +151,9 @@ benchmark:
   osl: 1024
   concurrencies: "4301"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx5dep2_gen2dep8_batch512_eplb0_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx5dep2_gen2dep8_batch512_eplb0_mtp1.yaml
index 1d9f4f10..80338977 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx5dep2_gen2dep8_batch512_eplb0_mtp1.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx5dep2_gen2dep8_batch512_eplb0_mtp1.yaml
@@ -180,8 +180,9 @@ benchmark:
   osl: 1024
   concurrencies: "8602"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx6dep2_gen1dep32_batch128_eplb288_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx6dep2_gen1dep32_batch128_eplb288_mtp1.yaml
index 44b81b3c..42592680 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx6dep2_gen1dep32_batch128_eplb288_mtp1.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx6dep2_gen1dep32_batch128_eplb288_mtp1.yaml
@@ -135,8 +135,9 @@ benchmark:
   osl: 1024
   concurrencies: "4301"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen1dep32_batch16_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen1dep32_batch16_eplb0_mtp0.yaml
index 0410623b..f090f3db 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen1dep32_batch16_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen1dep32_batch16_eplb0_mtp0.yaml
@@ -112,8 +112,9 @@ benchmark:
   osl: 1024
   concurrencies: "615"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen4tep8_batch64_allconc_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen4tep8_batch64_allconc_eplb0_mtp0.yaml
index d967e3b2..d9cc7807 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen4tep8_batch64_allconc_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen4tep8_batch64_allconc_eplb0_mtp0.yaml
@@ -119,8 +119,9 @@ benchmark:
   osl: 1024
   concurrencies: "84x180x336"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml
index d9f9ea2f..c50c85e9 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml
@@ -110,8 +110,9 @@ benchmark:
   osl: 1024
   concurrencies: "5x10x25"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx2dep2_gen1dep32_batch32_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx2dep2_gen1dep32_batch32_eplb0_mtp0.yaml
index 26ddd7b1..9ec32b8a 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx2dep2_gen1dep32_batch32_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx2dep2_gen1dep32_batch32_eplb0_mtp0.yaml
@@ -114,8 +114,9 @@ benchmark:
   osl: 1024
   concurrencies: "1229"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx3dep2_gen1dep32_batch64_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx3dep2_gen1dep32_batch64_eplb0_mtp0.yaml
index 081e96da..2887005e 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx3dep2_gen1dep32_batch64_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx3dep2_gen1dep32_batch64_eplb0_mtp0.yaml
@@ -118,8 +118,9 @@ benchmark:
   osl: 1024
   concurrencies: "2253"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep16_batch512_eplb256_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep16_batch512_eplb256_mtp0.yaml
index dbca4fd5..ba0e1063 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep16_batch512_eplb256_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep16_batch512_eplb256_mtp0.yaml
@@ -176,8 +176,9 @@ benchmark:
   osl: 1024
   concurrencies: "8192"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep32_batch128_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep32_batch128_eplb0_mtp0.yaml
index 1c8d2d78..81f0cac3 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep32_batch128_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep32_batch128_eplb0_mtp0.yaml
@@ -126,8 +126,9 @@ benchmark:
   osl: 1024
   concurrencies: "4301"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx6dep2_gen1dep32_batch256_eplb288_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx6dep2_gen1dep32_batch256_eplb288_mtp0.yaml
index 0d6870ff..fbb91775 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx6dep2_gen1dep32_batch256_eplb288_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx6dep2_gen1dep32_batch256_eplb288_mtp0.yaml
@@ -145,8 +145,9 @@ benchmark:
   osl: 1024
   concurrencies: "8192"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx12dep2_gen1dep16_batch32_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx12dep2_gen1dep16_batch32_eplb0_mtp2.yaml
index 8940ea72..ecb7bb5c 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx12dep2_gen1dep16_batch32_eplb0_mtp2.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx12dep2_gen1dep16_batch32_eplb0_mtp2.yaml
@@ -120,8 +120,9 @@ benchmark:
   osl: 1024
   concurrencies: "666"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx13dep2_gen1dep8_batch128_eplb0_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx13dep2_gen1dep8_batch128_eplb0_mtp1.yaml
index 29eba0b3..956b5de6 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx13dep2_gen1dep8_batch128_eplb0_mtp1.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx13dep2_gen1dep8_batch128_eplb0_mtp1.yaml
@@ -132,8 +132,9 @@ benchmark:
   osl: 1024
   concurrencies: "1076"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx15dep2_gen1dep32_batch16_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx15dep2_gen1dep32_batch16_eplb0_mtp3.yaml
index f8fcdac9..21f11cd5 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx15dep2_gen1dep32_batch16_eplb0_mtp3.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx15dep2_gen1dep32_batch16_eplb0_mtp3.yaml
@@ -118,8 +118,9 @@ benchmark:
   osl: 1024
   concurrencies: "666"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx18dep2_gen1dep16_batch64_eplb0_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx18dep2_gen1dep16_batch64_eplb0_mtp1.yaml
index 775fa68f..358e4b7b 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx18dep2_gen1dep16_batch64_eplb0_mtp1.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx18dep2_gen1dep16_batch64_eplb0_mtp1.yaml
@@ -124,8 +124,9 @@ benchmark:
   osl: 1024
   concurrencies: "1229"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen1tep8_batch16_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen1tep8_batch16_eplb0_mtp3.yaml
index c457cce0..e3e2d993 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen1tep8_batch16_eplb0_mtp3.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen1tep8_batch16_eplb0_mtp3.yaml
@@ -119,8 +119,9 @@ benchmark:
   osl: 1024
   concurrencies: "24"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen2tep8_batch8_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen2tep8_batch8_eplb0_mtp3.yaml
index 517cf361..649566d8 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen2tep8_batch8_eplb0_mtp3.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen2tep8_batch8_eplb0_mtp3.yaml
@@ -118,8 +118,9 @@ benchmark:
   osl: 1024
   concurrencies: "22"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen4tep8_batch4_allconc_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen4tep8_batch4_allconc_eplb0_mtp3.yaml
index 20599c3f..e0a55bf5 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen4tep8_batch4_allconc_eplb0_mtp3.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen4tep8_batch4_allconc_eplb0_mtp3.yaml
@@ -117,8 +117,9 @@ benchmark:
   osl: 1024
   concurrencies: "4x24"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml
index 0037f722..6deefd9f 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml
@@ -116,8 +116,9 @@ benchmark:
   osl: 1024
   concurrencies: "5"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx5dep2_gen1dep32_batch4_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx5dep2_gen1dep32_batch4_eplb0_mtp3.yaml
index 6e233408..5bdbb4d9 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx5dep2_gen1dep32_batch4_eplb0_mtp3.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx5dep2_gen1dep32_batch4_eplb0_mtp3.yaml
@@ -116,8 +116,9 @@ benchmark:
   osl: 1024
   concurrencies: "180"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx9dep2_gen1dep32_batch8_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx9dep2_gen1dep32_batch8_eplb0_mtp3.yaml
index bd1cb583..acef0e9e 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx9dep2_gen1dep32_batch8_eplb0_mtp3.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx9dep2_gen1dep32_batch8_eplb0_mtp3.yaml
@@ -117,8 +117,9 @@ benchmark:
   osl: 1024
   concurrencies: "333"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx12dep2_gen1dep16_batch64_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx12dep2_gen1dep16_batch64_eplb0_mtp0.yaml
index 611aebb6..5fcd82c6 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx12dep2_gen1dep16_batch64_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx12dep2_gen1dep16_batch64_eplb0_mtp0.yaml
@@ -118,8 +118,9 @@ benchmark:
   osl: 1024
   concurrencies: "1127"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx15dep2_gen1dep32_batch32_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx15dep2_gen1dep32_batch32_eplb0_mtp0.yaml
index 831e703d..27122e16 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx15dep2_gen1dep32_batch32_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx15dep2_gen1dep32_batch32_eplb0_mtp0.yaml
@@ -114,8 +114,9 @@ benchmark:
   osl: 1024
   concurrencies: "1229"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen2tep8_batch16_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen2tep8_batch16_eplb0_mtp0.yaml
index 8ff2f420..d8386092 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen2tep8_batch16_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen2tep8_batch16_eplb0_mtp0.yaml
@@ -112,8 +112,9 @@ benchmark:
   osl: 1024
   concurrencies: "42"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen4tep8_batch1_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen4tep8_batch1_eplb0_mtp0.yaml
index cc8faa11..0c05b922 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen4tep8_batch1_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen4tep8_batch1_eplb0_mtp0.yaml
@@ -111,8 +111,9 @@ benchmark:
   osl: 1024
   concurrencies: "4"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml
index 06d02024..a0ed195c 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml
@@ -110,8 +110,9 @@ benchmark:
   osl: 1024
   concurrencies: "5x10x25"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx20dep2_gen1dep16_batch128_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx20dep2_gen1dep16_batch128_eplb0_mtp0.yaml
index ead937c9..55bf1ae4 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx20dep2_gen1dep16_batch128_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx20dep2_gen1dep16_batch128_eplb0_mtp0.yaml
@@ -126,8 +126,9 @@ benchmark:
   osl: 1024
   concurrencies: "2151"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx2dep2_gen3tep8_batch32_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx2dep2_gen3tep8_batch32_eplb0_mtp0.yaml
index e06ea268..1836bb52 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx2dep2_gen3tep8_batch32_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx2dep2_gen3tep8_batch32_eplb0_mtp0.yaml
@@ -115,8 +115,9 @@ benchmark:
   osl: 1024
   concurrencies: "117"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx4dep2_gen3tep8_batch64_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx4dep2_gen3tep8_batch64_eplb0_mtp0.yaml
index f4b3cc09..4a022256 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx4dep2_gen3tep8_batch64_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx4dep2_gen3tep8_batch64_eplb0_mtp0.yaml
@@ -119,8 +119,9 @@ benchmark:
   osl: 1024
   concurrencies: "231"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx9dep2_gen1dep32_batch16_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx9dep2_gen1dep32_batch16_eplb0_mtp0.yaml
index 75f56785..7ea9a2f6 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx9dep2_gen1dep32_batch16_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx9dep2_gen1dep32_batch16_eplb0_mtp0.yaml
@@ -112,8 +112,9 @@ benchmark:
   osl: 1024
   concurrencies: "615"
   req_rate: "inf"
+  num_prompts_mult: 16
   custom_tokenizer: "glm_moe_dsa"
-  use_chat_template: false
+  use_chat_template: true
 
 frontend:
   type: "dynamo"
diff --git a/src/srtctl/benchmarks/scripts/sa-bench/bench.sh b/src/srtctl/benchmarks/scripts/sa-bench/bench.sh
index acddf754..154d7590 100644
--- a/src/srtctl/benchmarks/scripts/sa-bench/bench.sh
+++ b/src/srtctl/benchmarks/scripts/sa-bench/bench.sh
@@ -83,7 +83,7 @@ PORT=$(echo "$ENDPOINT" | sed 's|http://||' | cut -d: -f2 | cut -d/ -f1)
 
 WORK_DIR="$(dirname "$0")"
 
-echo "SA-Bench Config: endpoint=${ENDPOINT}; isl=${ISL}; osl=${OSL}; concurrencies=${CONCURRENCIES}; req_rate=${REQ_RATE}; model=${MODEL_NAME}"
+echo "SA-Bench Config: endpoint=${ENDPOINT}; isl=${ISL}; osl=${OSL}; concurrencies=${CONCURRENCIES}; req_rate=${REQ_RATE}; model=${MODEL_NAME}; num_prompts_mult=${NUM_PROMPTS_MULT}; num_warmup_mult=${NUM_WARMUP_MULT}"
 
 # Profiling shared helpers
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
@@ -138,7 +138,7 @@ for concurrency in "${CONCURRENCY_LIST[@]}"; do
         --trust-remote-code \
         "${CUSTOM_TOKENIZER_ARGS[@]}"
 
-    num_prompts=$((concurrency * 10))
+    num_prompts=$((concurrency * NUM_PROMPTS_MULT))
     
     # Generate result filename based on mode
     if [ "$IS_DISAGGREGATED" = "true" ]; then

From 95b0a33dafeb46886977648af8c9247938d88500 Mon Sep 17 00:00:00 2001
From: Richard Huo <rihuo@nvidia.com>
Date: Wed, 29 Apr 2026 12:34:03 -0700
Subject: [PATCH 11/14] fix: using a setup script to install pip in trtllm venv
 # (#117)

---
 configs/install-trtllm-pip.sh | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100755 configs/install-trtllm-pip.sh

diff --git a/configs/install-trtllm-pip.sh b/configs/install-trtllm-pip.sh
new file mode 100755
index 00000000..2cfa2df8
--- /dev/null
+++ b/configs/install-trtllm-pip.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+# Workaround to install pip in the dynamo trtllm runtime image's venv
+
+uv pip install pip

From 0f0aa605525f76fad70a7a229151e0cdd2c8acf6 Mon Sep 17 00:00:00 2001
From: Yeswanth koti <yeswanthk@nvidia.com>
Date: Wed, 29 Apr 2026 17:06:12 -0400
Subject: [PATCH 12/14] fix: add trtllm venv pip bootstrap to GB300 GLM5
 recipes (#120)

Add setup_script install-trtllm-pip.sh to all GB300 GLM5 trtllm_dynamo recipes so eval-only jobs can install lm-eval even when pip is missing in the runtime container venv.
---
 .../ISL1K_OSL1K/MTP/ctx1dep2_gen1dep32_batch8_eplb0_mtp3.yaml | 2 ++
 .../MTP/ctx1dep2_gen4tep8_batch16_allconc_eplb0_mtp3.yaml     | 2 ++
 .../ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch32_eplb0_mtp2.yaml | 2 ++
 .../ISL1K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml  | 2 ++
 .../MTP/ctx2dep2_gen1dep16_batch64_eplb0_mtp2.yaml            | 2 ++
 .../MTP/ctx2dep2_gen1dep32_batch16_eplb0_mtp3.yaml            | 2 ++
 .../MTP/ctx3dep2_gen1dep32_batch32_eplb0_mtp2.yaml            | 2 ++
 .../MTP/ctx4dep2_gen1dep16_batch256_eplb256_mtp1.yaml         | 2 ++
 .../MTP/ctx5dep2_gen2dep8_batch512_eplb0_mtp1.yaml            | 2 ++
 .../MTP/ctx6dep2_gen1dep32_batch128_eplb288_mtp1.yaml         | 2 ++
 .../STP/ctx1dep2_gen1dep32_batch16_eplb0_mtp0.yaml            | 2 ++
 .../STP/ctx1dep2_gen4tep8_batch64_allconc_eplb0_mtp0.yaml     | 2 ++
 .../STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml      | 2 ++
 .../STP/ctx2dep2_gen1dep32_batch32_eplb0_mtp0.yaml            | 2 ++
 .../STP/ctx3dep2_gen1dep32_batch64_eplb0_mtp0.yaml            | 2 ++
 .../STP/ctx4dep2_gen1dep16_batch512_eplb256_mtp0.yaml         | 4 +++-
 .../STP/ctx4dep2_gen1dep32_batch128_eplb0_mtp0.yaml           | 2 ++
 .../STP/ctx6dep2_gen1dep32_batch256_eplb288_mtp0.yaml         | 2 ++
 .../MTP/ctx12dep2_gen1dep16_batch32_eplb0_mtp2.yaml           | 2 ++
 .../MTP/ctx13dep2_gen1dep8_batch128_eplb0_mtp1.yaml           | 2 ++
 .../MTP/ctx15dep2_gen1dep32_batch16_eplb0_mtp3.yaml           | 2 ++
 .../MTP/ctx18dep2_gen1dep16_batch64_eplb0_mtp1.yaml           | 2 ++
 .../ISL8K_OSL1K/MTP/ctx1dep2_gen1tep8_batch16_eplb0_mtp3.yaml | 4 +++-
 .../ISL8K_OSL1K/MTP/ctx1dep2_gen2tep8_batch8_eplb0_mtp3.yaml  | 2 ++
 .../MTP/ctx1dep2_gen4tep8_batch4_allconc_eplb0_mtp3.yaml      | 4 +++-
 .../ISL8K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml  | 2 ++
 .../ISL8K_OSL1K/MTP/ctx5dep2_gen1dep32_batch4_eplb0_mtp3.yaml | 2 ++
 .../ISL8K_OSL1K/MTP/ctx9dep2_gen1dep32_batch8_eplb0_mtp3.yaml | 2 ++
 .../STP/ctx12dep2_gen1dep16_batch64_eplb0_mtp0.yaml           | 2 ++
 .../STP/ctx15dep2_gen1dep32_batch32_eplb0_mtp0.yaml           | 2 ++
 .../ISL8K_OSL1K/STP/ctx1dep2_gen2tep8_batch16_eplb0_mtp0.yaml | 4 +++-
 .../ISL8K_OSL1K/STP/ctx1dep2_gen4tep8_batch1_eplb0_mtp0.yaml  | 2 ++
 .../STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml      | 2 ++
 .../STP/ctx20dep2_gen1dep16_batch128_eplb0_mtp0.yaml          | 4 +++-
 .../ISL8K_OSL1K/STP/ctx2dep2_gen3tep8_batch32_eplb0_mtp0.yaml | 2 ++
 .../ISL8K_OSL1K/STP/ctx4dep2_gen3tep8_batch64_eplb0_mtp0.yaml | 2 ++
 .../STP/ctx9dep2_gen1dep32_batch16_eplb0_mtp0.yaml            | 2 ++
 37 files changed, 79 insertions(+), 5 deletions(-)

diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen1dep32_batch8_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen1dep32_batch8_eplb0_mtp3.yaml
index 0b2b3771..5483257c 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen1dep32_batch8_eplb0_mtp3.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen1dep32_batch8_eplb0_mtp3.yaml
@@ -9,6 +9,8 @@ model:
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
 
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch16_allconc_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch16_allconc_eplb0_mtp3.yaml
index 3a2447d2..68ce1ced 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch16_allconc_eplb0_mtp3.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch16_allconc_eplb0_mtp3.yaml
@@ -9,6 +9,8 @@ model:
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
 
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch32_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch32_eplb0_mtp2.yaml
index 671ba92a..05d57bf8 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch32_eplb0_mtp2.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch32_eplb0_mtp2.yaml
@@ -9,6 +9,8 @@ model:
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
 
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml
index d916e313..076c7643 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml
@@ -9,6 +9,8 @@ model:
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
 
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep16_batch64_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep16_batch64_eplb0_mtp2.yaml
index 821b1a1d..08dcf1a4 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep16_batch64_eplb0_mtp2.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep16_batch64_eplb0_mtp2.yaml
@@ -9,6 +9,8 @@ model:
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
 
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep32_batch16_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep32_batch16_eplb0_mtp3.yaml
index cd7d2abc..930a79d2 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep32_batch16_eplb0_mtp3.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep32_batch16_eplb0_mtp3.yaml
@@ -9,6 +9,8 @@ model:
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
 
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx3dep2_gen1dep32_batch32_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx3dep2_gen1dep32_batch32_eplb0_mtp2.yaml
index 7fff09e1..63417d84 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx3dep2_gen1dep32_batch32_eplb0_mtp2.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx3dep2_gen1dep32_batch32_eplb0_mtp2.yaml
@@ -9,6 +9,8 @@ model:
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
 
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx4dep2_gen1dep16_batch256_eplb256_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx4dep2_gen1dep16_batch256_eplb256_mtp1.yaml
index 36dd9e05..81863a5b 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx4dep2_gen1dep16_batch256_eplb256_mtp1.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx4dep2_gen1dep16_batch256_eplb256_mtp1.yaml
@@ -9,6 +9,8 @@ model:
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
 
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx5dep2_gen2dep8_batch512_eplb0_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx5dep2_gen2dep8_batch512_eplb0_mtp1.yaml
index 80338977..3c6551d0 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx5dep2_gen2dep8_batch512_eplb0_mtp1.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx5dep2_gen2dep8_batch512_eplb0_mtp1.yaml
@@ -9,6 +9,8 @@ model:
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
 
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx6dep2_gen1dep32_batch128_eplb288_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx6dep2_gen1dep32_batch128_eplb288_mtp1.yaml
index 42592680..7d613f26 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx6dep2_gen1dep32_batch128_eplb288_mtp1.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx6dep2_gen1dep32_batch128_eplb288_mtp1.yaml
@@ -9,6 +9,8 @@ model:
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
 
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen1dep32_batch16_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen1dep32_batch16_eplb0_mtp0.yaml
index f090f3db..539fd2c6 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen1dep32_batch16_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen1dep32_batch16_eplb0_mtp0.yaml
@@ -9,6 +9,8 @@ model:
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
 
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen4tep8_batch64_allconc_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen4tep8_batch64_allconc_eplb0_mtp0.yaml
index d9cc7807..19e92ad0 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen4tep8_batch64_allconc_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen4tep8_batch64_allconc_eplb0_mtp0.yaml
@@ -9,6 +9,8 @@ model:
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
 
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml
index c50c85e9..cf5fc790 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml
@@ -9,6 +9,8 @@ model:
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
 
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx2dep2_gen1dep32_batch32_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx2dep2_gen1dep32_batch32_eplb0_mtp0.yaml
index 9ec32b8a..37d8cc94 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx2dep2_gen1dep32_batch32_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx2dep2_gen1dep32_batch32_eplb0_mtp0.yaml
@@ -9,6 +9,8 @@ model:
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
 
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx3dep2_gen1dep32_batch64_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx3dep2_gen1dep32_batch64_eplb0_mtp0.yaml
index 2887005e..7477b620 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx3dep2_gen1dep32_batch64_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx3dep2_gen1dep32_batch64_eplb0_mtp0.yaml
@@ -9,6 +9,8 @@ model:
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
 
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep16_batch512_eplb256_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep16_batch512_eplb256_mtp0.yaml
index ba0e1063..ec55cf3c 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep16_batch512_eplb256_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep16_batch512_eplb256_mtp0.yaml
@@ -7,7 +7,9 @@ model:
   path: "nvidia/GLM5-NVFP4"
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
-  
+
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep32_batch128_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep32_batch128_eplb0_mtp0.yaml
index 81f0cac3..31a8591a 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep32_batch128_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep32_batch128_eplb0_mtp0.yaml
@@ -9,6 +9,8 @@ model:
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
 
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx6dep2_gen1dep32_batch256_eplb288_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx6dep2_gen1dep32_batch256_eplb288_mtp0.yaml
index fbb91775..c057ebb2 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx6dep2_gen1dep32_batch256_eplb288_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx6dep2_gen1dep32_batch256_eplb288_mtp0.yaml
@@ -9,6 +9,8 @@ model:
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
 
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx12dep2_gen1dep16_batch32_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx12dep2_gen1dep16_batch32_eplb0_mtp2.yaml
index ecb7bb5c..1c95cd35 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx12dep2_gen1dep16_batch32_eplb0_mtp2.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx12dep2_gen1dep16_batch32_eplb0_mtp2.yaml
@@ -9,6 +9,8 @@ model:
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
 
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx13dep2_gen1dep8_batch128_eplb0_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx13dep2_gen1dep8_batch128_eplb0_mtp1.yaml
index 956b5de6..bbb5197a 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx13dep2_gen1dep8_batch128_eplb0_mtp1.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx13dep2_gen1dep8_batch128_eplb0_mtp1.yaml
@@ -9,6 +9,8 @@ model:
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
 
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx15dep2_gen1dep32_batch16_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx15dep2_gen1dep32_batch16_eplb0_mtp3.yaml
index 21f11cd5..9b013f4f 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx15dep2_gen1dep32_batch16_eplb0_mtp3.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx15dep2_gen1dep32_batch16_eplb0_mtp3.yaml
@@ -9,6 +9,8 @@ model:
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
 
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx18dep2_gen1dep16_batch64_eplb0_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx18dep2_gen1dep16_batch64_eplb0_mtp1.yaml
index 358e4b7b..20fca94c 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx18dep2_gen1dep16_batch64_eplb0_mtp1.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx18dep2_gen1dep16_batch64_eplb0_mtp1.yaml
@@ -9,6 +9,8 @@ model:
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
 
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen1tep8_batch16_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen1tep8_batch16_eplb0_mtp3.yaml
index e3e2d993..43927017 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen1tep8_batch16_eplb0_mtp3.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen1tep8_batch16_eplb0_mtp3.yaml
@@ -8,7 +8,9 @@ model:
   path: "nvidia/GLM5-NVFP4"
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
-  
+
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen2tep8_batch8_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen2tep8_batch8_eplb0_mtp3.yaml
index 649566d8..d92652b8 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen2tep8_batch8_eplb0_mtp3.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen2tep8_batch8_eplb0_mtp3.yaml
@@ -9,6 +9,8 @@ model:
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
 
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen4tep8_batch4_allconc_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen4tep8_batch4_allconc_eplb0_mtp3.yaml
index e0a55bf5..9b9539c0 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen4tep8_batch4_allconc_eplb0_mtp3.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen4tep8_batch4_allconc_eplb0_mtp3.yaml
@@ -8,7 +8,9 @@ model:
   path: "nvidia/GLM5-NVFP4"
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
-  
+
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml
index 6deefd9f..2612cb70 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml
@@ -9,6 +9,8 @@ model:
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
 
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx5dep2_gen1dep32_batch4_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx5dep2_gen1dep32_batch4_eplb0_mtp3.yaml
index 5bdbb4d9..34d90b87 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx5dep2_gen1dep32_batch4_eplb0_mtp3.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx5dep2_gen1dep32_batch4_eplb0_mtp3.yaml
@@ -9,6 +9,8 @@ model:
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
 
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx9dep2_gen1dep32_batch8_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx9dep2_gen1dep32_batch8_eplb0_mtp3.yaml
index acef0e9e..2a136d0a 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx9dep2_gen1dep32_batch8_eplb0_mtp3.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx9dep2_gen1dep32_batch8_eplb0_mtp3.yaml
@@ -9,6 +9,8 @@ model:
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
 
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx12dep2_gen1dep16_batch64_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx12dep2_gen1dep16_batch64_eplb0_mtp0.yaml
index 5fcd82c6..bce80c02 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx12dep2_gen1dep16_batch64_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx12dep2_gen1dep16_batch64_eplb0_mtp0.yaml
@@ -9,6 +9,8 @@ model:
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
 
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx15dep2_gen1dep32_batch32_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx15dep2_gen1dep32_batch32_eplb0_mtp0.yaml
index 27122e16..91ef3af7 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx15dep2_gen1dep32_batch32_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx15dep2_gen1dep32_batch32_eplb0_mtp0.yaml
@@ -9,6 +9,8 @@ model:
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
 
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen2tep8_batch16_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen2tep8_batch16_eplb0_mtp0.yaml
index d8386092..48979338 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen2tep8_batch16_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen2tep8_batch16_eplb0_mtp0.yaml
@@ -7,7 +7,9 @@ model:
   path: "nvidia/GLM5-NVFP4"
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
-  
+
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen4tep8_batch1_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen4tep8_batch1_eplb0_mtp0.yaml
index 0c05b922..3328694f 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen4tep8_batch1_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen4tep8_batch1_eplb0_mtp0.yaml
@@ -9,6 +9,8 @@ model:
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
 
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml
index a0ed195c..06dc30af 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml
@@ -9,6 +9,8 @@ model:
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
 
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx20dep2_gen1dep16_batch128_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx20dep2_gen1dep16_batch128_eplb0_mtp0.yaml
index 55bf1ae4..dc600b4f 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx20dep2_gen1dep16_batch128_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx20dep2_gen1dep16_batch128_eplb0_mtp0.yaml
@@ -8,7 +8,9 @@ model:
   path: "nvidia/GLM5-NVFP4"
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
-  
+
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx2dep2_gen3tep8_batch32_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx2dep2_gen3tep8_batch32_eplb0_mtp0.yaml
index 1836bb52..9b27d13b 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx2dep2_gen3tep8_batch32_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx2dep2_gen3tep8_batch32_eplb0_mtp0.yaml
@@ -9,6 +9,8 @@ model:
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
 
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx4dep2_gen3tep8_batch64_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx4dep2_gen3tep8_batch64_eplb0_mtp0.yaml
index 4a022256..d9f0d7fa 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx4dep2_gen3tep8_batch64_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx4dep2_gen3tep8_batch64_eplb0_mtp0.yaml
@@ -9,6 +9,8 @@ model:
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
 
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx9dep2_gen1dep32_batch16_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx9dep2_gen1dep32_batch16_eplb0_mtp0.yaml
index 7ea9a2f6..53877eb2 100644
--- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx9dep2_gen1dep32_batch16_eplb0_mtp0.yaml
+++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx9dep2_gen1dep32_batch16_eplb0_mtp0.yaml
@@ -9,6 +9,8 @@ model:
   container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3"
   precision: "fp4"
 
+setup_script: "install-trtllm-pip.sh"
+
 resources:
   gpu_type: "gb300"
 

From fdb1be7ade5c7dec77d716d02e8869177113593d Mon Sep 17 00:00:00 2001
From: Richard Huo <rihuo@nvidia.com>
Date: Wed, 29 Apr 2026 21:46:47 -0700
Subject: [PATCH 13/14] run setup script before post eval (#123)

---
 src/srtctl/cli/do_sweep.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/srtctl/cli/do_sweep.py b/src/srtctl/cli/do_sweep.py
index 77b79ac5..96c90462 100644
--- a/src/srtctl/cli/do_sweep.py
+++ b/src/srtctl/cli/do_sweep.py
@@ -274,6 +274,14 @@ def _run_post_eval(self, stop_event: threading.Event) -> int:
                 env_to_set["EVAL_CONC"] = str(max(conc_list))
                 logger.info("Eval concurrency (max of %s): %s", conc_list, env_to_set["EVAL_CONC"])
 
+        bash_preamble = None
+        if self.config.setup_script:
+            script_path = f"/configs/{self.config.setup_script}"
+            bash_preamble = (
+                f"echo 'Running setup script: {script_path}' && "
+                f"if [ -f '{script_path}' ]; then bash '{script_path}'; else echo 'WARNING: {script_path} not found'; fi"
+            )
+
         proc = start_srun_process(
             command=cmd,
             nodelist=[self.runtime.nodes.head],
@@ -281,6 +289,7 @@ def _run_post_eval(self, stop_event: threading.Event) -> int:
             container_image=str(self.runtime.container_image),
             container_mounts=self.runtime.container_mounts,
             env_to_set=env_to_set,
+            bash_preamble=bash_preamble,
         )
 
         while proc.poll() is None:

From 9ecc31f1f22c0fd56aa6eee9e1933692d284912f Mon Sep 17 00:00:00 2001
From: Jason Li <jasonlizhengjian@gmail.com>
Date: Thu, 28 May 2026 17:05:44 -0400
Subject: [PATCH 14/14] Add spread worker placement and vLLM colocation (#181)

* Add spread_workers option to ResourceConfig

Allow placing each partial-node worker on its own node instead of
packing multiple onto the same node. Useful when colocating workers
on a single node causes resource contention (port collisions, etc.).

Caller must reserve enough nodes (e.g. set decode_nodes=decode_workers
when gpus_per_decode<gpus_per_node).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* try fix

* allow multiple DEP2 workers per node

* multi worker fix

* Allow vLLM one-node prefill decode colocation

* Avoid same-node worker port collisions

* Fix spread workers tests and lint

* Cover vLLM colocation guard

---------

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Co-authored-by: hjjq <50634613+hjjq@users.noreply.github.com>
---
 src/srtctl/backends/base.py       |   1 +
 src/srtctl/backends/sglang.py     |   2 +
 src/srtctl/backends/trtllm.py     |   2 +
 src/srtctl/backends/vllm.py       |  58 ++++++-
 src/srtctl/cli/do_sweep.py        |   1 +
 src/srtctl/cli/submit.py          |   2 +-
 src/srtctl/core/schema.py         |  20 +++
 src/srtctl/core/topology.py       |  59 +++++--
 tests/test_configs.py             | 260 ++++++++++++++++++++++++++++++
 tests/test_endpoint_allocation.py |  53 +++++-
 10 files changed, 437 insertions(+), 21 deletions(-)

diff --git a/src/srtctl/backends/base.py b/src/srtctl/backends/base.py
index 62904ff1..f8b6e815 100644
--- a/src/srtctl/backends/base.py
+++ b/src/srtctl/backends/base.py
@@ -82,6 +82,7 @@ def allocate_endpoints(
         gpus_per_agg: int,
         gpus_per_node: int,
         available_nodes: Sequence[str],
+        spread_workers: bool = False,
     ) -> list["Endpoint"]:
         """Allocate logical endpoints based on resource requirements."""
         ...
diff --git a/src/srtctl/backends/sglang.py b/src/srtctl/backends/sglang.py
index 1f4b818d..84935744 100644
--- a/src/srtctl/backends/sglang.py
+++ b/src/srtctl/backends/sglang.py
@@ -179,6 +179,7 @@ def allocate_endpoints(
         gpus_per_agg: int,
         gpus_per_node: int,
         available_nodes: Sequence[str],
+        spread_workers: bool = False,
     ) -> list["Endpoint"]:
         """Allocate endpoints to nodes."""
         from srtctl.core.topology import allocate_endpoints
@@ -192,6 +193,7 @@ def allocate_endpoints(
             gpus_per_agg=gpus_per_agg,
             gpus_per_node=gpus_per_node,
             available_nodes=available_nodes,
+            spread_workers=spread_workers,
         )
 
     def endpoints_to_processes(
diff --git a/src/srtctl/backends/trtllm.py b/src/srtctl/backends/trtllm.py
index b706ec16..1e3af14e 100644
--- a/src/srtctl/backends/trtllm.py
+++ b/src/srtctl/backends/trtllm.py
@@ -127,6 +127,7 @@ def allocate_endpoints(
         gpus_per_agg: int,
         gpus_per_node: int,
         available_nodes: Sequence[str],
+        spread_workers: bool = False,
     ) -> list["Endpoint"]:
         """Allocate endpoints to nodes."""
         from srtctl.core.topology import allocate_endpoints
@@ -140,6 +141,7 @@ def allocate_endpoints(
             gpus_per_agg=gpus_per_agg,
             gpus_per_node=gpus_per_node,
             available_nodes=available_nodes,
+            spread_workers=spread_workers,
         )
 
     def endpoints_to_processes(
diff --git a/src/srtctl/backends/vllm.py b/src/srtctl/backends/vllm.py
index 1acbd50c..ef3058ae 100644
--- a/src/srtctl/backends/vllm.py
+++ b/src/srtctl/backends/vllm.py
@@ -63,6 +63,7 @@ class VLLMProtocol:
         backend:
           type: vllm
           connector: nixl  # translated to --kv-transfer-config JSON
+          allow_prefill_decode_colocation: true  # pack P/D on one node when all workers fit
           prefill_environment:
             PYTHONUNBUFFERED: "1"
           vllm_config:
@@ -91,6 +92,11 @@ class VLLMProtocol:
     # dynamo 1.0.0+: translated to --kv-transfer-config (--connector was removed).
     connector: str | None = "nixl"
 
+    # Allow prefill and decode workers to share one node when the combined GPU
+    # request fits within gpus_per_node. Defaults off to preserve existing P/D
+    # node separation.
+    allow_prefill_decode_colocation: bool = False
+
     Schema: ClassVar[builtins.type[Schema]] = Schema
 
     # =========================================================================
@@ -154,6 +160,26 @@ def get_served_model_name(self, default: str) -> str:
                         return name
         return default
 
+    def should_colocate_prefill_decode(
+        self,
+        *,
+        num_prefill: int,
+        num_decode: int,
+        num_agg: int,
+        gpus_per_prefill: int,
+        gpus_per_decode: int,
+        gpus_per_agg: int,
+        gpus_per_node: int,
+    ) -> bool:
+        """Return True when colocation is enabled and every worker fits on one node."""
+        # Only a disaggregated layout (>=1 prefill and >=1 decode worker) on a
+        # node that actually has GPUs is eligible for single-node packing.
+        if not self.allow_prefill_decode_colocation or min(num_prefill, num_decode, gpus_per_node) <= 0:
+            return False
+        # Aggregated workers count against the same per-node GPU budget.
+        gpus_by_role = ((num_prefill, gpus_per_prefill), (num_decode, gpus_per_decode), (num_agg, gpus_per_agg))
+        return sum(workers * gpus for workers, gpus in gpus_by_role) <= gpus_per_node
+
     def allocate_endpoints(
         self,
         num_prefill: int,
@@ -164,6 +190,7 @@ def allocate_endpoints(
         gpus_per_agg: int,
         gpus_per_node: int,
         available_nodes: Sequence[str],
+        spread_workers: bool = False,
     ) -> list[Endpoint]:
         """Allocate endpoints to nodes."""
         from srtctl.core.topology import allocate_endpoints
@@ -177,6 +204,16 @@ def allocate_endpoints(
             gpus_per_agg=gpus_per_agg,
             gpus_per_node=gpus_per_node,
             available_nodes=available_nodes,
+            spread_workers=spread_workers,
+            allow_prefill_decode_colocation=self.should_colocate_prefill_decode(
+                num_prefill=num_prefill,
+                num_decode=num_decode,
+                num_agg=num_agg,
+                gpus_per_prefill=gpus_per_prefill,
+                gpus_per_decode=gpus_per_decode,
+                gpus_per_agg=gpus_per_agg,
+                gpus_per_node=gpus_per_node,
+            ),
         )
 
     def _is_dp_mode(self, mode: WorkerMode) -> bool:
@@ -249,6 +286,13 @@ def endpoints_to_processes(
                 # DP+EP mode: one process per GPU
                 # Each process gets a single GPU and a unique dp_rank
                 dp_rank = 0
+                # Allocate a unique DP RPC port for this endpoint's leader node
+                dp_rpc_port = port_allocator.next_dp_rpc_port(endpoint.leader_node)
+                # Reserve one NIXL port per DP rank: vLLM binds base + data_parallel_rank,
+                # so all ranks share the base. The loop below creates one rank per
+                # (node, GPU) pair, hence the fallback counts GPUs on every node.
+                dp_size = self._get_dp_size(endpoint.mode) or len(endpoint.nodes) * len(endpoint.gpu_indices)
+                nixl_base_port = port_allocator.next_nixl_port_block(dp_size)
                 for _node_rank, node in enumerate(endpoint.nodes):
                     for gpu_idx in sorted(endpoint.gpu_indices):
                         is_leader = dp_rank == 0
@@ -259,7 +303,7 @@ def endpoints_to_processes(
                             else None
                         )
                         kv_events_port = port_allocator.next_kv_events_port()
-                        nixl_port = port_allocator.next_nixl_port()
+                        nixl_port = nixl_base_port
 
                         processes.append(
                             Process(
@@ -273,6 +317,7 @@ def endpoints_to_processes(
                                 bootstrap_port=bootstrap_port,
                                 kv_events_port=kv_events_port,
                                 nixl_port=nixl_port,
+                                dp_rpc_port=dp_rpc_port,
                             )
                         )
                         current_sys_port += 1
@@ -356,7 +401,16 @@ def build_worker_command(
             # DP+EP mode: each GPU runs its own process
             # process.node_rank is the dp_rank (set in endpoints_to_processes)
             dp_rank = process.node_rank
-            dp_rpc_port = config.pop("data-parallel-rpc-port", None) or config.pop("data_parallel_rpc_port", 13345)
+            # Use the per-endpoint dp_rpc_port allocated by NodePortAllocator
+            # (avoids port collisions when multiple endpoints share a node)
+            dp_rpc_port = (
+                process.dp_rpc_port
+                or config.pop("data-parallel-rpc-port", None)
+                or config.pop("data_parallel_rpc_port", 13345)
+            )
+            # Pop from config so it doesn't get added again by _config_to_cli_args
+            config.pop("data-parallel-rpc-port", None)
+            config.pop("data_parallel_rpc_port", None)
 
             cmd.extend(
                 [
diff --git a/src/srtctl/cli/do_sweep.py b/src/srtctl/cli/do_sweep.py
index 96c90462..e0dd39c7 100644
--- a/src/srtctl/cli/do_sweep.py
+++ b/src/srtctl/cli/do_sweep.py
@@ -76,6 +76,7 @@ def endpoints(self) -> list[Endpoint]:
             gpus_per_agg=r.gpus_per_agg,
             gpus_per_node=r.gpus_per_node,
             available_nodes=self.runtime.nodes.worker,
+            spread_workers=r.spread_workers,
         )
 
     @functools.cached_property
diff --git a/src/srtctl/cli/submit.py b/src/srtctl/cli/submit.py
index 21f26d9f..39325c1b 100644
--- a/src/srtctl/cli/submit.py
+++ b/src/srtctl/cli/submit.py
@@ -197,7 +197,7 @@ def generate_minimal_sbatch_script(
     env = Environment(loader=FileSystemLoader(str(template_dir)))
     template = env.get_template("job_script_minimal.j2")
 
-    total_nodes = config.resources.total_nodes
+    total_nodes = config.total_nodes
     # Add extra node for dedicated etcd/nats infrastructure
     if config.infra.etcd_nats_dedicated_node:
         total_nodes += 1
diff --git a/src/srtctl/core/schema.py b/src/srtctl/core/schema.py
index c535be39..fd59e237 100644
--- a/src/srtctl/core/schema.py
+++ b/src/srtctl/core/schema.py
@@ -396,6 +396,11 @@ class ResourceConfig:
     agg_nodes: int | None = None
     agg_workers: int | None = None
 
+    # If True, place each partial-node worker on its own node instead of
+    # packing multiple onto the same node. Caller must reserve enough nodes
+    # (e.g. set decode_nodes=decode_workers when gpus_per_decode<gpus_per_node).
+    spread_workers: bool = False
+
     # Explicit GPUs per worker (override computed values)
     # Use data_key to map from YAML field names to internal attribute names
     _explicit_gpus_per_prefill: int | None = field(
@@ -934,6 +939,21 @@ def served_model_name(self) -> str:
         default = Path(self.model.path).name
         return self.backend.get_served_model_name(default)
 
+    @property
+    def total_nodes(self) -> int:
+        """Worker nodes to request; collapses to 1 when vLLM packs all workers on one node."""
+        res = self.resources
+        colocated = isinstance(self.backend, VLLMProtocol) and self.backend.should_colocate_prefill_decode(
+            num_prefill=res.num_prefill,
+            num_decode=res.num_decode,
+            num_agg=res.num_agg,
+            gpus_per_prefill=res.gpus_per_prefill,
+            gpus_per_decode=res.gpus_per_decode,
+            gpus_per_agg=res.gpus_per_agg,
+            gpus_per_node=res.gpus_per_node,
+        )
+        return 1 if colocated else res.total_nodes
+
     @property
     def backend_type(self) -> str:
         """Get the backend type string."""
diff --git a/src/srtctl/core/topology.py b/src/srtctl/core/topology.py
index f2a24e5d..1ec4bf24 100644
--- a/src/srtctl/core/topology.py
+++ b/src/srtctl/core/topology.py
@@ -35,8 +35,9 @@ class NodePortAllocator:
     assignments per node and hands out the next available port.
 
     Port ranges (non-overlapping):
-        - kv_events_port: 5550+  (global) - ZMQ port for kv-events publishing
-        - nixl_port:      6550+  (global) - NIXL side channel for KV transfers (vLLM)
+        - kv_events_port: 20000+ (global) - ZMQ port for kv-events publishing
+        - nixl_port:      21000+ (global) - NIXL side channel for KV transfers (vLLM)
+        - dp_rpc_port:    13345+ (per node) - DP coordination port (vLLM data-parallel)
         - http_port:      30000+ (per node) - HTTP serving port
         - bootstrap_port: 31000+ (per node) - P/D coordination port (prefill only)
 
@@ -53,11 +54,13 @@ class NodePortAllocator:
 
     base_http_port: int = 30000
     base_bootstrap_port: int = 31000
-    base_kv_events_port: int = 5550
-    base_nixl_port: int = 6550  # NIXL side channel ports (must not overlap with kv_events)
+    base_kv_events_port: int = 20000
+    base_nixl_port: int = 21000  # NIXL side channel ports (must not overlap with kv_events)
+    base_dp_rpc_port: int = 13345  # DP coordination port for vLLM data-parallel
 
     _http_ports: dict[str, int] = field(default_factory=dict, repr=False)
     _bootstrap_ports: dict[str, int] = field(default_factory=dict, repr=False)
+    _dp_rpc_ports: dict[str, int] = field(default_factory=dict, repr=False)
     _next_kv_events_port: int = field(default=0, repr=False)  # Global counter
     _next_nixl_port: int = field(default=0, repr=False)  # Global counter for NIXL
 
@@ -66,7 +69,7 @@ def next_http_port(self, node: str) -> int:
         if node not in self._http_ports:
             self._http_ports[node] = self.base_http_port
         port = self._http_ports[node]
-        self._http_ports[node] += 1000
+        self._http_ports[node] += 1
         return port
 
     def next_bootstrap_port(self, node: str) -> int:
@@ -93,6 +96,32 @@ def next_nixl_port(self) -> int:
         self._next_nixl_port += 1
         return port
 
+    def next_nixl_port_block(self, size: int) -> int:
+        """Reserve ``size`` consecutive NIXL ports and return the first one.
+
+        In DP mode vLLM derives each rank's side-channel port as:
+            actual_port = VLLM_NIXL_SIDE_CHANNEL_PORT + data_parallel_rank
+        Every DP rank of an endpoint is handed the same base port, so the
+        full range is set aside to keep other endpoints from colliding.
+        """
+        # A zero counter means no NIXL port has been handed out yet.
+        first_port = self._next_nixl_port or self.base_nixl_port
+        # Advance past the whole block so the next caller starts after it.
+        self._next_nixl_port = first_port + size
+        return first_port
+
+    def next_dp_rpc_port(self, node: str) -> int:
+        """Hand out the next unused DP RPC port on ``node``.
+
+        Ports are tracked per node, starting at ``base_dp_rpc_port``, so DP
+        endpoints sharing a node each get a distinct data-parallel-rpc-port
+        and do not collide on bind.
+        """
+        # The first request for a node starts at the base port.
+        port = self._dp_rpc_ports.get(node, self.base_dp_rpc_port)
+        self._dp_rpc_ports[node] = port + 1
+        return port
+
 
 @dataclass(frozen=True)
 class Endpoint:
@@ -167,6 +196,7 @@ class Process:
     bootstrap_port: int | None = None
     kv_events_port: int | None = None
     nixl_port: int | None = None
+    dp_rpc_port: int | None = None
 
     @property
     def is_leader(self) -> bool:
@@ -188,6 +218,8 @@ def allocate_endpoints(
     gpus_per_agg: int,
     gpus_per_node: int,
     available_nodes: Sequence[str],
+    spread_workers: bool = False,
+    allow_prefill_decode_colocation: bool = False,
 ) -> list[Endpoint]:
     """Allocate endpoints to nodes based on GPU requirements.
 
@@ -202,6 +234,11 @@ def allocate_endpoints(
         gpus_per_agg: GPUs per agg worker
         gpus_per_node: GPUs available per node
         available_nodes: List of available node hostnames
+        spread_workers: If True, place each partial-node worker on its own
+            node instead of packing multiple onto the same node. Requires the
+            caller to reserve enough nodes (one per worker per mode).
+        allow_prefill_decode_colocation: If True, decode workers may use
+            remaining GPUs on a node already used by prefill workers.
 
     Returns:
         List of Endpoint objects with node assignments
@@ -326,7 +363,7 @@ def allocate_workers_simple(mode: WorkerMode, count: int, gpus_per_worker: int)
                 gpu_indices = frozenset(range(gpu_offset, gpu_offset + gpus_per_worker))
                 gpu_offset += gpus_per_worker
 
-                if gpu_offset >= gpus_per_node:
+                if gpu_offset >= gpus_per_node or spread_workers:
                     node_idx += 1
                     gpu_offset = 0
 
@@ -346,13 +383,13 @@ def allocate_workers_simple(mode: WorkerMode, count: int, gpus_per_worker: int)
     if num_prefill > 0:
         endpoints.extend(allocate_workers_simple("prefill", num_prefill, gpus_per_prefill))
 
-    # When there's a partial allocation on the current node (gpu_offset > 0) and
-    # there are more nodes available, advance to ensure prefill and decode don't
-    # share a node. This prevents the bug where a multi-node decode worker overlaps
-    # with a partial-node prefill worker.
+    # By default, when there's a partial allocation on the current node
+    # (gpu_offset > 0) and there are more nodes available, advance to ensure
+    # prefill and decode don't share a node. This prevents the bug where a
+    # multi-node decode worker overlaps with a partial-node prefill worker.
     # When there are no more nodes (decode_nodes=0 config), allow sharing.
     if num_decode > 0:
-        if gpu_offset > 0 and (node_idx + 1) < len(available_nodes):
+        if not allow_prefill_decode_colocation and gpu_offset > 0 and (node_idx + 1) < len(available_nodes):
             node_idx += 1
             gpu_offset = 0
         endpoints.extend(allocate_workers_simple("decode", num_decode, gpus_per_decode))
diff --git a/tests/test_configs.py b/tests/test_configs.py
index 0b4138d5..ea3a87ce 100644
--- a/tests/test_configs.py
+++ b/tests/test_configs.py
@@ -891,6 +891,226 @@ def test_sbatch_normal_node_count_without_dedicated_infra(self):
         # Should request 2 nodes: just the workers
         assert "#SBATCH --nodes=2" in script
 
+    def test_vllm_colocation_reduces_sbatch_to_one_node_when_fit(self):
+        """Test vLLM P/D colocation requests one worker node when all workers fit."""
+        from pathlib import Path
+
+        from srtctl.backends import VLLMProtocol
+        from srtctl.cli.submit import generate_minimal_sbatch_script
+        from srtctl.core.schema import InfraConfig, ModelConfig, ResourceConfig, SrtConfig
+
+        config = SrtConfig(
+            name="test",
+            model=ModelConfig(path="/model", container="/container.sqsh", precision="fp8"),
+            resources=ResourceConfig(
+                gpu_type="h100",
+                gpus_per_node=8,
+                prefill_nodes=1,
+                decode_nodes=1,
+                prefill_workers=1,
+                decode_workers=1,
+                _explicit_gpus_per_prefill=4,
+                _explicit_gpus_per_decode=4,  # 4 prefill + 4 decode GPUs fill exactly one 8-GPU node
+            ),
+            backend=VLLMProtocol(allow_prefill_decode_colocation=True),
+            infra=InfraConfig(etcd_nats_dedicated_node=False),
+        )
+
+        assert config.resources.total_nodes == 2  # resource-level count still reports both configured nodes
+        assert config.total_nodes == 1  # config-level count reflects the colocated layout
+
+        script = generate_minimal_sbatch_script(config, Path("/tmp/test.yaml"))
+
+        assert "#SBATCH --nodes=1" in script  # generated script requests a single node
+
+    def test_vllm_colocation_keeps_normal_node_count_when_not_fit(self):
+        """Test vLLM P/D colocation does not reduce nodes when workers exceed one node."""
+        from srtctl.backends import VLLMProtocol
+        from srtctl.core.schema import ModelConfig, ResourceConfig, SrtConfig
+
+        config = SrtConfig(
+            name="test",
+            model=ModelConfig(path="/model", container="/container.sqsh", precision="fp8"),
+            resources=ResourceConfig(
+                gpu_type="h100",
+                gpus_per_node=8,
+                prefill_nodes=1,
+                decode_nodes=1,
+                prefill_workers=1,
+                decode_workers=1,
+                _explicit_gpus_per_prefill=6,  # 6 + 4 GPUs exceed one 8-GPU node
+                _explicit_gpus_per_decode=4,
+            ),
+            backend=VLLMProtocol(allow_prefill_decode_colocation=True),
+        )
+
+        assert config.total_nodes == 2  # node count is not reduced when the workers cannot share a node
+
+
+class TestVLLMPrefillDecodeColocation:
+    """Tests for vLLM prefill/decode same-node packing."""
+
+    def test_disabled_by_default_keeps_prefill_and_decode_separate(self):
+        """Test vLLM preserves default P/D node separation."""
+        from srtctl.backends import VLLMProtocol
+
+        endpoints = VLLMProtocol().allocate_endpoints(
+            num_prefill=1,
+            num_decode=1,
+            num_agg=0,
+            gpus_per_prefill=4,
+            gpus_per_decode=4,  # 4 + 4 would fit on one node, but colocation is not enabled
+            gpus_per_agg=0,
+            gpus_per_node=8,
+            available_nodes=("node0", "node1"),
+        )
+
+        assert endpoints[0].mode == "prefill"
+        assert endpoints[0].nodes == ("node0",)
+        assert endpoints[1].mode == "decode"
+        assert endpoints[1].nodes == ("node1",)
+
+    def test_colocation_requires_prefill_decode_and_valid_node_size(self):
+        """Test vLLM colocation stays off for incomplete or invalid P/D topology."""
+        from srtctl.backends import VLLMProtocol
+
+        backend = VLLMProtocol(allow_prefill_decode_colocation=True)  # flag on; topology must still qualify
+
+        for num_prefill, num_decode, gpus_per_node in ((0, 1, 8), (1, 0, 8), (1, 1, 0)):
+            assert not backend.should_colocate_prefill_decode(
+                num_prefill=num_prefill,
+                num_decode=num_decode,
+                num_agg=0,
+                gpus_per_prefill=4,
+                gpus_per_decode=4,
+                gpus_per_agg=0,
+                gpus_per_node=gpus_per_node,
+            )
+
+    def test_enabled_packs_prefill_and_decode_when_one_node_fits(self):
+        """Test vLLM packs P/D workers together when requested and all fit."""
+        from srtctl.backends import VLLMProtocol
+
+        endpoints = VLLMProtocol(allow_prefill_decode_colocation=True).allocate_endpoints(
+            num_prefill=2,
+            num_decode=2,
+            num_agg=0,
+            gpus_per_prefill=2,
+            gpus_per_decode=2,
+            gpus_per_agg=0,
+            gpus_per_node=8,
+            available_nodes=("node0", "node1"),  # node1 is expected to stay unused
+        )
+
+        prefill_eps = [ep for ep in endpoints if ep.mode == "prefill"]
+        decode_eps = [ep for ep in endpoints if ep.mode == "decode"]  # expected on GPUs 4-7, after prefill
+
+        assert [ep.nodes for ep in prefill_eps] == [("node0",), ("node0",)]
+        assert [ep.gpu_indices for ep in prefill_eps] == [frozenset({0, 1}), frozenset({2, 3})]
+        assert [ep.nodes for ep in decode_eps] == [("node0",), ("node0",)]
+        assert [ep.gpu_indices for ep in decode_eps] == [frozenset({4, 5}), frozenset({6, 7})]
+
+    def test_same_node_prefill_decode_ports_do_not_collide(self):
+        """Test same-node vLLM P/D workers get distinct listener ports."""
+        from srtctl.backends import VLLMProtocol
+
+        backend = VLLMProtocol(allow_prefill_decode_colocation=True)
+        endpoints = backend.allocate_endpoints(
+            num_prefill=1,
+            num_decode=1,
+            num_agg=0,
+            gpus_per_prefill=4,
+            gpus_per_decode=4,
+            gpus_per_agg=0,
+            gpus_per_node=8,
+            available_nodes=("node0", "node1"),
+        )
+
+        processes = backend.endpoints_to_processes(endpoints)
+        prefill = next(p for p in processes if p.endpoint_mode == "prefill")
+        decode = next(p for p in processes if p.endpoint_mode == "decode")
+
+        assert prefill.node == decode.node == "node0"
+        assert prefill.http_port == 30000
+        assert decode.http_port == 30001  # next port after prefill's on the shared node
+        assert prefill.bootstrap_port == 31000
+
+        bound_ports = [
+            port
+            for process in processes
+            for port in (process.http_port, process.bootstrap_port, process.kv_events_port, process.nixl_port)
+            if port  # skip ports that are unset (falsy)
+        ]
+        assert len(bound_ports) == len(set(bound_ports))  # no port appears twice
+
+    def test_same_node_dp_prefill_decode_ports_do_not_collide(self):
+        """Test same-node DP P/D endpoints get distinct per-endpoint port ranges."""
+        from srtctl.backends import VLLMProtocol, VLLMServerConfig
+
+        backend = VLLMProtocol(
+            allow_prefill_decode_colocation=True,
+            vllm_config=VLLMServerConfig(
+                prefill={"data-parallel-size": 4, "enable-expert-parallel": True},
+                decode={"data-parallel-size": 4, "enable-expert-parallel": True},
+            ),
+        )
+        endpoints = backend.allocate_endpoints(
+            num_prefill=1,
+            num_decode=1,
+            num_agg=0,
+            gpus_per_prefill=4,
+            gpus_per_decode=4,
+            gpus_per_agg=0,
+            gpus_per_node=8,
+            available_nodes=("node0", "node1"),
+        )
+
+        processes = backend.endpoints_to_processes(endpoints)
+        prefill = [p for p in processes if p.endpoint_mode == "prefill"]
+        decode = [p for p in processes if p.endpoint_mode == "decode"]
+
+        assert len(prefill) == 4  # matches data-parallel-size=4
+        assert len(decode) == 4
+        assert {p.node for p in prefill + decode} == {"node0"}
+        assert {p.dp_rpc_port for p in prefill} == {13345}
+        assert {p.dp_rpc_port for p in decode} == {13346}
+        assert {p.nixl_port for p in prefill} == {21000}
+        assert {p.nixl_port for p in decode} == {21004}
+
+        leader_ports = [
+            port
+            for process in prefill + decode
+            for port in (process.http_port, process.bootstrap_port)
+            if port  # skip ports that are unset (falsy)
+        ]
+        assert sorted(leader_ports) == [30000, 30001, 31000]  # only three http/bootstrap ports are set in total
+
+        prefill_actual_nixl_ports = {p.nixl_port + p.node_rank for p in prefill}  # shared base + per-process rank
+        decode_actual_nixl_ports = {p.nixl_port + p.node_rank for p in decode}  # base is uniform (asserted above)
+        assert prefill_actual_nixl_ports == {21000, 21001, 21002, 21003}
+        assert decode_actual_nixl_ports == {21004, 21005, 21006, 21007}
+        assert prefill_actual_nixl_ports.isdisjoint(decode_actual_nixl_ports)
+
+    def test_enabled_does_not_pack_when_one_node_does_not_fit(self):
+        """Test vLLM falls back to separated P/D nodes when total GPUs do not fit."""
+        from srtctl.backends import VLLMProtocol
+
+        endpoints = VLLMProtocol(allow_prefill_decode_colocation=True).allocate_endpoints(
+            num_prefill=1,
+            num_decode=1,
+            num_agg=0,
+            gpus_per_prefill=6,  # 6 + 4 GPUs do not fit on one 8-GPU node
+            gpus_per_decode=4,
+            gpus_per_agg=0,
+            gpus_per_node=8,
+            available_nodes=("node0", "node1"),
+        )
+
+        assert endpoints[0].mode == "prefill"
+        assert endpoints[0].nodes == ("node0",)
+        assert endpoints[1].mode == "decode"
+        assert endpoints[1].nodes == ("node1",)
+
 
 class TestVLLMDataParallelMode:
     """Tests for vLLM DP+EP (Data Parallel + Expert Parallel) mode."""
@@ -965,6 +1185,46 @@ def test_dp_mode_creates_per_gpu_processes(self):
         dp_ranks = [p.node_rank for p in processes]
         assert dp_ranks == list(range(16))
 
+    def test_dp_mode_allocates_unique_ports_for_multiple_endpoints_per_node(self):
+        """Test DP endpoints sharing a node get non-colliding coordination ports."""
+        from srtctl.backends import VLLMProtocol, VLLMServerConfig
+        from srtctl.core.topology import Endpoint
+
+        backend = VLLMProtocol(
+            vllm_config=VLLMServerConfig(
+                decode={"data-parallel-size": 4, "enable-expert-parallel": True},
+            )
+        )
+
+        endpoints = [
+            Endpoint(
+                mode="decode",
+                index=0,
+                nodes=("node0",),
+                gpu_indices=frozenset(range(4)),  # GPUs 0-3 of node0
+                gpus_per_node=8,
+            ),
+            Endpoint(
+                mode="decode",
+                index=1,
+                nodes=("node0",),
+                gpu_indices=frozenset(range(4, 8)),  # GPUs 4-7 of the same node
+                gpus_per_node=8,
+            ),
+        ]
+
+        processes = backend.endpoints_to_processes(endpoints)
+
+        first_endpoint = [p for p in processes if p.endpoint_index == 0]
+        second_endpoint = [p for p in processes if p.endpoint_index == 1]
+
+        assert {p.dp_rpc_port for p in first_endpoint} == {13345}
+        assert {p.dp_rpc_port for p in second_endpoint} == {13346}  # distinct from endpoint 0
+        assert {p.nixl_port for p in first_endpoint} == {21000}
+        assert {p.nixl_port for p in second_endpoint} == {21004}  # 4 above endpoint 0's base
+        assert [p.node_rank for p in first_endpoint] == list(range(4))
+        assert [p.node_rank for p in second_endpoint] == list(range(4))  # ranks restart at 0 per endpoint
+
     def test_dp_mode_command_includes_dp_flags(self):
         """Test that DP mode command includes correct DP flags instead of TP flags."""
         from pathlib import Path
diff --git a/tests/test_endpoint_allocation.py b/tests/test_endpoint_allocation.py
index 1625e6b7..6674ae87 100644
--- a/tests/test_endpoint_allocation.py
+++ b/tests/test_endpoint_allocation.py
@@ -137,6 +137,45 @@ def test_aggregated_mode(self):
             assert ep.mode == "agg"
             assert ep.total_gpus == 4
 
+    def test_spread_workers_partial_node(self):
+        """spread_workers=True forces each partial-node worker onto its own node."""
+        endpoints = allocate_endpoints(
+            num_prefill=1,
+            num_decode=2,
+            num_agg=0,
+            gpus_per_prefill=1,
+            gpus_per_decode=2,  # two such workers would fit together on one 4-GPU node
+            gpus_per_agg=0,
+            gpus_per_node=4,
+            available_nodes=("node0", "node1", "node2"),
+            spread_workers=True,
+        )
+
+        decode_eps = [e for e in endpoints if e.mode == "decode"]
+        assert len(decode_eps) == 2
+        # Without spread_workers both decode workers would land on node1.
+        assert decode_eps[0].nodes == ("node1",)
+        assert decode_eps[1].nodes == ("node2",)
+        assert decode_eps[0].gpu_indices == frozenset({0, 1})
+        assert decode_eps[1].gpu_indices == frozenset({0, 1})  # each worker starts at GPU 0 of its own node
+
+    def test_spread_workers_default_packs(self):
+        """spread_workers=False (default) packs partial-node workers onto the same node."""
+        endpoints = allocate_endpoints(
+            num_prefill=0,
+            num_decode=2,
+            num_agg=0,
+            gpus_per_prefill=0,
+            gpus_per_decode=2,  # 2 + 2 GPUs fit on one 4-GPU node
+            gpus_per_agg=0,
+            gpus_per_node=4,
+            available_nodes=("node0", "node1"),  # node1 is expected to stay unused
+        )
+
+        decode_eps = [e for e in endpoints if e.mode == "decode"]
+        assert decode_eps[0].nodes == ("node0",)
+        assert decode_eps[1].nodes == ("node0",)
+
     def test_prefill_decode_never_share_node_partial_allocation(self):
         """Test that prefill and decode workers are never colocated on the same node.
 
@@ -317,9 +356,9 @@ def test_kv_events_port_allocation(self):
         assert all(port is not None for port in kv_ports), "All processes should have kv_events_port"
         assert len(kv_ports) == len(set(kv_ports)), "All kv_events_ports should be globally unique"
 
-        # Ports should be sequential starting from 5550
+        # Ports should be sequential starting from 20000
         # With 2 prefill + 2 decode workers, each on single node = 4 processes = 4 ports
-        assert sorted(kv_ports) == [5550, 5551, 5552, 5553]
+        assert sorted(kv_ports) == [20000, 20001, 20002, 20003]
 
     def test_kv_events_port_same_node_unique(self):
         """Test kv_events_port is unique even when workers share a node."""
@@ -341,11 +380,11 @@ def test_kv_events_port_same_node_unique(self):
         assert len(processes) == 2
         assert processes[0].node == processes[1].node == "node0"
         assert processes[0].kv_events_port != processes[1].kv_events_port
-        assert processes[0].kv_events_port == 5550
-        assert processes[1].kv_events_port == 5551
+        assert processes[0].kv_events_port == 20000
+        assert processes[1].kv_events_port == 20001
 
     def test_nixl_port_allocation(self):
-        """Test NIXL ports are allocated globally unique starting at 6550."""
+        """Test NIXL ports are allocated globally unique starting at 21000."""
         from srtctl.core.topology import Endpoint
 
         endpoints = [
@@ -371,5 +410,5 @@ def test_nixl_port_allocation(self):
         nixl_ports = [p.nixl_port for p in processes]
         assert all(port is not None for port in nixl_ports), "All processes should have nixl_port"
         assert len(nixl_ports) == len(set(nixl_ports))  # All unique
-        assert min(nixl_ports) == 6550  # Starts at base
-        assert nixl_ports == [6550, 6551]  # Sequential
+        assert min(nixl_ports) == 21000  # Starts at base
+        assert nixl_ports == [21000, 21001]  # Sequential