From 9cc6d50d23ac7d346918390592b6f9d919508f66 Mon Sep 17 00:00:00 2001 From: "Albert Cheng (Engrg-Hardware 1)" Date: Wed, 1 Apr 2026 13:35:46 -0700 Subject: [PATCH 01/14] Make Dynamo source install container-agnostic (vLLM, SGLang, etc.) Auto-detect container type at runtime: if /sgl-workspace exists (SGLang), use original install path unchanged; otherwise use portable /tmp build path with conditional dependency installation for non-SGLang containers. --- src/srtctl/core/schema.py | 32 ++++++++++++++++++++++++++++++-- tests/test_configs.py | 10 +++++++++- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/src/srtctl/core/schema.py b/src/srtctl/core/schema.py index 97547fec..085db6c8 100644 --- a/src/srtctl/core/schema.py +++ b/src/srtctl/core/schema.py @@ -719,8 +719,8 @@ def get_install_commands(self) -> str: git_ref = self.hash if self.hash else "HEAD" checkout_cmd = f"git checkout {self.hash}" if self.hash else "" - return ( - f"echo 'Installing dynamo from source ({git_ref})...' && " + # Original SGLang container path, UNCHANGED + sglang = ( "apt-get update -qq && apt-get install -y -qq libclang-dev > /dev/null 2>&1 && " "cd /sgl-workspace/ && " "git clone https://github.com/ai-dynamo/dynamo.git && " @@ -736,6 +736,34 @@ def get_install_commands(self) -> str: f"echo 'Dynamo installed from source ({git_ref})'" ) + # Portable path for non-SGLang containers (vLLM, etc.) + portable = ( + "if ! command -v cargo &> /dev/null || ! command -v maturin &> /dev/null; then " + "apt-get update -qq && apt-get install -y -qq git curl libclang-dev protobuf-compiler > /dev/null 2>&1 && " + "if ! command -v cargo &> /dev/null; then " + "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y && source $HOME/.cargo/env; fi && " + "if ! 
command -v maturin &> /dev/null; then " + "pip install --break-system-packages maturin; fi; fi && " + "ORIG_DIR=$(pwd) && rm -rf /tmp/dynamo_build && mkdir -p /tmp/dynamo_build && cd /tmp/dynamo_build && " + "git clone https://github.com/ai-dynamo/dynamo.git && " + "cd dynamo && " + f"{checkout_cmd + ' && ' if checkout_cmd else ''}" + "cd lib/bindings/python/ && " + 'export RUSTFLAGS="${RUSTFLAGS:-} -C target-cpu=native --cfg tokio_unstable" && ' + "rm -f /tmp/ai_dynamo_runtime*.whl && " + "maturin build -o /tmp && " + "pip install --break-system-packages /tmp/ai_dynamo_runtime*.whl --force-reinstall && " + "cd /tmp/dynamo_build/dynamo/ && " + "pip install --break-system-packages -e . && " + "cd $ORIG_DIR && " + f"echo 'Dynamo installed from source ({git_ref})'" + ) + + return ( + f"echo 'Installing dynamo from source ({git_ref})...' && " + f"if [ -d /sgl-workspace ]; then {sglang}; else {portable}; fi" + ) + Schema: ClassVar[type[Schema]] = Schema diff --git a/tests/test_configs.py b/tests/test_configs.py index 1c23fb30..b1ef1736 100644 --- a/tests/test_configs.py +++ b/tests/test_configs.py @@ -127,7 +127,11 @@ def test_hash_install_command(self): assert "git clone" in cmd assert "git checkout abc123" in cmd assert "maturin build" in cmd - assert "pip install -e" in cmd + assert "if [ -d /sgl-workspace ]" in cmd + assert "/tmp/dynamo_build" in cmd + assert "protobuf-compiler" in cmd + assert "if ! command -v cargo" in cmd + assert "if ! 
command -v maturin" in cmd def test_top_of_tree_install_command(self): """Top-of-tree config generates source install without checkout.""" @@ -140,6 +144,10 @@ def test_top_of_tree_install_command(self): assert "git clone" in cmd assert "git checkout" not in cmd assert "maturin build" in cmd + assert "if [ -d /sgl-workspace ]" in cmd + assert "/tmp/dynamo_build" in cmd + assert "--break-system-packages" in cmd + assert "--force-reinstall" in cmd def test_hash_and_top_of_tree_not_allowed(self): """Cannot specify both hash and top_of_tree.""" From 8294e64ee2eefa075c2502a62e19e8cd8e6ca23a Mon Sep 17 00:00:00 2001 From: nlevin-ui Date: Mon, 6 Apr 2026 17:19:27 -0600 Subject: [PATCH 02/14] Add Kimi-K2.5 vLLM recipes and fix NIXL side channel host (#11) * Add Kimi-K2.5 vLLM recipes and fix NIXL side channel host - Add kimi-k2.5 1k1k and 8k1k disagg GB200 recipes (from NVIDIA/srt-slurm#7) - Fix vLLM NIXL handshake failures: set VLLM_NIXL_SIDE_CHANNEL_HOST to node's routable IP in get_process_environment() instead of leaving it as 0.0.0.0/localhost which caused transfer handshake failures - Update test_vllm_get_process_environment to cover NIXL host env var Co-Authored-By: Claude Sonnet 4.6 * ci: run checks on PRs targeting sa-submission-q2-2026 Co-Authored-By: Claude Sonnet 4.6 --------- Co-authored-by: Claude Sonnet 4.6 --- .github/workflows/ci.yaml | 3 +- .../1k1k/disagg-gb200-1p1d-dep4-dep16.yaml | 101 ++++++++++++++++++ .../1k1k/disagg-gb200-1p4d-dep4-tep4.yaml | 98 +++++++++++++++++ .../8k1k/disagg-gb200-1p4d-dep4-tep4.yaml | 98 +++++++++++++++++ .../8k1k/disagg-gb200-3p1d-dep4-dep16.yaml | 101 ++++++++++++++++++ .../8k1k/disagg-gb200-5p1d-dep4-dep8.yaml | 101 ++++++++++++++++++ .../8k1k/disagg-gb200-6p1d-dep4-dep16.yaml | 101 ++++++++++++++++++ src/srtctl/backends/vllm.py | 4 + tests/test_configs.py | 6 +- 9 files changed, 611 insertions(+), 2 deletions(-) create mode 100644 recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml create mode 100644 
recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml create mode 100644 recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml create mode 100644 recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml create mode 100644 recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml create mode 100644 recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index eba897bb..dccdba05 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -4,7 +4,7 @@ on: push: branches: [main, master] pull_request: - branches: [main, master] + branches: [main, master, sa-submission-q2-2026] jobs: lint: @@ -119,3 +119,4 @@ jobs: exit(1) print(f'\nAll {len(recipes)} recipes valid') " + diff --git a/recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml b/recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml new file mode 100644 index 00000000..ecdc9233 --- /dev/null +++ b/recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml @@ -0,0 +1,101 @@ +name: "kimi-vllm-disagg-gb200-1p1d-dep4-dep16" + +model: + path: "kimi-k2.5-nvfp4" + container: "vllm/vllm-openai:v0.18.0-cu130" + precision: "fp4" + +dynamo: + version: 1.0.1 + install: true + +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + decode_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", 
"kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 3072 + max-num-seqs: 4096 + enforce-eager: true + compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + max-num-batched-tokens: 16384 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}' + all2all-backend: "flashinfer_nvlink_one_sided" + gpu-memory-utilization: 0.9 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 3072 + max-num-seqs: 4096 + max-num-batched-tokens: 10240 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + async-scheduling: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + all2all-backend: "flashinfer_nvlink_one_sided" + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + max-cudagraph-capture-size: 512 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "256x512x1024x2048x3072x4096" + req_rate: "inf" diff --git a/recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml b/recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml new file 
mode 100644 index 00000000..43167b5f --- /dev/null +++ b/recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml @@ -0,0 +1,98 @@ +name: "kimi-vllm-disagg-gb200-1p4d-dep4-tep4" + +model: + path: "kimi-k2.5-nvfp4" + container: "vllm/vllm-openai:v0.18.0-cu130" + precision: "fp4" + +dynamo: + version: 1.0.1 + install: true + +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + decode_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 3072 + max-num-seqs: 1024 + enforce-eager: true + compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + max-num-batched-tokens: 16384 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}' + all2all-backend: "flashinfer_nvlink_one_sided" + gpu-memory-utilization: 0.9 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: 
"nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 3072 + max-num-seqs: 1024 + max-num-batched-tokens: 10240 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + async-scheduling: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + max-cudagraph-capture-size: 1024 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x8x16x32x64x128" + req_rate: "inf" diff --git a/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml b/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml new file mode 100644 index 00000000..1ab6ca27 --- /dev/null +++ b/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml @@ -0,0 +1,98 @@ +name: "kimi-vllm-disagg-gb200-1p4d-dep4-tep4" + +model: + path: "kimi-k2.5-nvfp4" + container: "vllm/vllm-openai:v0.18.0-cu130" + precision: "fp4" + +dynamo: + version: 1.0.1 + install: true + +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + decode_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": 
"NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 64 + enforce-eager: true + compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + max-num-batched-tokens: 16384 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}' + all2all-backend: "flashinfer_nvlink_one_sided" + gpu-memory-utilization: 0.9 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 16 + max-num-batched-tokens: 10240 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + async-scheduling: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + max-cudagraph-capture-size: 16 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x16x32x128" + req_rate: "inf" diff --git a/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml b/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml new file mode 100644 index 00000000..ca4e9813 --- /dev/null +++ 
b/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml @@ -0,0 +1,101 @@ +name: "kimi-vllm-disagg-gb200-3p1d-dep4-dep16" + +model: + path: "kimi-k2.5-nvfp4" + container: "vllm/vllm-openai:v0.18.0-cu130" + precision: "fp4" + +dynamo: + version: 1.0.1 + install: true + +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 3 + decode_nodes: 4 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + decode_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 64 + enforce-eager: true + compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + max-num-batched-tokens: 16384 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}' + all2all-backend: "flashinfer_nvlink_one_sided" + gpu-memory-utilization: 0.9 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + 
tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 256 + max-num-batched-tokens: 10240 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + async-scheduling: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + all2all-backend: "flashinfer_nvlink_one_sided" + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + max-cudagraph-capture-size: 256 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512x1024" + req_rate: "inf" diff --git a/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml b/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml new file mode 100644 index 00000000..cd9f94a9 --- /dev/null +++ b/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml @@ -0,0 +1,101 @@ +name: "kimi-vllm-disagg-gb200-5p1d-dep4-dep8" + +model: + path: "kimi-k2.5-nvfp4" + container: "vllm/vllm-openai:v0.18.0-cu130" + precision: "fp4" + +dynamo: + version: 1.0.1 + install: true + +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 5 + decode_nodes: 2 + prefill_workers: 5 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + decode_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + 
prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 64 + enforce-eager: true + compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + max-num-batched-tokens: 16384 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}' + all2all-backend: "flashinfer_nvlink_one_sided" + gpu-memory-utilization: 0.9 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 512 + max-num-batched-tokens: 10240 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + async-scheduling: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + all2all-backend: "flashinfer_nvlink_one_sided" + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + max-cudagraph-capture-size: 512 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2048" + req_rate: "inf" diff --git a/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml 
b/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml new file mode 100644 index 00000000..47d3d7ee --- /dev/null +++ b/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml @@ -0,0 +1,101 @@ +name: "kimi-vllm-disagg-gb200-6p1d-dep4-dep16" + +model: + path: "kimi-k2.5-nvfp4" + container: "vllm/vllm-openai:v0.18.0-cu130" + precision: "fp4" + +dynamo: + version: 1.0.1 + install: true + +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 4 + prefill_workers: 6 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + decode_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 64 + enforce-eager: true + compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + max-num-batched-tokens: 16384 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}' + all2all-backend: "flashinfer_nvlink_one_sided" + gpu-memory-utilization: 0.9 + + decode: + kv-transfer-config: 
'{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 512 + max-num-batched-tokens: 10240 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + async-scheduling: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + all2all-backend: "flashinfer_nvlink_one_sided" + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + max-cudagraph-capture-size: 512 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "3072x4096" + req_rate: "inf" diff --git a/src/srtctl/backends/vllm.py b/src/srtctl/backends/vllm.py index ff20cb40..1acbd50c 100644 --- a/src/srtctl/backends/vllm.py +++ b/src/srtctl/backends/vllm.py @@ -132,12 +132,16 @@ def get_process_environment(self, process: Process) -> dict[str, str]: vLLM with dynamo requires unique ports for each worker: - DYN_VLLM_KV_EVENT_PORT: ZMQ port for KV events publishing - VLLM_NIXL_SIDE_CHANNEL_PORT: Port for NIXL side channel transfers + - VLLM_NIXL_SIDE_CHANNEL_HOST: Routable IP for NIXL side channel (not 0.0.0.0/localhost) """ + from srtctl.core.slurm import get_hostname_ip + env: dict[str, str] = {} if process.kv_events_port is not None: env["DYN_VLLM_KV_EVENT_PORT"] = str(process.kv_events_port) if process.nixl_port is not None: env["VLLM_NIXL_SIDE_CHANNEL_PORT"] = str(process.nixl_port) + env["VLLM_NIXL_SIDE_CHANNEL_HOST"] = get_hostname_ip(process.node) return env def get_served_model_name(self, default: str) -> str: diff --git a/tests/test_configs.py b/tests/test_configs.py index 
b1ef1736..86d79cdb 100644 --- a/tests/test_configs.py +++ b/tests/test_configs.py @@ -1080,6 +1080,8 @@ def test_standard_tp_mode_still_works(self): def test_vllm_get_process_environment(self): """Test vLLM sets port environment variables from process.""" + from unittest.mock import patch + from srtctl.backends import VLLMProtocol from srtctl.core.topology import Process @@ -1098,10 +1100,12 @@ def test_vllm_get_process_environment(self): nixl_port=6550, ) - env = backend.get_process_environment(process) + with patch("srtctl.core.slurm.get_hostname_ip", return_value="10.0.0.1"): + env = backend.get_process_environment(process) assert env["DYN_VLLM_KV_EVENT_PORT"] == "5550" assert env["VLLM_NIXL_SIDE_CHANNEL_PORT"] == "6550" + assert env["VLLM_NIXL_SIDE_CHANNEL_HOST"] == "10.0.0.1" def test_vllm_get_process_environment_none_ports(self): """Test vLLM handles None ports gracefully.""" From 94903bdb6352b048305b407d28fea7e0f4ae2f65 Mon Sep 17 00:00:00 2001 From: Yeswanth koti Date: Fri, 10 Apr 2026 01:54:10 -0400 Subject: [PATCH 03/14] =?UTF-8?q?Add=20Kimi=20K2.5=20disagg=20STP=20and=20?= =?UTF-8?q?MTP=20recipes=20for=20GB200=20NVfp4=20(ISL8K=5FOSL1K=E2=80=A6?= =?UTF-8?q?=20(#24)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add Kimi K2.5 disagg STP and MTP recipes for GB200 NVfp4 (ISL8K_OSL1K and ISL1K_OSL1K) Add optimized disaggregated inference recipes for Kimi K2.5 model with NVfp4 precision on GB200 GPUs. Includes both STP and MTP configurations for ISL8K_OSL1K and ISL1K_OSL1K workloads covering concurrency points from 5 to 2253, with Eagle speculative decoding for MTP variants. 
* Update Kimi K2.5 recipes: container, model path, concurrency format, and env cleanup - Update container to tensorrtllm-runtime-1.1.0-dev.2.sqsh - Point model path to shared /mnt/lustre01/models/kimi-k2.5-nvfp4 - Update Eagle model mount path for MTP configs - Remove HF_HOME (defaults to ~/.cache/huggingface) - Fix concurrency separator from space to 'x' for sa-bench compatibility - Enable multiple frontends for ctx1dep4_gen1dep32_batch64 * Use generic model path and container aliases for cluster portability Replace cluster-specific paths with generic alias names that are resolved via srtslurm.yaml model_paths and containers mappings, as per upstream convention. * Add extra_mount alias resolution and use generic Eagle model path Add model_paths alias resolution for extra_mount host paths in config.py, enabling MTP recipes to use generic name "kimi-k2.5-eagle3" instead of cluster-specific path for the Eagle speculative decoding model. * Use HuggingFace model names and full NVCR container paths Per review feedback, update model paths to HuggingFace format (nvidia/Kimi-K2.5-NVFP4) and container to full NVCR registry path (nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2) so recipes are portable and work without pre-built sqsh files. 
--------- Co-authored-by: nlevin-ui --- ...ctx1dep4_gen1dep16_batch32_eplb0_mtp3.yaml | 136 +++++++++++ ...ctx1dep4_gen1dep32_batch16_eplb0_mtp3.yaml | 134 +++++++++++ ...ctx1dep4_gen1dep8_batch512_eplb0_mtp1.yaml | 196 ++++++++++++++++ ...4_gen4tep8_batch64_allconc_eplb0_mtp3.yaml | 141 ++++++++++++ ...p4_gen5tep4_batch2_allconc_eplb0_mtp3.yaml | 132 +++++++++++ ...tx2dep4_gen1dep16_batch128_eplb0_mtp1.yaml | 148 ++++++++++++ ...ctx2dep4_gen1dep32_batch64_eplb0_mtp1.yaml | 140 +++++++++++ ...ctx2dep4_gen3dep8_batch256_eplb0_mtp1.yaml | 164 +++++++++++++ ...ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml | 125 ++++++++++ ...ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml | 129 +++++++++++ ..._gen1dep8_batch768_allconc_eplb0_mtp0.yaml | 217 ++++++++++++++++++ ..._gen4tep8_batch128_allconc_eplb0_mtp0.yaml | 138 +++++++++++ ...p4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml | 122 ++++++++++ ...tx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml | 153 ++++++++++++ ...tx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml | 137 +++++++++++ .../ctx1dep4_gen2tep8_batch32_eplb0_mtp3.yaml | 137 +++++++++++ .../ctx1dep4_gen4tep8_batch1_eplb0_mtp3.yaml | 133 +++++++++++ ...p4_gen5tep4_batch8_allconc_eplb0_mtp3.yaml | 133 +++++++++++ .../ctx2dep4_gen1dep16_batch8_eplb0_mtp3.yaml | 133 +++++++++++ ...ctx5dep4_gen1dep32_batch16_eplb0_mtp3.yaml | 134 +++++++++++ ..._gen1dep8_batch256_allconc_eplb0_mtp1.yaml | 164 +++++++++++++ ...ctx8dep4_gen1dep32_batch32_eplb0_mtp3.yaml | 136 +++++++++++ ...4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml | 126 ++++++++++ ...p4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml | 123 ++++++++++ ...4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml | 126 ++++++++++ ...ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml | 124 ++++++++++ ...ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml | 126 ++++++++++ ..._gen1dep8_batch256_allconc_eplb0_mtp0.yaml | 155 +++++++++++++ ...tx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml | 138 +++++++++++ src/srtctl/core/config.py | 14 ++ 30 files changed, 4114 insertions(+) create mode 100644 
recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch32_eplb0_mtp3.yaml create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch16_eplb0_mtp3.yaml create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep8_batch512_eplb0_mtp1.yaml create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch64_allconc_eplb0_mtp3.yaml create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen5tep4_batch2_allconc_eplb0_mtp3.yaml create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep16_batch128_eplb0_mtp1.yaml create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep32_batch64_eplb0_mtp1.yaml create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen3dep8_batch256_eplb0_mtp1.yaml create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml create mode 100644 
recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen2tep8_batch32_eplb0_mtp3.yaml create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch1_eplb0_mtp3.yaml create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp3.yaml create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx2dep4_gen1dep16_batch8_eplb0_mtp3.yaml create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep32_batch16_eplb0_mtp3.yaml create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp1.yaml create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx8dep4_gen1dep32_batch32_eplb0_mtp3.yaml create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml create mode 100644 recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch32_eplb0_mtp3.yaml 
b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch32_eplb0_mtp3.yaml new file mode 100644 index 00000000..03462b07 --- /dev/null +++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch32_eplb0_mtp3.yaml @@ -0,0 +1,136 @@ +name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep16_batch32_eplb0_mtp3" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=32 +# MTP (Eagle speculative decoding, max_draft_len=3) +# concurrency: 666 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: Eagle + max_draft_len: 3 + speculative_model_dir: "/eagle-model" + + decode: + tensor_parallel_size: 16 + 
moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + trust_remote_code: true + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: Eagle + max_draft_len: 3 + speculative_model_dir: "/eagle-model" + +extra_mount: + - "nvidia/Kimi-K2.5-Thinking-Eagle3:/eagle-model" + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "666" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch16_eplb0_mtp3.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch16_eplb0_mtp3.yaml new file mode 100644 index 00000000..6a29059c --- /dev/null +++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch16_eplb0_mtp3.yaml @@ -0,0 +1,134 @@ +name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep32_batch16_eplb0_mtp3" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=16 +# MTP (Eagle speculative decoding, max_draft_len=3) +# concurrency: 666 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + 
prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: Eagle + max_draft_len: 3 + speculative_model_dir: "/eagle-model" + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + trust_remote_code: true + max_batch_size: 16 + max_num_tokens: 64 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + 
decoding_type: Eagle + max_draft_len: 3 + speculative_model_dir: "/eagle-model" + +extra_mount: + - "nvidia/Kimi-K2.5-Thinking-Eagle3:/eagle-model" + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "666" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep8_batch512_eplb0_mtp1.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep8_batch512_eplb0_mtp1.yaml new file mode 100644 index 00000000..739bd487 --- /dev/null +++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep8_batch512_eplb0_mtp1.yaml @@ -0,0 +1,196 @@ +name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep8_batch512_eplb0_mtp1" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 1 decode worker, TP8/EP8, enable_attention_dp=true, max_batch=512 +# MTP (Eagle speculative decoding, max_draft_len=1) +# concurrency: 4301 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: 
true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: Eagle + max_draft_len: 1 + speculative_model_dir: "/eagle-model" + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + trust_remote_code: true + max_batch_size: 512 + max_num_tokens: 1024 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + - 264 + - 272 + - 280 + - 288 + - 296 + - 304 + - 312 + - 320 + - 328 + - 336 + - 344 + - 352 + - 360 + - 368 + - 376 + - 384 + - 392 + - 400 + - 408 + - 416 + - 424 + - 432 + - 440 + - 448 + - 456 + - 464 + - 472 + - 480 + - 488 + - 496 + - 504 + - 512 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: Eagle + max_draft_len: 1 + speculative_model_dir: "/eagle-model" + +extra_mount: + - "nvidia/Kimi-K2.5-Thinking-Eagle3:/eagle-model" + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4301" + req_rate: "inf" + +frontend: + type: 
"dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch64_allconc_eplb0_mtp3.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch64_allconc_eplb0_mtp3.yaml new file mode 100644 index 00000000..a768bec4 --- /dev/null +++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch64_allconc_eplb0_mtp3.yaml @@ -0,0 +1,141 @@ +name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx1dep4_gen4tep8_batch64_allconc_eplb0_mtp3" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 4 decode workers, TP8/EP8, allreduce_strategy=MNNVL, max_batch=64 +# MTP (Eagle speculative decoding, max_draft_len=3) +# Covers all gen4tep8 concurrencies: 8, 48, 92, 192, 336 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: 
TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: Eagle + max_draft_len: 3 + speculative_model_dir: "/eagle-model" + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + allreduce_strategy: MNNVL + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 64 + max_num_tokens: 256 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: Eagle + max_draft_len: 3 + speculative_model_dir: "/eagle-model" + +extra_mount: + - "nvidia/Kimi-K2.5-Thinking-Eagle3:/eagle-model" + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "8x48x92x192x336" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen5tep4_batch2_allconc_eplb0_mtp3.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen5tep4_batch2_allconc_eplb0_mtp3.yaml new file mode 100644 index 00000000..c2e24b41 --- /dev/null +++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen5tep4_batch2_allconc_eplb0_mtp3.yaml @@ -0,0 +1,132 @@ +name: 
"kimi_k25_nvfp4_ISL1K_OSL1K_ctx1dep4_gen5tep4_batch2_allconc_eplb0_mtp3" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 5 decode workers, TP4/EP4, max_batch=2 +# MTP (Eagle speculative decoding, max_draft_len=3) +# Covers all gen5tep4 concurrencies: 10, 15 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + gpus_per_decode: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: Eagle + max_draft_len: 3 + speculative_model_dir: "/eagle-model" + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 2 + max_num_tokens: 8 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + 
enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: Eagle + max_draft_len: 3 + speculative_model_dir: "/eagle-model" + +extra_mount: + - "nvidia/Kimi-K2.5-Thinking-Eagle3:/eagle-model" + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "10x15" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep16_batch128_eplb0_mtp1.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep16_batch128_eplb0_mtp1.yaml new file mode 100644 index 00000000..68d7dd06 --- /dev/null +++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep16_batch128_eplb0_mtp1.yaml @@ -0,0 +1,148 @@ +name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx2dep4_gen1dep16_batch128_eplb0_mtp1" + +# ctx: 2 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=128 +# MTP (Eagle speculative decoding, max_draft_len=1) +# concurrency: 2253 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" 
+ TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: Eagle + max_draft_len: 1 + speculative_model_dir: "/eagle-model" + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + trust_remote_code: true + max_batch_size: 128 + max_num_tokens: 256 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: Eagle + max_draft_len: 1 + speculative_model_dir: "/eagle-model" + +extra_mount: + - "nvidia/Kimi-K2.5-Thinking-Eagle3:/eagle-model" + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2253" + req_rate: 
"inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep32_batch64_eplb0_mtp1.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep32_batch64_eplb0_mtp1.yaml new file mode 100644 index 00000000..1cb17478 --- /dev/null +++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep32_batch64_eplb0_mtp1.yaml @@ -0,0 +1,140 @@ +name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx2dep4_gen1dep32_batch64_eplb0_mtp1" + +# ctx: 2 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=64 +# MTP (Eagle speculative decoding, max_draft_len=1) +# concurrency: 2253 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: 
fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: Eagle + max_draft_len: 1 + speculative_model_dir: "/eagle-model" + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + trust_remote_code: true + max_batch_size: 64 + max_num_tokens: 128 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: Eagle + max_draft_len: 1 + speculative_model_dir: "/eagle-model" + +extra_mount: + - "nvidia/Kimi-K2.5-Thinking-Eagle3:/eagle-model" + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2253" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen3dep8_batch256_eplb0_mtp1.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen3dep8_batch256_eplb0_mtp1.yaml new file mode 100644 index 00000000..eb43aab7 --- /dev/null +++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen3dep8_batch256_eplb0_mtp1.yaml @@ -0,0 +1,164 @@ +name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx2dep4_gen3dep8_batch256_eplb0_mtp1" + +# ctx: 2 prefill workers, TP4/EP4 +# gen: 3 decode 
workers, TP8/EP8, enable_attention_dp=true, max_batch=256 +# MTP (Eagle speculative decoding, max_draft_len=1) +# concurrency: 6759 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + + decode_workers: 3 + decode_nodes: 6 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: Eagle + max_draft_len: 1 + speculative_model_dir: "/eagle-model" + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + trust_remote_code: true + max_batch_size: 256 + max_num_tokens: 512 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 
+ - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: Eagle + max_draft_len: 1 + speculative_model_dir: "/eagle-model" + +extra_mount: + - "nvidia/Kimi-K2.5-Thinking-Eagle3:/eagle-model" + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "6759" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml new file mode 100644 index 00000000..ce3eff43 --- /dev/null +++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml @@ -0,0 +1,125 @@ +name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep16_batch32_eplb0_mtp0" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=32 +# STP (no speculative decoding) +# concurrency: 666 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + 
NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "666" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git 
a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml new file mode 100644 index 00000000..105b84bf --- /dev/null +++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml @@ -0,0 +1,129 @@ +name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep32_batch64_eplb0_mtp0" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=64 +# STP (no speculative decoding) +# concurrency: 2253 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 32 + 
moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2253" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: true + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml new file mode 100644 index 00000000..9fb194dd --- /dev/null +++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml @@ -0,0 +1,217 @@ +name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 1 decode worker, TP8/EP8, enable_attention_dp=true, max_batch=768 +# STP (no speculative decoding) +# Covers all dep8 concurrencies: 4301, 6452 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 2 + gpus_per_decode: 8 
+ + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 768 + max_num_tokens: 768 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + - 264 + - 272 + - 280 + - 288 + - 296 + - 304 + - 312 + - 320 + - 328 + - 336 + - 344 + - 352 + - 360 + - 368 + - 376 + - 384 + - 392 + - 400 + - 408 + - 416 + - 424 + - 432 + - 440 + - 448 + - 456 + - 464 + - 472 + - 480 + - 488 + - 496 + - 504 + - 512 + - 520 + - 528 + - 536 + - 544 + - 552 + - 560 + - 568 + - 576 + - 584 + - 592 + - 600 + - 608 + 
- 616 + - 624 + - 632 + - 640 + - 648 + - 656 + - 664 + - 672 + - 680 + - 688 + - 696 + - 704 + - 712 + - 720 + - 728 + - 736 + - 744 + - 752 + - 760 + - 768 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4301x6452" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml new file mode 100644 index 00000000..5639da41 --- /dev/null +++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml @@ -0,0 +1,138 @@ +name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 4 decode workers, TP8/EP8, allreduce_strategy=MNNVL, max_batch=128 +# STP (no speculative decoding) +# Covers all gen4tep8 concurrencies: 4, 192, 360, 668 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + 
TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + allreduce_strategy: MNNVL + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 128 + max_num_tokens: 128 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x192x360x668" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git 
a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml new file mode 100644 index 00000000..f9496feb --- /dev/null +++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml @@ -0,0 +1,122 @@ +name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 5 decode workers, TP4/EP4, max_batch=8 +# STP (no speculative decoding) +# Covers all gen5tep4 concurrencies: 5, 15, 30, 55 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + gpus_per_decode: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + 
tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 8 + max_num_tokens: 8 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "5x15x30x55" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml new file mode 100644 index 00000000..71b016c4 --- /dev/null +++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml @@ -0,0 +1,153 @@ +name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx2dep4_gen1dep16_batch256_eplb0_mtp0" + +# ctx: 2 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=256 +# STP (no speculative decoding) +# concurrency: 4301 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + 
prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 256 + max_num_tokens: 256 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: 
"4301" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml new file mode 100644 index 00000000..52b75bb4 --- /dev/null +++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml @@ -0,0 +1,137 @@ +name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx2dep4_gen1dep32_batch128_eplb0_mtp0" + +# ctx: 2 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=128 +# STP (no speculative decoding) +# concurrency: 4301 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + 
dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 128 + max_num_tokens: 128 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4301" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen2tep8_batch32_eplb0_mtp3.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen2tep8_batch32_eplb0_mtp3.yaml new file mode 100644 index 00000000..bb3f8d1e --- /dev/null +++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen2tep8_batch32_eplb0_mtp3.yaml @@ -0,0 +1,137 @@ +name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx1dep4_gen2tep8_batch32_eplb0_mtp3" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 2 decode workers, TP8/EP8, allreduce_strategy=MNNVL, max_batch=32 +# MTP Eagle speculative decoding, max_draft_len=3 +# concurrency: 90 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: 
"nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 2 + decode_nodes: 4 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: Eagle + max_draft_len: 3 + speculative_model_dir: "/eagle-model" + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + allreduce_strategy: MNNVL + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: 
UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: Eagle + max_draft_len: 3 + speculative_model_dir: "/eagle-model" + +extra_mount: + - "nvidia/Kimi-K2.5-Thinking-Eagle3:/eagle-model" + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "90" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch1_eplb0_mtp3.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch1_eplb0_mtp3.yaml new file mode 100644 index 00000000..8b7f02d6 --- /dev/null +++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch1_eplb0_mtp3.yaml @@ -0,0 +1,133 @@ +name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx1dep4_gen4tep8_batch1_eplb0_mtp3" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 4 decode workers, TP8/EP8, allreduce_strategy=MNNVL, max_batch=1 +# MTP Eagle speculative decoding, max_draft_len=3 +# concurrency: 8 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + 
trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: Eagle + max_draft_len: 3 + speculative_model_dir: "/eagle-model" + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + allreduce_strategy: MNNVL + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 1 + max_num_tokens: 4 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: Eagle + max_draft_len: 3 + speculative_model_dir: "/eagle-model" + +extra_mount: + - "nvidia/Kimi-K2.5-Thinking-Eagle3:/eagle-model" + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "8" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp3.yaml 
b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp3.yaml new file mode 100644 index 00000000..1883e739 --- /dev/null +++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp3.yaml @@ -0,0 +1,133 @@ +name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp3" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 5 decode workers, TP4/EP4, max_batch=8 +# MTP Eagle speculative decoding, max_draft_len=3 +# Covers all gen5tep4 concurrencies: 10, 15, 60 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + gpus_per_decode: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: Eagle + max_draft_len: 3 + speculative_model_dir: "/eagle-model" + + decode: + 
tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: Eagle + max_draft_len: 3 + speculative_model_dir: "/eagle-model" + +extra_mount: + - "nvidia/Kimi-K2.5-Thinking-Eagle3:/eagle-model" + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "10x15x60" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx2dep4_gen1dep16_batch8_eplb0_mtp3.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx2dep4_gen1dep16_batch8_eplb0_mtp3.yaml new file mode 100644 index 00000000..5aced422 --- /dev/null +++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx2dep4_gen1dep16_batch8_eplb0_mtp3.yaml @@ -0,0 +1,133 @@ +name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx2dep4_gen1dep16_batch8_eplb0_mtp3" + +# ctx: 2 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=8 +# MTP Eagle speculative decoding, max_draft_len=3 +# concurrency: 180 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + 
prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: Eagle + max_draft_len: 3 + speculative_model_dir: "/eagle-model" + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + trust_remote_code: true + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: 
Eagle + max_draft_len: 3 + speculative_model_dir: "/eagle-model" + +extra_mount: + - "nvidia/Kimi-K2.5-Thinking-Eagle3:/eagle-model" + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "180" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep32_batch16_eplb0_mtp3.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep32_batch16_eplb0_mtp3.yaml new file mode 100644 index 00000000..764f2d46 --- /dev/null +++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep32_batch16_eplb0_mtp3.yaml @@ -0,0 +1,134 @@ +name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx5dep4_gen1dep32_batch16_eplb0_mtp3" + +# ctx: 5 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=16 +# MTP Eagle speculative decoding, max_draft_len=3 +# concurrency: 666 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 5 + prefill_workers: 5 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + 
disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: Eagle + max_draft_len: 3 + speculative_model_dir: "/eagle-model" + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + trust_remote_code: true + max_batch_size: 16 + max_num_tokens: 64 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: Eagle + max_draft_len: 3 + speculative_model_dir: "/eagle-model" + +extra_mount: + - "nvidia/Kimi-K2.5-Thinking-Eagle3:/eagle-model" + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "666" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp1.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp1.yaml new file mode 100644 index 00000000..31308fe6 --- /dev/null +++ 
b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp1.yaml @@ -0,0 +1,164 @@ +name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp1" + +# ctx: 5 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP8/EP8, enable_attention_dp=true, max_batch=256 +# MTP Eagle speculative decoding, max_draft_len=1 +# Covers all dep8 mtp1 concurrencies: 1229, 2253 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 5 + prefill_workers: 5 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: Eagle + max_draft_len: 1 + speculative_model_dir: "/eagle-model" + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + 
trust_remote_code: true + max_batch_size: 256 + max_num_tokens: 512 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: Eagle + max_draft_len: 1 + speculative_model_dir: "/eagle-model" + +extra_mount: + - "nvidia/Kimi-K2.5-Thinking-Eagle3:/eagle-model" + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1229x2253" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx8dep4_gen1dep32_batch32_eplb0_mtp3.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx8dep4_gen1dep32_batch32_eplb0_mtp3.yaml new file mode 100644 index 00000000..9bd03c05 --- /dev/null +++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx8dep4_gen1dep32_batch32_eplb0_mtp3.yaml @@ -0,0 +1,136 @@ +name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx8dep4_gen1dep32_batch32_eplb0_mtp3" + +# ctx: 8 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=32 +# MTP Eagle speculative decoding, max_draft_len=3 +# concurrency: 1229 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: 
"nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 8 + prefill_workers: 8 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: Eagle + max_draft_len: 3 + speculative_model_dir: "/eagle-model" + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + trust_remote_code: true + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 
16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: Eagle + max_draft_len: 3 + speculative_model_dir: "/eagle-model" + +extra_mount: + - "nvidia/Kimi-K2.5-Thinking-Eagle3:/eagle-model" + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1229" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml new file mode 100644 index 00000000..8c1f0aa8 --- /dev/null +++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml @@ -0,0 +1,126 @@ +name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 4 decode workers, TP4/EP4, max_batch=32 +# Single concurrency point: 156 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + # Prefill: 1 worker x TP4 = 4 GPUs = 1 node + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + # Decode: 4 workers x TP4 = 16 GPUs = 4 nodes + decode_workers: 4 + decode_nodes: 4 + gpus_per_decode: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + 
TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "156" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml new file mode 100644 index 00000000..d4c5086b --- /dev/null +++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml 
@@ -0,0 +1,123 @@ +name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 4 decode workers, TP8/EP8, allreduce_strategy=MNNVL, max_batch=1 +# Single concurrency point: 4 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + # Prefill: 1 worker x TP4 = 4 GPUs = 1 node + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + # Decode: 4 workers x TP8 = 32 GPUs = 8 nodes + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + allreduce_strategy: MNNVL + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + 
enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml new file mode 100644 index 00000000..8f6ea063 --- /dev/null +++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml @@ -0,0 +1,126 @@ +name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 5 decode workers, TP4/EP4, max_batch=16 +# Covers all concurrencies: 5, 15, 30, 60, 105 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + # Prefill: 1 worker x TP4 = 4 GPUs = 1 node + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + # Decode: 5 workers x TP4 = 20 GPUs = 5 nodes + decode_workers: 5 + decode_nodes: 5 + gpus_per_decode: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" 
+ TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + # max_batch_size=16 covers all concs: 5, 15, 30, 60, 105 + # cuda_graph pre-compiles graphs for each batch size up to the max + max_batch_size: 16 + max_num_tokens: 16 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "5x15x30x60x105" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml 
b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml new file mode 100644 index 00000000..4bfaa0e2 --- /dev/null +++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml @@ -0,0 +1,124 @@ +name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx2dep4_gen1dep16_batch16_eplb0_mtp0" + +# ctx: 2 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=16 +# concurrency: 333 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + # Prefill: 2 workers x TP4 = 8 GPUs = 2 nodes + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + + # Decode: 1 worker x TP16 = 16 GPUs = 4 nodes + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + 
enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 16 + max_num_tokens: 16 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "333" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml new file mode 100644 index 00000000..d7d51627 --- /dev/null +++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml @@ -0,0 +1,126 @@ +name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx3dep4_gen1dep16_batch32_eplb0_mtp0" + +# ctx: 3 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=32 +# concurrency: 615 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + # Prefill: 3 workers x TP4 = 12 GPUs = 3 nodes + prefill_nodes: 3 + prefill_workers: 3 + gpus_per_prefill: 4 + + # Decode: 1 worker x TP16 = 16 GPUs = 4 nodes + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + 
ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "615" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git 
a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml new file mode 100644 index 00000000..e8df1179 --- /dev/null +++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml @@ -0,0 +1,155 @@ +name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0" + +# ctx: 5 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP8/EP8, enable_attention_dp=true, max_batch=256 +# Single concurrency point: 2151 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + # Prefill: 5 workers x TP4 = 20 GPUs = 5 nodes + prefill_nodes: 5 + prefill_workers: 5 + gpus_per_prefill: 4 + + # Decode: 1 worker x TP8 = 8 GPUs = 2 nodes + decode_workers: 1 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + 
cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + # max_batch_size=256, cuda_graph pre-compiles graphs for all batch sizes up to 256 + max_batch_size: 256 + max_num_tokens: 256 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2151" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml new file mode 100644 index 00000000..db177892 --- /dev/null +++ b/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml @@ -0,0 +1,138 @@ +name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx7dep4_gen1dep16_batch128_eplb0_mtp0" + +# ctx: 7 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=128 +# 
concurrency: 2253 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + # Prefill: 7 workers x TP4 = 28 GPUs = 7 nodes + prefill_nodes: 7 + prefill_workers: 7 + gpus_per_prefill: 4 + + # Decode: 1 worker x TP16 = 16 GPUs = 4 nodes + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 128 + max_num_tokens: 128 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + 
kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2253" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/src/srtctl/core/config.py b/src/srtctl/core/config.py index 8cea4e17..f30fc7fc 100644 --- a/src/srtctl/core/config.py +++ b/src/srtctl/core/config.py @@ -141,6 +141,20 @@ def resolve_config_with_defaults(user_config: dict[str, Any], cluster_config: di config["reporting"] = cluster_config["reporting"] logger.debug("Applied cluster reporting config") + # Resolve extra_mount host path aliases through model_paths + extra_mounts = config.get("extra_mount", []) + if model_paths and extra_mounts: + resolved_mounts = [] + for mount_spec in extra_mounts: + host_path, container_path = mount_spec.split(":", 1) + if host_path in model_paths: + resolved_host = model_paths[host_path] + resolved_mounts.append(f"{resolved_host}:{container_path}") + logger.debug(f"Resolved extra_mount alias '{host_path}' -> '{resolved_host}'") + else: + resolved_mounts.append(mount_spec) + config["extra_mount"] = resolved_mounts + # Resolve frontend nginx_container alias frontend = config.get("frontend", {}) nginx_container = frontend.get("nginx_container", "") From b0f5b83f1949bc5d11059704ae84e263d89424d1 Mon Sep 17 00:00:00 2001 From: Camilo Moreno Date: Mon, 13 Apr 2026 11:20:29 -0700 Subject: [PATCH 04/14] Add Minimax M2.5 NVFP4 agg B200 single-node configs (#36) * recipes for minimax m2.5 fp4 b200 agg vllm * commit for signature --- recipes/vllm/minimax-m2.5/b200-fp4/1k1k.yaml | 103 +++++++++++++++++++ recipes/vllm/minimax-m2.5/b200-fp4/8k1k.yaml | 88 ++++++++++++++++ 2 files 
changed, 191 insertions(+) create mode 100644 recipes/vllm/minimax-m2.5/b200-fp4/1k1k.yaml create mode 100644 recipes/vllm/minimax-m2.5/b200-fp4/8k1k.yaml diff --git a/recipes/vllm/minimax-m2.5/b200-fp4/1k1k.yaml b/recipes/vllm/minimax-m2.5/b200-fp4/1k1k.yaml new file mode 100644 index 00000000..daef7b0d --- /dev/null +++ b/recipes/vllm/minimax-m2.5/b200-fp4/1k1k.yaml @@ -0,0 +1,103 @@ +# MiniMax-M2.5 NVFP4 B200 — 1K/1K ISL/OSL +# Aggregated vLLM, single-node +# requires github.com/NVIDIA/srt-slurm, branch sa-submission-q2-2026 +# usage examples: +# srtctl apply -f 1k1k.yaml # run all variants +# srtctl apply -f 1k1k.yaml:zip_override_lowlat # full lowlat sweep +# srtctl apply -f 1k1k.yaml:zip_override_lowlat[2] # lowlat, tep2 variant only +# srtctl apply -f 1k1k.yaml:zip_override_hightput # full high tput sweep +# srtctl dry-run -f 1k1k.yaml # preview the variants + +base: + name: "minimax-m2.5-nvfp4-b200-1k1k" + + model: + path: "minimax_m2.5_fp4" + container: "vllm/vllm-openai:v0.19.0-cu130" + precision: "fp4" + + resources: + gpu_type: "b200" + gpus_per_node: 8 + agg_nodes: 1 + agg_workers: 1 + gpus_per_agg: 1 + + frontend: + type: dynamo + enable_multiple_frontends: false + + dynamo: + install: true + top_of_tree: true # currently need ToT for vllm 0.19.0 + + setup_script: vllm-container-deps.sh + + backend: + type: vllm + + aggregated_environment: + DYN_HEALTH_CHECK_ENABLED: "false" + PYTHONUNBUFFERED: "1" + + vllm_config: + aggregated: + tensor-parallel-size: 1 + gpu-memory-utilization: 0.90 + max-model-len: 2248 + max-num-batched-tokens: 2048 + kv-cache-dtype: fp8 + max-cudagraph-capture-size: 2048 + stream-interval: 20 + no-enable-prefix-caching: true + trust-remote-code: true + + benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + req_rate: "inf" + + +zip_override_lowlat: + name: + - "minimax-m2.5-nvfp4-b200-1k1k-lowlat-tp1" + - "minimax-m2.5-nvfp4-b200-1k1k-lowlat-tp2" + - "minimax-m2.5-nvfp4-b200-1k1k-lowlat-tep2" + resources: + gpus_per_agg: [1, 2, 
2] + backend: + vllm_config: + aggregated: + tensor-parallel-size: [1, 2, 2] + enable-expert-parallel: [false, false, true] + benchmark: + concurrencies: ["4","4x8x16x32x64x128x256x512","128x256"] + +override_maxtput: + name: "minimax-m2.5-nvfp4-b200-1k1k-maxtput-dep2" + resources: + gpus_per_agg: 2 + backend: + vllm_config: + aggregated: + tensor-parallel-size: 1 + enable-expert-parallel: true + data-parallel-size: 2 + benchmark: + concurrencies: "512" + +zip_override_hightput: + name: + - "minimax-m2.5-nvfp4-b200-1k1k-hightput-tp4" + - "minimax-m2.5-nvfp4-b200-1k1k-hightput-tep4" + - "minimax-m2.5-nvfp4-b200-1k1k-hightput-tp8" + resources: + gpus_per_agg: [4, 4, 8] + backend: + vllm_config: + aggregated: + tensor-parallel-size: [4, 4, 8] + enable-expert-parallel: [false, true, false] + benchmark: + concurrencies: ["4x8x16x32x64x128x256x512", "32x64x128", "4"] diff --git a/recipes/vllm/minimax-m2.5/b200-fp4/8k1k.yaml b/recipes/vllm/minimax-m2.5/b200-fp4/8k1k.yaml new file mode 100644 index 00000000..7d817e73 --- /dev/null +++ b/recipes/vllm/minimax-m2.5/b200-fp4/8k1k.yaml @@ -0,0 +1,88 @@ +# MiniMax-M2.5 NVFP4 B200 — 8K/1K ISL/OSL +# Aggregated vLLM, single-node +# requires github.com/NVIDIA/srt-slurm, branch sa-submission-q2-2026 +# usage examples: +# srtctl apply -f 8k1k.yaml # run all variants +# srtctl apply -f 8k1k.yaml:zip_override_lowlat # full lowlat sweep +# srtctl apply -f 8k1k.yaml:zip_override_lowlat[2] # lowlat, tep2 variant only +# srtctl apply -f 8k1k.yaml:zip_override_maxtput # full max tput sweep +# srtctl dry-run -f 8k1k.yaml # preview the variants + +base: + name: "minimax-m2.5-nvfp4-b200-8k1k" + + model: + path: "minimax_m2.5_fp4" + container: "vllm/vllm-openai:v0.19.0-cu130" + precision: "fp4" + + resources: + gpu_type: "b200" + gpus_per_node: 8 + agg_nodes: 1 + agg_workers: 1 + gpus_per_agg: 1 + + frontend: + type: dynamo + enable_multiple_frontends: false + + dynamo: + install: true + top_of_tree: true # currently need ToT for vllm 0.19.0 + 
+ setup_script: vllm-container-deps.sh + + backend: + type: vllm + + aggregated_environment: + DYN_HEALTH_CHECK_ENABLED: "false" + PYTHONUNBUFFERED: "1" + + vllm_config: + aggregated: + tensor-parallel-size: 1 + gpu-memory-utilization: 0.90 + max-model-len: 9416 + max-num-batched-tokens: 16384 + kv-cache-dtype: fp8 + max-cudagraph-capture-size: 2048 + stream-interval: 20 + no-enable-prefix-caching: true + trust-remote-code: true + + benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + req_rate: "inf" + +zip_override_lowlat: + name: + - "minimax-m2.5-nvfp4-b200-8k1k-lowlat-tp1" + - "minimax-m2.5-nvfp4-b200-8k1k-lowlat-tp2" + - "minimax-m2.5-nvfp4-b200-8k1k-lowlat-tep2" + resources: + gpus_per_agg: [1, 2, 2] + backend: + vllm_config: + aggregated: + tensor-parallel-size: [1, 2, 2] + enable-expert-parallel: [false, false, true] + benchmark: + concurrencies: ["4x8x16x32x256x512", "4x8x16x32x64x128x256x512", "128x256x512"] + +zip_override_maxtput: + name: + - "minimax-m2.5-nvfp4-b200-8k1k-maxtput-tp4" + - "minimax-m2.5-nvfp4-b200-8k1k-maxtput-tp8" + resources: + gpus_per_agg: [4, 8] + backend: + vllm_config: + aggregated: + tensor-parallel-size: [4, 8] + enable-expert-parallel: false + benchmark: + concurrencies: ["4x8x16x32x64x128x256x512", "4"] From f61dbba884147e3a20536c1369f9c8dcc2372cfb Mon Sep 17 00:00:00 2001 From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 17 Apr 2026 16:16:29 -0700 Subject: [PATCH 05/14] Add lm-eval benchmark runner for InferenceX evals (#12) * Add lm-eval benchmark runner for InferenceX evals Adds support for running lm-eval accuracy evaluations as a post-benchmark step, leveraging the InferenceX benchmark_lib.sh harness. 
--- docs/accuracy.md | 84 +++- src/srtctl/benchmarks/__init__.py | 3 +- src/srtctl/benchmarks/lm_eval.py | 58 +++ .../benchmarks/scripts/lm-eval/bench.sh | 77 ++++ src/srtctl/cli/do_sweep.py | 136 +++++- src/srtctl/core/runtime.py | 8 + tests/test_benchmarks.py | 418 ++++++++++++++++++ tests/test_configs.py | 110 +++++ 8 files changed, 890 insertions(+), 4 deletions(-) create mode 100644 src/srtctl/benchmarks/lm_eval.py create mode 100755 src/srtctl/benchmarks/scripts/lm-eval/bench.sh diff --git a/docs/accuracy.md b/docs/accuracy.md index f5588c9f..98b69b46 100644 --- a/docs/accuracy.md +++ b/docs/accuracy.md @@ -1,6 +1,6 @@ # Accuracy Benchmarks -In srt-slurm, users can run different accuracy benchmarks by setting the benchmark section in the config yaml file. Supported benchmarks include `mmlu`, `gpqa` and `longbenchv2`. +In srt-slurm, users can run different accuracy benchmarks by setting the benchmark section in the config yaml file. Supported benchmarks include `mmlu`, `gpqa`, `longbenchv2`, and `lm-eval`. ## Table of Contents @@ -14,6 +14,7 @@ In srt-slurm, users can run different accuracy benchmarks by setting the benchma - [Example: Quick Validation](#example-quick-validation) - [Output](#output) - [Important Notes](#important-notes) +- [lm-eval (InferenceX)](#lm-eval-inferencex) --- @@ -191,3 +192,84 @@ The output includes per-category scores and aggregate metrics: 4. **Categories**: Running specific categories is useful for targeted validation (e.g., just testing summarization capabilities) +## lm-eval (InferenceX) + +The `lm-eval` benchmark runner integrates [EleutherAI/lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) via InferenceX's `benchmark_lib.sh`. Unlike the built-in benchmarks above, this runner sources evaluation logic from an external InferenceX workspace mounted at `/infmax-workspace`. 
+ +This is used by InferenceX CI to run evals such as GSM8K and GPQA against NVIDIA multi-node disaggregated deployments on GB200, GB300, B200, B300, H100, and H200. AMD MI355X multi-node evals are handled by InferenceX's upstreamed AMD Slurm path, not by this srt-slurm runner. + +In InferenceX CI, recipes normally keep their throughput benchmark configuration. `do_sweep.py` invokes the registered `lm-eval` runner as a post-step when `RUN_EVAL=true`, or as the only benchmark-like step when `EVAL_ONLY=true`. There is no separate `infmax-eval` benchmark type. + +### How it works + +1. `RuntimeContext` mounts the host path from `INFMAX_WORKSPACE` at `/infmax-workspace` inside the Slurm container. +2. `do_sweep.py` starts infrastructure, workers, and the frontend for the normal recipe topology. +3. For `EVAL_ONLY=true`, `do_sweep.py` skips the throughput benchmark stage and runs `_run_post_eval()` directly after frontend startup. +4. `_run_post_eval()` checks that the OpenAI-compatible endpoint on port 8000 is ready: in eval-only mode it performs the full `wait_for_model()` health check for the configured prefill/decode or aggregated topology; after a throughput benchmark it only performs a quick port check. +5. `_run_post_eval()` launches the registered `lm-eval` runner on the head node and passes through InferenceX metadata such as framework, precision, sequence length, prefill/decode topology, and eval concurrency. +6. The runner script (`benchmarks/scripts/lm-eval/bench.sh`) uses `MODEL_NAME` from `do_sweep.py`, or auto-discovers the served model from `/v1/models` as a fallback. +7. The runner sources `/infmax-workspace/benchmarks/benchmark_lib.sh`, runs `run_eval --framework lm-eval`, and calls `append_lm_eval_summary`. +8. Eval artifacts are copied to `/logs/eval_results/` for InferenceX launcher-side artifact pickup. + +### EVAL_ONLY mode + +srt-slurm supports an `EVAL_ONLY` mode for CI jobs that should only validate accuracy. 
This is controlled by environment variables from the InferenceX workflow: + +| Env var | Description | +|---------|-------------| +| `EVAL_ONLY` | Set to `true` to skip the throughput benchmark stage and run eval only | +| `RUN_EVAL` | Set to `true` to run eval after the throughput benchmark completes | +| `EVAL_CONC` | Concurrent requests for lm-eval, normally set by InferenceX from the generated `eval-conc` value | +| `INFMAX_WORKSPACE` | Host path to the InferenceX checkout that should be mounted at `/infmax-workspace` | +| `MODEL_NAME` | Served model alias for OpenAI-compatible requests; set by `do_sweep.py` from `config.served_model_name` | + +When `EVAL_ONLY=true`: +- Stage 4 skips the throughput benchmark entirely. No throughput result JSON is expected from srt-slurm. +- The eval path uses the full `wait_for_model()` health check before starting lm-eval. +- `_run_post_eval()` launches the `lm-eval` runner and returns its exit code. +- Eval failure is fatal because eval is the only purpose of the job. 
+ +When `RUN_EVAL=true` (without `EVAL_ONLY`): +- The throughput benchmark runs normally. +- After the benchmark completes successfully, eval runs as a post-step. +- Eval failure is non-fatal; the benchmark job still succeeds if throughput passed. + +### Environment variables + +The following env vars are passed through to the lm-eval runner container: + +| Env var | Purpose | +|---------|---------| +| `RUN_EVAL`, `EVAL_ONLY`, `IS_MULTINODE` | Control whether eval runs and how InferenceX classifies the artifact | +| `FRAMEWORK`, `PRECISION`, `MODEL_PREFIX`, `RUNNER_TYPE`, `SPEC_DECODING` | Benchmark identity metadata for `meta_env.json` | +| `ISL`, `OSL`, `RESULT_FILENAME` | Sequence length and result-file metadata | +| `MODEL`, `MODEL_PATH`, `MODEL_NAME` | Model metadata and the served model alias used for requests | +| `MAX_MODEL_LEN`, `EVAL_MAX_MODEL_LEN` | Context-length metadata used by InferenceX eval helpers when available | +| `PREFILL_TP`, `PREFILL_EP`, `PREFILL_NUM_WORKERS`, `PREFILL_DP_ATTN` | Prefill-side topology metadata | +| `DECODE_TP`, `DECODE_EP`, `DECODE_NUM_WORKERS`, `DECODE_DP_ATTN` | Decode-side topology metadata | +| `EVAL_CONC`, `EVAL_CONCURRENT_REQUESTS` | Eval concurrency controls | + +The runner maps srt-slurm's `PREFILL_DP_ATTN` and `DECODE_DP_ATTN` names to InferenceX's `PREFILL_DP_ATTENTION` and `DECODE_DP_ATTENTION` names before calling `append_lm_eval_summary`. This is required for multi-node summary tables to preserve prefill/decode DPA state. + +### Concurrency + +Eval concurrency is ultimately read by InferenceX's `benchmark_lib.sh` from `EVAL_CONCURRENT_REQUESTS`. The runner script sets that value from `EVAL_CONC` when present, preserves an existing `EVAL_CONCURRENT_REQUESTS` otherwise, and falls back to `256` only if neither variable is set: + +```bash +export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC:-${EVAL_CONCURRENT_REQUESTS:-256}}" +``` + +The InferenceX workflow sets `EVAL_CONC` from the generated `eval-conc` value. 
For multi-node configs, InferenceX selects the `8k1k` entry with the highest max eligible concurrency for each `(model, runner, framework, precision, spec-decoding, prefill-dp-attn, decode-dp-attn)` group, then sets `eval-conc` to the upper median of that config's eligible concurrency list. If `EVAL_CONC` is not set in the environment, `do_sweep.py` falls back to the max of the recipe benchmark concurrency list. + +### Output + +Eval artifacts are written to `/logs/eval_results/` inside the container: +- `meta_env.json` - metadata used by InferenceX aggregation and summary tables +- `results*.json` - lm-eval scores per task +- `sample*.jsonl` - per-sample outputs + +These are collected by the InferenceX NVIDIA launch scripts and uploaded as workflow artifacts. In eval-only mode the InferenceX workflow expects eval artifacts, not throughput benchmark artifacts. + +### Intricacies +1. Eval concurrency floor of 16 + - One sweep config has `conc: [1]`; at that concurrency evals take more than 4 hours to complete, hence the floor. diff --git a/src/srtctl/benchmarks/__init__.py b/src/srtctl/benchmarks/__init__.py index 3a2d6449..088617a6 100644 --- a/src/srtctl/benchmarks/__init__.py +++ b/src/srtctl/benchmarks/__init__.py @@ -4,7 +4,7 @@ """Benchmark runners for srtctl.""" # Import runners to trigger registration -from srtctl.benchmarks import gpqa, gsm8k, longbenchv2, mmlu, mooncake_router, router, sa_bench, sglang_bench +from srtctl.benchmarks import gpqa, gsm8k, lm_eval, longbenchv2, mmlu, mooncake_router, router, sa_bench, sglang_bench from srtctl.benchmarks.base import ( BenchmarkRunner, get_runner, @@ -18,6 +18,7 @@ "list_benchmarks", "register_benchmark", # Runners + "lm_eval", "sa_bench", "sglang_bench", "mmlu", diff --git a/src/srtctl/benchmarks/lm_eval.py b/src/srtctl/benchmarks/lm_eval.py new file mode 100644 index 00000000..c63ec097 --- /dev/null +++ b/src/srtctl/benchmarks/lm_eval.py @@ -0,0 +1,58 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 SemiAnalysis LLC. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""lm-eval benchmark runner for InferenceX evals.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from srtctl.benchmarks.base import SCRIPTS_DIR, BenchmarkRunner, register_benchmark + +if TYPE_CHECKING: + from srtctl.core.runtime import RuntimeContext + from srtctl.core.schema import SrtConfig + + +@register_benchmark("lm-eval") +class LMEvalRunner(BenchmarkRunner): + """lm-eval accuracy evaluation using InferenceX benchmark_lib. + + Runs lm-eval via the InferenceX benchmark_lib.sh harness, + which handles task selection, result collection, and summary generation. + """ + + @property + def name(self) -> str: + return "lm-eval" + + @property + def script_path(self) -> str: + return "/srtctl-benchmarks/lm-eval/bench.sh" + + @property + def local_script_dir(self) -> str: + return str(SCRIPTS_DIR / "lm-eval") + + def validate_config(self, config: SrtConfig) -> list[str]: + # lm-eval has sensible defaults + return [] + + def build_command( + self, + config: SrtConfig, + runtime: RuntimeContext, + ) -> list[str]: + endpoint = f"http://localhost:{runtime.frontend_port}" + # Always use the container mount path, not the host path. + # INFMAX_WORKSPACE env var contains the host path (used for mount setup + # in runtime.py), but inside the container it's at /infmax-workspace. + infmax_workspace = "/infmax-workspace" + + return [ + "bash", + self.script_path, + endpoint, + infmax_workspace, + ] diff --git a/src/srtctl/benchmarks/scripts/lm-eval/bench.sh b/src/srtctl/benchmarks/scripts/lm-eval/bench.sh new file mode 100755 index 00000000..a10e4e7d --- /dev/null +++ b/src/srtctl/benchmarks/scripts/lm-eval/bench.sh @@ -0,0 +1,77 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-FileCopyrightText: Copyright (c) 2026 SemiAnalysis LLC. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# lm-eval accuracy evaluation using InferenceX benchmark_lib +# Expects: endpoint [infmax_workspace] + +set -e + +ENDPOINT=$1 +INFMAX_WORKSPACE=${2:-/infmax-workspace} + +# Extract HOST and PORT from endpoint (e.g., http://localhost:8000) +HOST=$(echo "$ENDPOINT" | sed -E 's|https?://||; s|:.*||') +PORT=$(echo "$ENDPOINT" | sed -E 's|.*:([0-9]+).*|\1|') + +echo "lm-eval Config: endpoint=${ENDPOINT}; host=${HOST}; port=${PORT}; workspace=${INFMAX_WORKSPACE}" + +# Auto-discover the served model name from /v1/models if MODEL_NAME is not set. +# This ensures we use the exact name the server recognizes, regardless of what +# $MODEL (the HuggingFace ID from the workflow) is set to. +if [[ -z "${MODEL_NAME:-}" ]]; then + DISCOVERED_MODEL=$(curl -sf "${ENDPOINT}/v1/models" 2>/dev/null \ + | python3 -c "import sys,json; d=json.load(sys.stdin); print(d['data'][0]['id'])" 2>/dev/null || true) + if [[ -n "$DISCOVERED_MODEL" ]]; then + export MODEL_NAME="$DISCOVERED_MODEL" + echo "Auto-discovered MODEL_NAME from /v1/models: ${MODEL_NAME}" + else + echo "WARNING: Could not discover model name from /v1/models, using MODEL_NAME=${MODEL_NAME:-$MODEL}" + fi +else + echo "Using MODEL_NAME from environment: ${MODEL_NAME}" +fi + +# cd to workspace so that relative paths (e.g., utils/evals/*.yaml) resolve +cd "${INFMAX_WORKSPACE}" + +# Source the InferenceX benchmark library +source "${INFMAX_WORKSPACE}/benchmarks/benchmark_lib.sh" + +# Run lm-eval via benchmark_lib +# EVAL_CONC is set by the InferenceX workflow (median of conc list). +# benchmark_lib reads concurrency from EVAL_CONCURRENT_REQUESTS env var. +export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC:-${EVAL_CONCURRENT_REQUESTS:-256}}" +echo "Running lm-eval with concurrent-requests=${EVAL_CONCURRENT_REQUESTS}..." +eval_rc=0 +run_eval --framework lm-eval --port "$PORT" || eval_rc=$? 
+ +# Derive metadata env vars that append_lm_eval_summary needs but do_sweep.py +# does not pass directly (it passes PREFILL_TP/EP/etc, not TP/EP_SIZE/CONC). +export IS_MULTINODE="${IS_MULTINODE:-true}" +export TP="${TP:-${PREFILL_TP:-1}}" +export CONC="${CONC:-${EVAL_CONC:-${EVAL_CONCURRENT_REQUESTS:-1}}}" +export EP_SIZE="${EP_SIZE:-${PREFILL_EP:-1}}" +export DP_ATTENTION="${DP_ATTENTION:-${PREFILL_DP_ATTN:-false}}" +# Remap srt-slurm's DP_ATTN names to InferenceX's DP_ATTENTION names +export PREFILL_DP_ATTENTION="${PREFILL_DP_ATTENTION:-${PREFILL_DP_ATTN:-${DP_ATTENTION:-false}}}" +export DECODE_DP_ATTENTION="${DECODE_DP_ATTENTION:-${DECODE_DP_ATTN:-${DP_ATTENTION:-false}}}" + +# Generate the lm-eval summary +echo "Generating lm-eval summary..." +append_lm_eval_summary || true + +# Copy eval artifacts to /logs/eval_results/ +mkdir -p /logs/eval_results +echo "Copying eval artifacts to /logs/eval_results/..." +cp -v meta_env.json /logs/eval_results/ 2>/dev/null || true +cp -v results*.json /logs/eval_results/ 2>/dev/null || true +cp -v sample*.jsonl /logs/eval_results/ 2>/dev/null || true + +if [[ "$eval_rc" -ne 0 ]]; then + echo "lm-eval evaluation failed with exit code ${eval_rc}" + exit "$eval_rc" +fi + +echo "lm-eval evaluation complete" diff --git a/src/srtctl/cli/do_sweep.py b/src/srtctl/cli/do_sweep.py index ff6eaa91..77b79ac5 100644 --- a/src/srtctl/cli/do_sweep.py +++ b/src/srtctl/cli/do_sweep.py @@ -18,6 +18,7 @@ import os import sys import threading +import time from dataclasses import dataclass from pathlib import Path @@ -179,6 +180,118 @@ def _print_connection_info(self) -> None: logger.info("=" * 60) logger.info("") + def _run_post_eval(self, stop_event: threading.Event) -> int: + """Run lm-eval after the main benchmark completes (or directly in eval-only mode).""" + from srtctl.benchmarks import get_runner + from srtctl.core.health import wait_for_model + + # In eval-only mode the benchmark health check was skipped, so do the + # full model-ready 
wait here. In post-benchmark mode a quick port + # check is sufficient since the server already served traffic. + if os.environ.get("EVAL_ONLY", "false").lower() == "true": + r = self.config.resources + n_prefill = 0 if r.num_agg > 0 else r.num_prefill + n_decode = r.num_agg if r.num_agg > 0 else r.num_decode + hc = self.config.health_check + logger.info("EVAL_ONLY: Waiting for server health before eval...") + if not wait_for_model( + host=self.runtime.nodes.head, + port=8000, + n_prefill=n_prefill, + n_decode=n_decode, + poll_interval=float(hc.interval_seconds), + timeout=float(hc.max_attempts * hc.interval_seconds), + report_every=60.0, + frontend_type=self.config.frontend.type, + stop_event=stop_event, + ): + logger.error("Server did not become healthy for eval") + return 1 + else: + if not wait_for_port(self.runtime.nodes.head, 8000, timeout=30): + logger.error("Server health check failed before eval - skipping") + return 1 + + try: + runner = get_runner("lm-eval") + except ValueError as e: + logger.error("lm-eval runner not available: %s", e) + return 1 + + eval_log = self.runtime.log_dir / "eval.out" + cmd = runner.build_command(self.config, self.runtime) + + logger.info("Eval command: %s", " ".join(cmd)) + logger.info("Eval log: %s", eval_log) + + # Pass through eval-related env vars. InferenceX writes multi-node + # metadata from these variables in append_lm_eval_summary(). + env_to_set = {} + for var in [ + "RUN_EVAL", + "EVAL_ONLY", + "IS_MULTINODE", + "FRAMEWORK", + "PRECISION", + "MODEL_PREFIX", + "RUNNER_TYPE", + "RESULT_FILENAME", + "SPEC_DECODING", + "ISL", + "OSL", + "MODEL", + "MODEL_PATH", + "MAX_MODEL_LEN", + "EVAL_MAX_MODEL_LEN", + "PREFILL_TP", + "PREFILL_EP", + "PREFILL_DP_ATTN", + "PREFILL_NUM_WORKERS", + "DECODE_TP", + "DECODE_EP", + "DECODE_DP_ATTN", + "DECODE_NUM_WORKERS", + ]: + val = os.environ.get(var) + if val: + env_to_set[var] = val + + # Set MODEL_NAME to the served model name so lm-eval uses the correct + # name for API requests. 
Without this, benchmark_lib.sh falls back to + # $MODEL (the HuggingFace ID) which the server doesn't recognize. + env_to_set["MODEL_NAME"] = self.config.served_model_name + logger.info("Eval MODEL_NAME: %s", env_to_set["MODEL_NAME"]) + + # Use EVAL_CONC from workflow (median chosen by InferenceX mark_eval_entries), + # falling back to max of benchmark concurrency list. + eval_conc = os.environ.get("EVAL_CONC") + if eval_conc: + env_to_set["EVAL_CONC"] = eval_conc + logger.info("Eval concurrency (from workflow): %s", eval_conc) + else: + conc_list = self.config.benchmark.get_concurrency_list() + if conc_list: + env_to_set["EVAL_CONC"] = str(max(conc_list)) + logger.info("Eval concurrency (max of %s): %s", conc_list, env_to_set["EVAL_CONC"]) + + proc = start_srun_process( + command=cmd, + nodelist=[self.runtime.nodes.head], + output=str(eval_log), + container_image=str(self.runtime.container_image), + container_mounts=self.runtime.container_mounts, + env_to_set=env_to_set, + ) + + while proc.poll() is None: + if stop_event.is_set(): + logger.info("Stop requested, terminating eval") + proc.terminate() + return 1 + time.sleep(1) + + return proc.returncode or 0 + def run(self) -> int: """Run the complete sweep.""" # Create status reporter (fire-and-forget, no-op if not configured) @@ -221,8 +334,27 @@ def run(self) -> int: self._print_connection_info() - # Stage 4: Benchmark (status reported AFTER health check passes) - exit_code = self.run_benchmark(registry, stop_event, reporter) + if os.environ.get("EVAL_ONLY", "false").lower() == "true": + reporter.report(JobStatus.BENCHMARK, JobStage.BENCHMARK, "Running eval-only evaluation") + logger.info("EVAL_ONLY=true: Skipping benchmark stage and running lm-eval evaluation...") + exit_code = self._run_post_eval(stop_event) + if exit_code != 0: + logger.error("Eval-only evaluation failed with exit code %d", exit_code) + else: + logger.info("Eval-only evaluation completed successfully") + else: + # Stage 4: Benchmark (status 
reported AFTER health check passes) + exit_code = self.run_benchmark(registry, stop_event, reporter) + + # Stage 5: Post-benchmark eval (optional, non-fatal) + if os.environ.get("RUN_EVAL", "false").lower() == "true" and exit_code == 0: + reporter.report(JobStatus.BENCHMARK, JobStage.BENCHMARK, "Running post-benchmark evaluation") + logger.info("RUN_EVAL=true: Running post-benchmark lm-eval evaluation...") + eval_exit = self._run_post_eval(stop_event) + if eval_exit != 0: + logger.warning("Eval failed with exit code %d (benchmark result is still valid)", eval_exit) + else: + logger.info("Post-benchmark eval completed successfully") except Exception as e: logger.exception("Error during sweep: %s", e) diff --git a/src/srtctl/core/runtime.py b/src/srtctl/core/runtime.py index 3e68bdd5..31195ed3 100644 --- a/src/srtctl/core/runtime.py +++ b/src/srtctl/core/runtime.py @@ -231,6 +231,14 @@ def from_config( host_path, container_path = mount_spec.split(":", 1) container_mounts[Path(host_path).resolve()] = Path(container_path) + # Mount InferenceX workspace if available (for lm-eval support). + # Skip exists() check: the orchestrator runs on the SLURM head node + # where the GH Actions workspace path may not be directly accessible, + # but it IS accessible from compute nodes via shared filesystem. 
+ infmax_ws = os.environ.get("INFMAX_WORKSPACE") + if infmax_ws: + container_mounts[Path(infmax_ws)] = Path("/infmax-workspace") + # Add FormattablePath mounts from config.container_mounts # These need to be expanded with the runtime context, so we create a # temporary context first and then update diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py index 261020c7..c15759b2 100644 --- a/tests/test_benchmarks.py +++ b/tests/test_benchmarks.py @@ -193,6 +193,62 @@ def test_build_command_includes_tokenizer_path(self): assert cmd[7] == "/model" # tokenizer path +class TestLMEvalRunner: + """Test LM-Eval runner.""" + + def test_registry_includes_lm_eval(self): + """lm-eval is in the benchmark registry.""" + assert "lm-eval" in list_benchmarks() + + def test_get_runner(self): + """Can get lm-eval runner.""" + runner = get_runner("lm-eval") + assert runner.name == "lm-eval" + + def test_script_path(self): + """Script path points to lm-eval bench.sh.""" + runner = get_runner("lm-eval") + assert "lm-eval/bench.sh" in runner.script_path + + def test_local_script_dir(self): + """Local script dir points to lm-eval scripts.""" + runner = get_runner("lm-eval") + assert runner.local_script_dir.endswith("lm-eval") + + def test_validate_config_always_valid(self): + """lm-eval accepts any config.""" + from srtctl.benchmarks.lm_eval import LMEvalRunner + from srtctl.core.schema import BenchmarkConfig, ModelConfig, ResourceConfig, SrtConfig + + runner = LMEvalRunner() + config = SrtConfig( + name="test", + model=ModelConfig(path="/model", container="/image", precision="fp4"), + resources=ResourceConfig(gpu_type="h100"), + benchmark=BenchmarkConfig(type="sa-bench"), + ) + assert runner.validate_config(config) == [] + + def test_build_command(self): + """build_command returns correct bash command.""" + from unittest.mock import MagicMock + + from srtctl.benchmarks.lm_eval import LMEvalRunner + + runner = LMEvalRunner() + runtime = MagicMock() + runtime.frontend_port = 8000 
+ + config = MagicMock() + cmd = runner.build_command(config, runtime) + assert cmd == [ + "bash", + "/srtctl-benchmarks/lm-eval/bench.sh", + "http://localhost:8000", + "/infmax-workspace", + ] + + class TestScriptsExist: """Test that benchmark scripts exist.""" @@ -209,3 +265,365 @@ def test_mmlu_script_exists(self): """MMLU script exists.""" script = SCRIPTS_DIR / "mmlu" / "bench.sh" assert script.exists() + + +class TestRunPostEval: + """Test SweepOrchestrator._run_post_eval method.""" + + @staticmethod + def _make_orchestrator(): + """Create a SweepOrchestrator with mocked config/runtime.""" + from pathlib import Path + + from srtctl.cli.do_sweep import SweepOrchestrator + from srtctl.core.runtime import Nodes, RuntimeContext + from srtctl.core.schema import ( + BenchmarkConfig, + FrontendConfig, + HealthCheckConfig, + ModelConfig, + ResourceConfig, + SrtConfig, + ) + + config = SrtConfig( + name="test", + model=ModelConfig(path="/model/test-model", container="/image", precision="fp4"), + resources=ResourceConfig( + gpu_type="h100", + gpus_per_node=8, + prefill_nodes=1, + decode_nodes=2, + prefill_workers=1, + decode_workers=2, + ), + benchmark=BenchmarkConfig(type="sa-bench", isl=1024, osl=1024, concurrencies="128x256x512"), + health_check=HealthCheckConfig(max_attempts=3, interval_seconds=1), + frontend=FrontendConfig(type="dynamo"), + ) + runtime = RuntimeContext( + job_id="12345", + run_name="test-run", + nodes=Nodes(head="node0", bench="node0", infra="node0", worker=("node0", "node1", "node2")), + head_node_ip="10.0.0.1", + infra_node_ip="10.0.0.1", + log_dir=Path("/tmp/logs"), + model_path=Path("/model/test-model"), + container_image=Path("/path/to/container.sqsh"), + gpus_per_node=8, + network_interface=None, + container_mounts={}, + environment={}, + ) + return SweepOrchestrator(config=config, runtime=runtime) + + def test_post_benchmark_port_check_fails(self): + """Returns 1 when port check fails in post-benchmark mode.""" + import os + import 
threading + from unittest.mock import patch + + orch = self._make_orchestrator() + stop = threading.Event() + with patch.dict(os.environ, {"EVAL_ONLY": "false"}, clear=False): + with patch("srtctl.cli.do_sweep.wait_for_port", return_value=False): + result = orch._run_post_eval(stop) + assert result == 1 + + def test_eval_only_health_check_fails(self): + """Returns 1 when health check fails in eval-only mode.""" + import os + import threading + from unittest.mock import patch + + orch = self._make_orchestrator() + stop = threading.Event() + with patch.dict(os.environ, {"EVAL_ONLY": "true"}, clear=False): + with patch("srtctl.core.health.wait_for_model", return_value=False): + result = orch._run_post_eval(stop) + assert result == 1 + + def test_runner_not_available(self): + """Returns 1 when lm-eval runner is not registered.""" + import os + import threading + from unittest.mock import patch + + orch = self._make_orchestrator() + stop = threading.Event() + with patch.dict(os.environ, {"EVAL_ONLY": "false"}, clear=False): + with patch("srtctl.cli.do_sweep.wait_for_port", return_value=True): + with patch("srtctl.benchmarks.get_runner", side_effect=ValueError("not found")): + result = orch._run_post_eval(stop) + assert result == 1 + + def test_successful_eval(self): + """Returns 0 when eval completes successfully.""" + import os + import threading + from unittest.mock import MagicMock, patch + + orch = self._make_orchestrator() + stop = threading.Event() + + mock_proc = MagicMock() + mock_proc.poll.side_effect = [None, 0] + mock_proc.returncode = 0 + + with patch.dict(os.environ, {"EVAL_ONLY": "false"}, clear=False): + with patch("srtctl.cli.do_sweep.wait_for_port", return_value=True): + with patch("srtctl.cli.do_sweep.start_srun_process", return_value=mock_proc): + result = orch._run_post_eval(stop) + assert result == 0 + + def test_eval_only_successful(self): + """Returns 0 in eval-only mode when health check and eval succeed.""" + import os + import threading + from 
unittest.mock import MagicMock, patch + + orch = self._make_orchestrator() + stop = threading.Event() + + mock_proc = MagicMock() + mock_proc.poll.side_effect = [None, 0] + mock_proc.returncode = 0 + + with patch.dict(os.environ, {"EVAL_ONLY": "true"}, clear=False): + with patch("srtctl.core.health.wait_for_model", return_value=True): + with patch("srtctl.cli.do_sweep.start_srun_process", return_value=mock_proc): + result = orch._run_post_eval(stop) + assert result == 0 + + def test_env_var_passthrough(self): + """Eval env vars are passed through to srun.""" + import os + import threading + from unittest.mock import MagicMock, patch + + orch = self._make_orchestrator() + stop = threading.Event() + + mock_proc = MagicMock() + mock_proc.poll.return_value = 0 + mock_proc.returncode = 0 + + env_vars = { + "EVAL_ONLY": "false", + "RUN_EVAL": "true", + "FRAMEWORK": "sglang", + "PRECISION": "fp4", + "MODEL": "test-model", + } + + captured_kwargs = {} + + def capture_srun(**kwargs): + captured_kwargs.update(kwargs) + return mock_proc + + with patch.dict(os.environ, env_vars, clear=False): + with patch("srtctl.cli.do_sweep.wait_for_port", return_value=True): + with patch("srtctl.cli.do_sweep.start_srun_process", side_effect=capture_srun): + orch._run_post_eval(stop) + + env_to_set = captured_kwargs["env_to_set"] + assert env_to_set["RUN_EVAL"] == "true" + assert env_to_set["FRAMEWORK"] == "sglang" + assert env_to_set["PRECISION"] == "fp4" + assert env_to_set["MODEL"] == "test-model" + assert env_to_set["MODEL_NAME"] == "test-model" + + def test_eval_conc_from_env(self): + """EVAL_CONC from env takes priority over benchmark concurrencies.""" + import os + import threading + from unittest.mock import MagicMock, patch + + orch = self._make_orchestrator() + stop = threading.Event() + + mock_proc = MagicMock() + mock_proc.poll.return_value = 0 + mock_proc.returncode = 0 + + captured_kwargs = {} + + def capture_srun(**kwargs): + captured_kwargs.update(kwargs) + return mock_proc + 
+ with patch.dict(os.environ, {"EVAL_ONLY": "false", "EVAL_CONC": "64"}, clear=False): + with patch("srtctl.cli.do_sweep.wait_for_port", return_value=True): + with patch("srtctl.cli.do_sweep.start_srun_process", side_effect=capture_srun): + orch._run_post_eval(stop) + + assert captured_kwargs["env_to_set"]["EVAL_CONC"] == "64" + + def test_eval_conc_fallback_to_max_concurrency(self): + """EVAL_CONC falls back to max of benchmark concurrencies.""" + import os + import threading + from unittest.mock import MagicMock, patch + + orch = self._make_orchestrator() + stop = threading.Event() + + mock_proc = MagicMock() + mock_proc.poll.return_value = 0 + mock_proc.returncode = 0 + + captured_kwargs = {} + + def capture_srun(**kwargs): + captured_kwargs.update(kwargs) + return mock_proc + + env = {"EVAL_ONLY": "false"} + # Remove EVAL_CONC if present + with patch.dict(os.environ, env, clear=False): + os.environ.pop("EVAL_CONC", None) + with patch("srtctl.cli.do_sweep.wait_for_port", return_value=True): + with patch("srtctl.cli.do_sweep.start_srun_process", side_effect=capture_srun): + orch._run_post_eval(stop) + + # concurrencies="128x256x512", max is 512 + assert captured_kwargs["env_to_set"]["EVAL_CONC"] == "512" + + def test_stop_event_terminates_eval(self): + """Stop event terminates the eval process.""" + import os + import threading + from unittest.mock import MagicMock, patch + + orch = self._make_orchestrator() + stop = threading.Event() + stop.set() + + mock_proc = MagicMock() + mock_proc.poll.return_value = None + + with patch.dict(os.environ, {"EVAL_ONLY": "false"}, clear=False): + with patch("srtctl.cli.do_sweep.wait_for_port", return_value=True): + with patch("srtctl.cli.do_sweep.start_srun_process", return_value=mock_proc): + result = orch._run_post_eval(stop) + + assert result == 1 + mock_proc.terminate.assert_called_once() + + +class TestSweepRunEvalIntegration: + """Test eval-related branches in SweepOrchestrator.run().""" + + @staticmethod + def 
_make_orchestrator(): + return TestRunPostEval._make_orchestrator() + + def test_run_eval_only_mode(self): + """EVAL_ONLY=true skips benchmark and runs _run_post_eval.""" + import os + from unittest.mock import MagicMock, patch + + orch = self._make_orchestrator() + + with patch.dict(os.environ, {"EVAL_ONLY": "true"}, clear=False): + with patch.object(orch, "start_head_infrastructure") as mock_head: + mock_head.return_value = MagicMock() + with patch.object(orch, "start_all_workers", return_value={}): + with patch.object(orch, "start_frontend", return_value=[]): + with patch.object(orch, "_run_post_eval", return_value=0) as mock_eval: + with patch.object(orch, "run_benchmark") as mock_bench: + with patch.object(orch, "run_postprocess"): + with patch("srtctl.cli.do_sweep.StatusReporter") as mock_reporter_cls: + mock_reporter_cls.from_config.return_value = MagicMock() + exit_code = orch.run() + + mock_eval.assert_called_once() + mock_bench.assert_not_called() + assert exit_code == 0 + + def test_run_with_post_benchmark_eval(self): + """RUN_EVAL=true runs benchmark then _run_post_eval.""" + import os + from unittest.mock import MagicMock, patch + + orch = self._make_orchestrator() + + with patch.dict(os.environ, {"EVAL_ONLY": "false", "RUN_EVAL": "true"}, clear=False): + with patch.object(orch, "start_head_infrastructure") as mock_head: + mock_head.return_value = MagicMock() + with patch.object(orch, "start_all_workers", return_value={}): + with patch.object(orch, "start_frontend", return_value=[]): + with patch.object(orch, "run_benchmark", return_value=0) as mock_bench: + with patch.object(orch, "_run_post_eval", return_value=0) as mock_eval: + with patch.object(orch, "run_postprocess"): + with patch("srtctl.cli.do_sweep.StatusReporter") as mock_reporter_cls: + mock_reporter_cls.from_config.return_value = MagicMock() + exit_code = orch.run() + + mock_bench.assert_called_once() + mock_eval.assert_called_once() + assert exit_code == 0 + + def 
test_run_eval_only_failure(self): + """EVAL_ONLY=true with eval failure returns non-zero exit code.""" + import os + from unittest.mock import MagicMock, patch + + orch = self._make_orchestrator() + + with patch.dict(os.environ, {"EVAL_ONLY": "true"}, clear=False): + with patch.object(orch, "start_head_infrastructure") as mock_head: + mock_head.return_value = MagicMock() + with patch.object(orch, "start_all_workers", return_value={}): + with patch.object(orch, "start_frontend", return_value=[]): + with patch.object(orch, "_run_post_eval", return_value=1): + with patch.object(orch, "run_postprocess"): + with patch("srtctl.cli.do_sweep.StatusReporter") as mock_reporter_cls: + mock_reporter_cls.from_config.return_value = MagicMock() + exit_code = orch.run() + + assert exit_code == 1 + + def test_run_post_benchmark_eval_failure_nonfatal(self): + """RUN_EVAL=true with eval failure still returns benchmark exit code 0.""" + import os + from unittest.mock import MagicMock, patch + + orch = self._make_orchestrator() + + with patch.dict(os.environ, {"EVAL_ONLY": "false", "RUN_EVAL": "true"}, clear=False): + with patch.object(orch, "start_head_infrastructure") as mock_head: + mock_head.return_value = MagicMock() + with patch.object(orch, "start_all_workers", return_value={}): + with patch.object(orch, "start_frontend", return_value=[]): + with patch.object(orch, "run_benchmark", return_value=0): + with patch.object(orch, "_run_post_eval", return_value=1): + with patch.object(orch, "run_postprocess"): + with patch("srtctl.cli.do_sweep.StatusReporter") as mock_reporter_cls: + mock_reporter_cls.from_config.return_value = MagicMock() + exit_code = orch.run() + + assert exit_code == 0 + + def test_run_eval_skipped_when_benchmark_fails(self): + """RUN_EVAL=true but benchmark fails: eval is skipped.""" + import os + from unittest.mock import MagicMock, patch + + orch = self._make_orchestrator() + + with patch.dict(os.environ, {"EVAL_ONLY": "false", "RUN_EVAL": "true"}, clear=False): 
+ with patch.object(orch, "start_head_infrastructure") as mock_head: + mock_head.return_value = MagicMock() + with patch.object(orch, "start_all_workers", return_value={}): + with patch.object(orch, "start_frontend", return_value=[]): + with patch.object(orch, "run_benchmark", return_value=1): + with patch.object(orch, "_run_post_eval") as mock_eval: + with patch.object(orch, "run_postprocess"): + with patch("srtctl.cli.do_sweep.StatusReporter") as mock_reporter_cls: + mock_reporter_cls.from_config.return_value = MagicMock() + exit_code = orch.run() + + mock_eval.assert_not_called() + assert exit_code == 1 diff --git a/tests/test_configs.py b/tests/test_configs.py index 86d79cdb..0b4138d5 100644 --- a/tests/test_configs.py +++ b/tests/test_configs.py @@ -1382,3 +1382,113 @@ def test_agg_mode_no_disaggregation_flag(self): assert "--disaggregation-mode" not in cmd assert "--is-prefill-worker" not in cmd assert "--is-decode-worker" not in cmd + + +class TestInfmaxWorkspaceMount: + """Test that INFMAX_WORKSPACE env var creates a container mount.""" + + def test_infmax_workspace_mount_added(self, tmp_path): + """RuntimeContext includes /infmax-workspace mount when env var is set.""" + import os + import subprocess + from pathlib import Path + from unittest.mock import MagicMock, patch + + from srtctl.core.runtime import RuntimeContext + from srtctl.core.schema import ModelConfig, ResourceConfig, SrtConfig + + model_path = tmp_path / "model" + model_path.mkdir() + container_path = tmp_path / "container.sqsh" + container_path.touch() + + slurm_env = { + "SLURM_JOB_ID": "12345", + "SLURM_JOBID": "12345", + "SLURM_NODELIST": "gpu-[01-02]", + "SLURM_JOB_NUM_NODES": "2", + "SRTCTL_SOURCE_DIR": str(Path(__file__).parent.parent), + "INFMAX_WORKSPACE": "/actions/runner/workspace", + } + + def mock_scontrol(cmd, **kwargs): + if cmd[0] == "scontrol" and "hostnames" in cmd: + result = MagicMock() + result.stdout = "gpu-01\ngpu-02" + result.returncode = 0 + return result + raise 
subprocess.CalledProcessError(1, cmd) + + with patch.dict(os.environ, slurm_env): + with patch("subprocess.run", mock_scontrol): + with patch("srtctl.core.slurm.get_hostname_ip", return_value="10.0.0.1"): + config = SrtConfig( + name="test", + model=ModelConfig( + path=str(model_path), + container=str(container_path), + precision="fp8", + ), + resources=ResourceConfig( + gpu_type="h100", + gpus_per_node=8, + prefill_nodes=1, + decode_nodes=1, + ), + ) + runtime = RuntimeContext.from_config(config, job_id="12345") + + assert Path("/infmax-workspace") in runtime.container_mounts.values() + + def test_infmax_workspace_mount_not_added_without_env(self, tmp_path): + """RuntimeContext does not include /infmax-workspace without env var.""" + import os + import subprocess + from pathlib import Path + from unittest.mock import MagicMock, patch + + from srtctl.core.runtime import RuntimeContext + from srtctl.core.schema import ModelConfig, ResourceConfig, SrtConfig + + model_path = tmp_path / "model" + model_path.mkdir() + container_path = tmp_path / "container.sqsh" + container_path.touch() + + slurm_env = { + "SLURM_JOB_ID": "12345", + "SLURM_JOBID": "12345", + "SLURM_NODELIST": "gpu-[01-02]", + "SLURM_JOB_NUM_NODES": "2", + "SRTCTL_SOURCE_DIR": str(Path(__file__).parent.parent), + } + + def mock_scontrol(cmd, **kwargs): + if cmd[0] == "scontrol" and "hostnames" in cmd: + result = MagicMock() + result.stdout = "gpu-01\ngpu-02" + result.returncode = 0 + return result + raise subprocess.CalledProcessError(1, cmd) + + with patch.dict(os.environ, slurm_env): + os.environ.pop("INFMAX_WORKSPACE", None) + with patch("subprocess.run", mock_scontrol): + with patch("srtctl.core.slurm.get_hostname_ip", return_value="10.0.0.1"): + config = SrtConfig( + name="test", + model=ModelConfig( + path=str(model_path), + container=str(container_path), + precision="fp8", + ), + resources=ResourceConfig( + gpu_type="h100", + gpus_per_node=8, + prefill_nodes=1, + decode_nodes=1, + ), + ) + runtime 
= RuntimeContext.from_config(config, job_id="12345") + + assert Path("/infmax-workspace") not in runtime.container_mounts.values() From 10f4ac9ca2ee3b882b126f3006ab082c9fd623a2 Mon Sep 17 00:00:00 2001 From: Richard Huo Date: Mon, 20 Apr 2026 11:55:22 -0700 Subject: [PATCH 06/14] fix: add glm5 dynamo trtllm benchmark support to sa submission branch (#47) * fix tokenizer for glm5 (#20) fix * add nvidia pre-release url (#22) --- src/srtctl/benchmarks/sa_bench.py | 4 + .../scripts/sa-bench/backend_request_func.py | 128 +++++++++++++++++- .../benchmarks/scripts/sa-bench/bench.sh | 22 ++- .../scripts/sa-bench/benchmark_serving.py | 9 ++ src/srtctl/core/schema.py | 8 +- 5 files changed, 162 insertions(+), 9 deletions(-) diff --git a/src/srtctl/benchmarks/sa_bench.py b/src/srtctl/benchmarks/sa_bench.py index 9adc6678..5f220393 100644 --- a/src/srtctl/benchmarks/sa_bench.py +++ b/src/srtctl/benchmarks/sa_bench.py @@ -97,5 +97,9 @@ def build_command( str(prefill_gpus), str(decode_gpus), str(b.random_range_ratio) if b.random_range_ratio is not None else "0.8", + str(b.num_prompts_mult) if b.num_prompts_mult is not None else "10", + str(b.num_warmup_mult) if b.num_warmup_mult is not None else "2", + b.custom_tokenizer or "", + str(b.use_chat_template).lower(), ] return cmd diff --git a/src/srtctl/benchmarks/scripts/sa-bench/backend_request_func.py b/src/srtctl/benchmarks/scripts/sa-bench/backend_request_func.py index dd2cac44..87f3f9ef 100644 --- a/src/srtctl/benchmarks/scripts/sa-bench/backend_request_func.py +++ b/src/srtctl/benchmarks/scripts/sa-bench/backend_request_func.py @@ -511,10 +511,106 @@ def get_model(pretrained_model_name_or_path: str) -> str: return pretrained_model_name_or_path +def _resolve_tokenizer_file(model_name_or_path): + """Resolve tokenizer.json from a local directory or HF hub cache.""" + from pathlib import Path + + local_path = Path(model_name_or_path) / "tokenizer.json" + if local_path.is_file(): + return str(local_path) + try: + from 
huggingface_hub import hf_hub_download + + return hf_hub_download(model_name_or_path, "tokenizer.json", local_files_only=True) + except Exception: + return None + + +def _fix_v5_tokenizer_components(tokenizer, model_name_or_path): + """Fix pre_tokenizer/decoder when transformers v5 LlamaTokenizerFast overwrites them. + + In transformers v5, LlamaTokenizerFast.__init__ rebuilds the pre_tokenizer + and decoder from scratch, discarding the originals from tokenizer.json. + This breaks models like DeepSeek-R1 that declare LlamaTokenizerFast but + actually use a ByteLevel pre_tokenizer. + + Ported from sglang/python/sglang/srt/utils/hf_transformers_utils.py. + """ + backend = getattr(tokenizer, "_tokenizer", None) + if backend is None: + return + + try: + from tokenizers import Tokenizer as RawTokenizer + + tok_file = _resolve_tokenizer_file(model_name_or_path) + if tok_file is None: + return + raw = RawTokenizer.from_file(tok_file) + except Exception: + return + + raw_pre = type(raw.pre_tokenizer).__name__ if raw.pre_tokenizer else None + loaded_pre = type(backend.pre_tokenizer).__name__ if backend.pre_tokenizer else None + + if raw_pre and loaded_pre and raw_pre != loaded_pre: + print( + f"[sa-bench] Fixing v5 tokenizer component mismatch for {model_name_or_path}: " + f"pre_tokenizer {loaded_pre} -> {raw_pre}, " + f"decoder {type(backend.decoder).__name__ if backend.decoder else None} " + f"-> {type(raw.decoder).__name__ if raw.decoder else None}", + flush=True, + ) + backend.pre_tokenizer = raw.pre_tokenizer + backend.decoder = raw.decoder + + +def _load_glm_moe_dsa_tokenizer(pretrained_model_name_or_path: str) -> "PreTrainedTokenizerFast": + """Load GLM-Moe-Dsa / GLM-5 tokenizer directly from tokenizer.json. + + Works around incompatibilities when the checkpoint was saved with + transformers 5.x (TokenizersBackend / list-style extra_special_tokens). 
+ """ + import json + from pathlib import Path + + from tokenizers import Tokenizer as RustTokenizer + from transformers import PreTrainedTokenizerFast + + _SAFE_CONFIG_KEYS = ( + "pad_token", "pad_token_id", "eos_token", "eos_token_id", + "bos_token", "bos_token_id", "unk_token", "unk_token_id", + "model_max_length", "padding_side", "truncation_side", + ) + + path = Path(pretrained_model_name_or_path) + tokenizer_json = path / "tokenizer.json" + if not tokenizer_json.exists(): + raise FileNotFoundError( + f"Expected tokenizer.json at {tokenizer_json}. " + "GlmMoeDsaTokenizer loads from tokenizer.json only." + ) + + rust_tok = RustTokenizer.from_file(str(tokenizer_json)) + init_kwargs = {} + config_path = path / "tokenizer_config.json" + if config_path.exists(): + with open(config_path, encoding="utf-8") as f: + config = json.load(f) + for key in _SAFE_CONFIG_KEYS: + if key in config: + init_kwargs[key] = config[key] + if "extra_special_tokens" in config: + init_kwargs["additional_special_tokens"] = config["extra_special_tokens"] + + return PreTrainedTokenizerFast(tokenizer_object=rust_tok, **init_kwargs) + + def get_tokenizer( pretrained_model_name_or_path: str, tokenizer_mode: str = "auto", trust_remote_code: bool = False, + custom_tokenizer: str | None = None, **kwargs, ) -> PreTrainedTokenizer | PreTrainedTokenizerFast: if pretrained_model_name_or_path is not None and not os.path.exists(pretrained_model_name_or_path): @@ -533,12 +629,32 @@ def get_tokenizer( "to use mistral tokenizer mode." 
) from e return MistralTokenizer.from_pretrained(str(pretrained_model_name_or_path)) - else: - return AutoTokenizer.from_pretrained( - pretrained_model_name_or_path, - trust_remote_code=trust_remote_code, - **kwargs, - ) + + if custom_tokenizer: + if custom_tokenizer == "glm_moe_dsa": + return _load_glm_moe_dsa_tokenizer(pretrained_model_name_or_path) + from importlib import import_module + try: + module_path, class_name = custom_tokenizer.rsplit('.', 1) + module = import_module(module_path) + tokenizer_class = getattr(module, class_name) + return tokenizer_class.from_pretrained( + pretrained_model_name_or_path, + trust_remote_code=trust_remote_code, + **kwargs, + ) + except (ValueError, ImportError, AttributeError) as e: + raise ValueError( + f"Failed to load custom_tokenizer '{custom_tokenizer}'. " + "Expected 'glm_moe_dsa' or 'module.path.ClassName'.") from e + + tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path, + trust_remote_code=trust_remote_code, + **kwargs, + ) + _fix_v5_tokenizer_components(tokenizer, pretrained_model_name_or_path) + return tokenizer ASYNC_REQUEST_FUNCS = { diff --git a/src/srtctl/benchmarks/scripts/sa-bench/bench.sh b/src/srtctl/benchmarks/scripts/sa-bench/bench.sh index ed907308..acddf754 100644 --- a/src/srtctl/benchmarks/scripts/sa-bench/bench.sh +++ b/src/srtctl/benchmarks/scripts/sa-bench/bench.sh @@ -60,6 +60,22 @@ TOTAL_GPUS=${9:-0} PREFILL_GPUS=${10:-0} DECODE_GPUS=${11:-0} RANDOM_RANGE_RATIO=${12:-0.8} +NUM_PROMPTS_MULT=${13:-10} +NUM_WARMUP_MULT=${14:-2} +CUSTOM_TOKENIZER=${15:-} +USE_CHAT_TEMPLATE=${16:-true} + +# Build optional custom tokenizer args +CUSTOM_TOKENIZER_ARGS=() +if [ -n "$CUSTOM_TOKENIZER" ]; then + CUSTOM_TOKENIZER_ARGS=(--custom-tokenizer "$CUSTOM_TOKENIZER") +fi + +# Build optional chat template args +CHAT_TEMPLATE_ARGS=() +if [ "$USE_CHAT_TEMPLATE" = "true" ]; then + CHAT_TEMPLATE_ARGS=(--use-chat-template) +fi # Parse endpoint into host:port HOST=$(echo "$ENDPOINT" | sed 
's|http://||' | cut -d: -f1) @@ -119,7 +135,8 @@ for concurrency in "${CONCURRENCY_LIST[@]}"; do --request-rate 250 \ --percentile-metrics ttft,tpot,itl,e2el \ --max-concurrency "$concurrency" \ - --trust-remote-code + --trust-remote-code \ + "${CUSTOM_TOKENIZER_ARGS[@]}" num_prompts=$((concurrency * 10)) @@ -149,7 +166,8 @@ for concurrency in "${CONCURRENCY_LIST[@]}"; do --percentile-metrics ttft,tpot,itl,e2el \ --max-concurrency "$concurrency" \ --trust-remote-code \ - --use-chat-template \ + "${CHAT_TEMPLATE_ARGS[@]}" \ + "${CUSTOM_TOKENIZER_ARGS[@]}" \ --save-result --result-dir "$result_dir" --result-filename "$result_filename" set +x diff --git a/src/srtctl/benchmarks/scripts/sa-bench/benchmark_serving.py b/src/srtctl/benchmarks/scripts/sa-bench/benchmark_serving.py index 4363ef6e..a5ea6490 100644 --- a/src/srtctl/benchmarks/scripts/sa-bench/benchmark_serving.py +++ b/src/srtctl/benchmarks/scripts/sa-bench/benchmark_serving.py @@ -837,6 +837,7 @@ def main(args: argparse.Namespace): tokenizer_id, tokenizer_mode=tokenizer_mode, trust_remote_code=args.trust_remote_code, + custom_tokenizer=args.custom_tokenizer, ) if args.dataset is not None: @@ -1279,6 +1280,14 @@ def main(args: argparse.Namespace): '"custom" will use --tokenizer to select the preregistered tokenizer.', ) + parser.add_argument( + "--custom-tokenizer", + type=str, + default=None, + help="Custom tokenizer to use (e.g., 'glm_moe_dsa' or 'module.path.ClassName'). 
" + "When set, overrides the default tokenizer loading.", + ) + parser.add_argument( "--served-model-name", type=str, diff --git a/src/srtctl/core/schema.py b/src/srtctl/core/schema.py index 085db6c8..c535be39 100644 --- a/src/srtctl/core/schema.py +++ b/src/srtctl/core/schema.py @@ -539,6 +539,12 @@ class BenchmarkConfig: ttft_threshold_ms: int | None = None # Goodput TTFT threshold in ms (default: 2000) itl_threshold_ms: int | None = None # Goodput ITL threshold in ms (default: 25) random_range_ratio: float | None = None # Random input/output length range ratio (default: 0.8) + num_prompts_mult: int | None = None # Multiplier for num_prompts = concurrency * mult (default: 10) + num_warmup_mult: int | None = None # Multiplier for warmup prompts = concurrency * mult (default: 2) + # Trace replay benchmark fields (uses aiperf with mooncake_trace dataset type) + trace_file: str | None = None # Path to trace JSONL file (container path, e.g., /traces/dataset.jsonl) + custom_tokenizer: str | None = None # Custom tokenizer class (e.g., "module.path.ClassName") + use_chat_template: bool = True # Pass --use-chat-template to benchmark (default: true) def get_concurrency_list(self) -> list[int]: if self.concurrencies is None: @@ -711,7 +717,7 @@ def get_install_commands(self) -> str: if self.version is not None: return ( f"echo 'Installing dynamo {self.version}...' 
&& " - f"pip install --break-system-packages --quiet ai-dynamo-runtime=={self.version} ai-dynamo=={self.version} && " + f"pip install --break-system-packages --quiet --extra-index-url https://pypi.nvidia.com ai-dynamo-runtime=={self.version} ai-dynamo=={self.version} && " f"echo 'Dynamo {self.version} installed'" ) From a10acd3f097a24498f0fb18523544f87dc35bed5 Mon Sep 17 00:00:00 2001 From: Yeswanth koti Date: Tue, 21 Apr 2026 20:23:27 -0400 Subject: [PATCH 07/14] Add GLM5 disaggregated recipes for SA submission (#48) Add 66 GLM5 NVFP4 disaggregated recipe configs for GB200 and GB300 on the sa-submission branch; standardize model path and container values across the recipe set for consistency. --- ...ctx1dep4_gen1dep16_batch32_eplb0_mtp2.yaml | 135 ++++++++++++ ...ctx1dep4_gen1dep16_batch64_eplb0_mtp1.yaml | 139 +++++++++++++ ..._gen1dep32_batch16_allconc_eplb0_mtp3.yaml | 133 ++++++++++++ .../ctx1dep4_gen4tep8_batch16_eplb0_mtp2.yaml | 134 ++++++++++++ ...4_gen4tep8_batch32_allconc_eplb0_mtp3.yaml | 136 ++++++++++++ .../ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml | 131 ++++++++++++ ...2dep4_gen1dep16_batch256_eplb256_mtp1.yaml | 167 +++++++++++++++ ...3dep4_gen1dep32_batch128_eplb288_mtp1.yaml | 151 ++++++++++++++ ...ctx1dep4_gen1dep32_batch32_eplb0_mtp0.yaml | 129 ++++++++++++ ..._gen4tep8_batch128_allconc_eplb0_mtp0.yaml | 142 +++++++++++++ ...p4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml | 126 +++++++++++ ...tx2dep4_gen1dep16_batch128_eplb0_mtp0.yaml | 141 +++++++++++++ ...2dep4_gen1dep16_batch512_eplb256_mtp0.yaml | 193 +++++++++++++++++ ...ctx2dep4_gen1dep32_batch64_eplb0_mtp0.yaml | 133 ++++++++++++ ...4dep4_gen1dep32_batch256_eplb288_mtp0.yaml | 161 +++++++++++++++ ...tx10dep4_gen1dep16_batch64_eplb0_mtp1.yaml | 139 +++++++++++++ .../ctx1dep4_gen2tep8_batch16_eplb0_mtp3.yaml | 133 ++++++++++++ ...p4_gen4tep8_batch8_allconc_eplb0_mtp3.yaml | 133 ++++++++++++ .../ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml | 130 ++++++++++++ 
.../ctx3dep4_gen1dep32_batch4_eplb0_mtp3.yaml | 130 ++++++++++++ .../ctx5dep4_gen1dep32_batch8_eplb0_mtp3.yaml | 132 ++++++++++++ ...ctx7dep4_gen1dep16_batch32_eplb0_mtp2.yaml | 135 ++++++++++++ ...ctx7dep4_gen1dep8_batch128_eplb0_mtp1.yaml | 147 +++++++++++++ ...x10dep4_gen1dep16_batch128_eplb0_mtp0.yaml | 141 +++++++++++++ .../ctx1dep4_gen2tep8_batch32_eplb0_mtp0.yaml | 130 ++++++++++++ .../ctx1dep4_gen3tep4_batch32_eplb0_mtp0.yaml | 129 ++++++++++++ ...p4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml | 126 +++++++++++ ...ctx5dep4_gen1dep32_batch16_eplb0_mtp0.yaml | 127 ++++++++++++ ...ctx8dep4_gen1dep32_batch32_eplb0_mtp0.yaml | 129 ++++++++++++ .../ctx1dep2_gen1dep32_batch8_eplb0_mtp3.yaml | 132 ++++++++++++ ...2_gen4tep8_batch16_allconc_eplb0_mtp3.yaml | 134 ++++++++++++ .../ctx1dep2_gen4tep8_batch32_eplb0_mtp2.yaml | 136 ++++++++++++ .../ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml | 131 ++++++++++++ ...ctx2dep2_gen1dep16_batch64_eplb0_mtp2.yaml | 139 +++++++++++++ ...ctx2dep2_gen1dep32_batch16_eplb0_mtp3.yaml | 133 ++++++++++++ ...ctx3dep2_gen1dep32_batch32_eplb0_mtp2.yaml | 135 ++++++++++++ ...4dep2_gen1dep16_batch256_eplb256_mtp1.yaml | 166 +++++++++++++++ ...ctx5dep2_gen2dep8_batch512_eplb0_mtp1.yaml | 195 ++++++++++++++++++ ...6dep2_gen1dep32_batch128_eplb288_mtp1.yaml | 150 ++++++++++++++ ...ctx1dep2_gen1dep32_batch16_eplb0_mtp0.yaml | 127 ++++++++++++ ...2_gen4tep8_batch64_allconc_eplb0_mtp0.yaml | 134 ++++++++++++ ...p2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml | 125 +++++++++++ ...ctx2dep2_gen1dep32_batch32_eplb0_mtp0.yaml | 129 ++++++++++++ ...ctx3dep2_gen1dep32_batch64_eplb0_mtp0.yaml | 133 ++++++++++++ ...4dep2_gen1dep16_batch512_eplb256_mtp0.yaml | 191 +++++++++++++++++ ...tx4dep2_gen1dep32_batch128_eplb0_mtp0.yaml | 141 +++++++++++++ ...6dep2_gen1dep32_batch256_eplb288_mtp0.yaml | 160 ++++++++++++++ ...tx12dep2_gen1dep16_batch32_eplb0_mtp2.yaml | 135 ++++++++++++ ...tx13dep2_gen1dep8_batch128_eplb0_mtp1.yaml | 147 +++++++++++++ 
...tx15dep2_gen1dep32_batch16_eplb0_mtp3.yaml | 133 ++++++++++++ ...tx18dep2_gen1dep16_batch64_eplb0_mtp1.yaml | 139 +++++++++++++ .../ctx1dep2_gen1tep8_batch16_eplb0_mtp3.yaml | 134 ++++++++++++ .../ctx1dep2_gen2tep8_batch8_eplb0_mtp3.yaml | 133 ++++++++++++ ...p2_gen4tep8_batch4_allconc_eplb0_mtp3.yaml | 132 ++++++++++++ .../ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml | 131 ++++++++++++ .../ctx5dep2_gen1dep32_batch4_eplb0_mtp3.yaml | 131 ++++++++++++ .../ctx9dep2_gen1dep32_batch8_eplb0_mtp3.yaml | 132 ++++++++++++ ...tx12dep2_gen1dep16_batch64_eplb0_mtp0.yaml | 133 ++++++++++++ ...tx15dep2_gen1dep32_batch32_eplb0_mtp0.yaml | 129 ++++++++++++ .../ctx1dep2_gen2tep8_batch16_eplb0_mtp0.yaml | 127 ++++++++++++ .../ctx1dep2_gen4tep8_batch1_eplb0_mtp0.yaml | 126 +++++++++++ ...p2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml | 125 +++++++++++ ...x20dep2_gen1dep16_batch128_eplb0_mtp0.yaml | 141 +++++++++++++ .../ctx2dep2_gen3tep8_batch32_eplb0_mtp0.yaml | 130 ++++++++++++ .../ctx4dep2_gen3tep8_batch64_eplb0_mtp0.yaml | 134 ++++++++++++ ...ctx9dep2_gen1dep32_batch16_eplb0_mtp0.yaml | 127 ++++++++++++ 66 files changed, 9122 insertions(+) create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch32_eplb0_mtp2.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch64_eplb0_mtp1.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch16_allconc_eplb0_mtp3.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch16_eplb0_mtp2.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch32_allconc_eplb0_mtp3.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml create mode 100644 
recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep16_batch256_eplb256_mtp1.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx3dep4_gen1dep32_batch128_eplb288_mtp1.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch32_eplb0_mtp0.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch128_eplb0_mtp0.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch512_eplb256_mtp0.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch64_eplb0_mtp0.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx4dep4_gen1dep32_batch256_eplb288_mtp0.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx10dep4_gen1dep16_batch64_eplb0_mtp1.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen2tep8_batch16_eplb0_mtp3.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch8_allconc_eplb0_mtp3.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx3dep4_gen1dep32_batch4_eplb0_mtp3.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep32_batch8_eplb0_mtp3.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx7dep4_gen1dep16_batch32_eplb0_mtp2.yaml 
create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx7dep4_gen1dep8_batch128_eplb0_mtp1.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx10dep4_gen1dep16_batch128_eplb0_mtp0.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen2tep8_batch32_eplb0_mtp0.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen3tep4_batch32_eplb0_mtp0.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep32_batch16_eplb0_mtp0.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx8dep4_gen1dep32_batch32_eplb0_mtp0.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen1dep32_batch8_eplb0_mtp3.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch16_allconc_eplb0_mtp3.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch32_eplb0_mtp2.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep16_batch64_eplb0_mtp2.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep32_batch16_eplb0_mtp3.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx3dep2_gen1dep32_batch32_eplb0_mtp2.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx4dep2_gen1dep16_batch256_eplb256_mtp1.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx5dep2_gen2dep8_batch512_eplb0_mtp1.yaml 
create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx6dep2_gen1dep32_batch128_eplb288_mtp1.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen1dep32_batch16_eplb0_mtp0.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen4tep8_batch64_allconc_eplb0_mtp0.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx2dep2_gen1dep32_batch32_eplb0_mtp0.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx3dep2_gen1dep32_batch64_eplb0_mtp0.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep16_batch512_eplb256_mtp0.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep32_batch128_eplb0_mtp0.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx6dep2_gen1dep32_batch256_eplb288_mtp0.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx12dep2_gen1dep16_batch32_eplb0_mtp2.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx13dep2_gen1dep8_batch128_eplb0_mtp1.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx15dep2_gen1dep32_batch16_eplb0_mtp3.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx18dep2_gen1dep16_batch64_eplb0_mtp1.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen1tep8_batch16_eplb0_mtp3.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen2tep8_batch8_eplb0_mtp3.yaml create mode 100644 
recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen4tep8_batch4_allconc_eplb0_mtp3.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx5dep2_gen1dep32_batch4_eplb0_mtp3.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx9dep2_gen1dep32_batch8_eplb0_mtp3.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx12dep2_gen1dep16_batch64_eplb0_mtp0.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx15dep2_gen1dep32_batch32_eplb0_mtp0.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen2tep8_batch16_eplb0_mtp0.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen4tep8_batch1_eplb0_mtp0.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx20dep2_gen1dep16_batch128_eplb0_mtp0.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx2dep2_gen3tep8_batch32_eplb0_mtp0.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx4dep2_gen3tep8_batch64_eplb0_mtp0.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx9dep2_gen1dep32_batch16_eplb0_mtp0.yaml diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch32_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch32_eplb0_mtp2.yaml new file mode 100644 index 00000000..21edc148 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch32_eplb0_mtp2.yaml @@ 
-0,0 +1,135 @@ +name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep16_batch32_eplb0_mtp2" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=32 +# concurrency: 666 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 32 + max_num_tokens: 96 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + 
cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "666" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch64_eplb0_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch64_eplb0_mtp1.yaml new file mode 100644 index 00000000..ebcd45d1 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch64_eplb0_mtp1.yaml @@ -0,0 +1,139 @@ +name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep16_batch64_eplb0_mtp1" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=64 +# concurrency: 1229 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + 
decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 64 + max_num_tokens: 128 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1229" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + 
interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch16_allconc_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch16_allconc_eplb0_mtp3.yaml new file mode 100644 index 00000000..68af65ee --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch16_allconc_eplb0_mtp3.yaml @@ -0,0 +1,133 @@ +name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep32_batch16_allconc_eplb0_mtp3" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=16 +# concurrencies: 333 (batch8), 666 (batch16) + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + 
cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 64 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "333x666" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch16_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch16_eplb0_mtp2.yaml new file mode 100644 index 00000000..d6d3dcf1 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch16_eplb0_mtp2.yaml @@ -0,0 +1,134 @@ +name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen4tep8_batch16_eplb0_mtp2" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 4 decode workers, TP8/EP8, enable_attention_dp=false, max_batch=16 +# concurrency: 96 + +model: + path: "nvidia/GLM5-NVFP4" + container: 
"nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + + decode: + allreduce_strategy: MNNVL + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 48 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + 
cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "96" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch32_allconc_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch32_allconc_eplb0_mtp3.yaml new file mode 100644 index 00000000..da187faf --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch32_allconc_eplb0_mtp3.yaml @@ -0,0 +1,136 @@ +name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen4tep8_batch32_allconc_eplb0_mtp3" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 4 decode workers, TP8/EP8, enable_attention_dp=false, max_batch=32 +# concurrencies: 8 (batch1), 44 (batch8), 192 (batch32) + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + 
TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + allreduce_strategy: MNNVL + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "8x44x192" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml 
b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml new file mode 100644 index 00000000..a6121cd0 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml @@ -0,0 +1,131 @@ +name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen5tep4_batch1_eplb0_mtp3" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 5 decode workers, TP4/EP4, enable_attention_dp=false, max_batch=1 +# concurrency: 10 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + gpus_per_decode: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + 
enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 1 + max_num_tokens: 4 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "10" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep16_batch256_eplb256_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep16_batch256_eplb256_mtp1.yaml new file mode 100644 index 00000000..dc176b2d --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep16_batch256_eplb256_mtp1.yaml @@ -0,0 +1,167 @@ +name: "glm5_nvfp4_ISL1K_OSL1K_ctx2dep4_gen1dep16_batch256_eplb256_mtp1" + +# ctx: 2 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=256 +# EPLB: num_slots=256 +# concurrency: 4301 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + 
gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 256 + max_num_tokens: 512 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + layer_updates_per_iter: 1 + num_slots: 256 + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + 
free_gpu_memory_fraction: 0.7 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4301" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx3dep4_gen1dep32_batch128_eplb288_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx3dep4_gen1dep32_batch128_eplb288_mtp1.yaml new file mode 100644 index 00000000..a7a1c790 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx3dep4_gen1dep32_batch128_eplb288_mtp1.yaml @@ -0,0 +1,151 @@ +name: "glm5_nvfp4_ISL1K_OSL1K_ctx3dep4_gen1dep32_batch128_eplb288_mtp1" + +# ctx: 3 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=128 +# EPLB: num_slots=288 +# concurrency: 4301 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 3 + prefill_workers: 3 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + 
TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 128 + max_num_tokens: 256 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + layer_updates_per_iter: 1 + num_slots: 288 + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4301" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git 
a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch32_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch32_eplb0_mtp0.yaml new file mode 100644 index 00000000..7412a109 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch32_eplb0_mtp0.yaml @@ -0,0 +1,129 @@ +name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep32_batch32_eplb0_mtp0" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=32 +# concurrency: 1229 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 32 + 
moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1229" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml new file mode 100644 index 00000000..e969c07d --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml @@ -0,0 +1,142 @@ +name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 4 decode workers, TP8/EP8, enable_attention_dp=false, max_batch=128 +# Merged concurrencies: batch1(4), batch32(180), batch64(360), batch128(616) + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + 
decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + allreduce_strategy: MNNVL + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 128 + max_num_tokens: 128 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - 
cuda_core + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x180x360x616" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml new file mode 100644 index 00000000..fb583747 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml @@ -0,0 +1,126 @@ +name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 5 decode workers, TP4/EP4, enable_attention_dp=false, max_batch=8 +# Merged concurrencies: batch1(5), batch2(15), batch4(30), batch8(50) + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + gpus_per_decode: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + 
trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 8 + max_num_tokens: 8 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "5x15x30x50" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch128_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch128_eplb0_mtp0.yaml new file mode 100644 index 00000000..e057ce05 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch128_eplb0_mtp0.yaml @@ -0,0 +1,141 @@ +name: "glm5_nvfp4_ISL1K_OSL1K_ctx2dep4_gen1dep16_batch128_eplb0_mtp0" + +# ctx: 2 prefill workers, TP4/EP4 +# gen: 1 decode 
worker, TP16/EP16, enable_attention_dp=true, max_batch=128 +# concurrency: 2253 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 128 + max_num_tokens: 128 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + moe_config: + backend: CUTEDSL 
+ use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2253" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch512_eplb256_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch512_eplb256_mtp0.yaml new file mode 100644 index 00000000..d221dde2 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch512_eplb256_mtp0.yaml @@ -0,0 +1,193 @@ +name: "glm5_nvfp4_ISL1K_OSL1K_ctx2dep4_gen1dep16_batch512_eplb256_mtp0" + +# ctx: 2 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=512 +# EPLB: num_slots=256 +# concurrency: 8192 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + 
TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 512 + max_num_tokens: 512 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + - 264 + - 272 + - 280 + - 288 + - 296 + - 304 + - 312 + - 320 + - 328 + - 336 + - 344 + - 352 + - 360 + - 368 + - 376 + - 384 + - 392 + - 400 + - 408 + - 416 + - 424 + - 432 + - 440 + - 448 + - 456 + - 464 + - 472 + - 480 + - 488 + - 496 + - 504 + - 512 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + layer_updates_per_iter: 1 + num_slots: 256 + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + 
concurrencies: "8192" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch64_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch64_eplb0_mtp0.yaml new file mode 100644 index 00000000..bbad79c1 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch64_eplb0_mtp0.yaml @@ -0,0 +1,133 @@ +name: "glm5_nvfp4_ISL1K_OSL1K_ctx2dep4_gen1dep32_batch64_eplb0_mtp0" + +# ctx: 2 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=64 +# concurrency: 2253 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + 
cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2253" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx4dep4_gen1dep32_batch256_eplb288_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx4dep4_gen1dep32_batch256_eplb288_mtp0.yaml new file mode 100644 index 00000000..26d2d29e --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx4dep4_gen1dep32_batch256_eplb288_mtp0.yaml @@ -0,0 +1,161 @@ +name: "glm5_nvfp4_ISL1K_OSL1K_ctx4dep4_gen1dep32_batch256_eplb288_mtp0" + +# ctx: 4 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=256 +# EPLB: num_slots=288 +# concurrency: 
8192 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 4 + prefill_workers: 4 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 256 + max_num_tokens: 256 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 
240 + - 248 + - 256 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + layer_updates_per_iter: 1 + num_slots: 288 + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "8192" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx10dep4_gen1dep16_batch64_eplb0_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx10dep4_gen1dep16_batch64_eplb0_mtp1.yaml new file mode 100644 index 00000000..420192c2 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx10dep4_gen1dep16_batch64_eplb0_mtp1.yaml @@ -0,0 +1,139 @@ +name: "glm5_nvfp4_ISL8K_OSL1K_ctx10dep4_gen1dep16_batch64_eplb0_mtp1" + +# ctx: 10 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=64 +# concurrency: 1229 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 10 + prefill_workers: 10 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + 
NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 64 + max_num_tokens: 128 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1229" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git 
a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen2tep8_batch16_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen2tep8_batch16_eplb0_mtp3.yaml new file mode 100644 index 00000000..da3186e5 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen2tep8_batch16_eplb0_mtp3.yaml @@ -0,0 +1,133 @@ +name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen2tep8_batch16_eplb0_mtp3" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 2 decode workers, TP8/EP8, max_batch=16, concurrency: 46 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 2 + decode_nodes: 4 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + 
allreduce_strategy: MNNVL + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 64 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "46" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch8_allconc_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch8_allconc_eplb0_mtp3.yaml new file mode 100644 index 00000000..fb94a549 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch8_allconc_eplb0_mtp3.yaml @@ -0,0 +1,133 @@ +name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen4tep8_batch8_allconc_eplb0_mtp3" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 4 decode workers, TP8/EP8, max_batch=8 +# concurrencies: 4 (batch1), 48 (batch8) + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + 
prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + allreduce_strategy: MNNVL + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl 
+ - cuda_core + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x48" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml new file mode 100644 index 00000000..0a13cce4 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml @@ -0,0 +1,130 @@ +name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen5tep4_batch1_eplb0_mtp3" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 5 decode workers, TP4/EP4, max_batch=1, concurrency: 5 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + gpus_per_decode: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + 
max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 1 + max_num_tokens: 4 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "5" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx3dep4_gen1dep32_batch4_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx3dep4_gen1dep32_batch4_eplb0_mtp3.yaml new file mode 100644 index 00000000..440a4f73 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx3dep4_gen1dep32_batch4_eplb0_mtp3.yaml @@ -0,0 +1,130 @@ +name: 
"glm5_nvfp4_ISL8K_OSL1K_ctx3dep4_gen1dep32_batch4_eplb0_mtp3" + +# ctx: 3 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP32/EP32, max_batch=4, concurrency: 167 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 3 + prefill_workers: 3 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 4 + max_num_tokens: 16 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 
+ - 4 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "167" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep32_batch8_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep32_batch8_eplb0_mtp3.yaml new file mode 100644 index 00000000..492f1b4c --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep32_batch8_eplb0_mtp3.yaml @@ -0,0 +1,132 @@ +name: "glm5_nvfp4_ISL8K_OSL1K_ctx5dep4_gen1dep32_batch8_eplb0_mtp3" + +# ctx: 5 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=8 +# concurrency: 333 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 5 + prefill_workers: 5 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: 
"0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "333" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git 
a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx7dep4_gen1dep16_batch32_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx7dep4_gen1dep16_batch32_eplb0_mtp2.yaml new file mode 100644 index 00000000..d22fbcf1 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx7dep4_gen1dep16_batch32_eplb0_mtp2.yaml @@ -0,0 +1,135 @@ +name: "glm5_nvfp4_ISL8K_OSL1K_ctx7dep4_gen1dep16_batch32_eplb0_mtp2" + +# ctx: 7 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=32 +# concurrency: 615 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 7 + prefill_workers: 7 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + 
num_nextn_predict_layers: 2 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 32 + max_num_tokens: 96 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "615" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx7dep4_gen1dep8_batch128_eplb0_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx7dep4_gen1dep8_batch128_eplb0_mtp1.yaml new file mode 100644 index 00000000..804e89b5 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx7dep4_gen1dep8_batch128_eplb0_mtp1.yaml @@ -0,0 +1,147 @@ +name: "glm5_nvfp4_ISL8K_OSL1K_ctx7dep4_gen1dep8_batch128_eplb0_mtp1" + +# ctx: 7 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP8/EP8, enable_attention_dp=true, max_batch=128 +# concurrency: 1076 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 
7 + prefill_workers: 7 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 128 + max_num_tokens: 256 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + cache_transceiver_config: + backend: UCX + 
max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1076" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx10dep4_gen1dep16_batch128_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx10dep4_gen1dep16_batch128_eplb0_mtp0.yaml new file mode 100644 index 00000000..0fa8566d --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx10dep4_gen1dep16_batch128_eplb0_mtp0.yaml @@ -0,0 +1,141 @@ +name: "glm5_nvfp4_ISL8K_OSL1K_ctx10dep4_gen1dep16_batch128_eplb0_mtp0" + +# ctx: 10 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=128 +# concurrency: 2253 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 10 + prefill_workers: 10 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + 
moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 128 + max_num_tokens: 128 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2253" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen2tep8_batch32_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen2tep8_batch32_eplb0_mtp0.yaml new file mode 100644 index 00000000..478f6203 --- /dev/null +++ 
b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen2tep8_batch32_eplb0_mtp0.yaml @@ -0,0 +1,130 @@ +name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen2tep8_batch32_eplb0_mtp0" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 2 decode workers, TP8/EP8, enable_attention_dp=false, max_batch=32 +# concurrency: 84 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 2 + decode_nodes: 4 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + allreduce_strategy: MNNVL + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 9256 + print_iter_log: true + 
stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "84" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen3tep4_batch32_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen3tep4_batch32_eplb0_mtp0.yaml new file mode 100644 index 00000000..462401b6 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen3tep4_batch32_eplb0_mtp0.yaml @@ -0,0 +1,129 @@ +name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen3tep4_batch32_eplb0_mtp0" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 3 decode workers, TP4/EP4, enable_attention_dp=false, max_batch=32 +# concurrency: 117 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 3 + decode_nodes: 3 + gpus_per_decode: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + 
ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "117" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml 
b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml new file mode 100644 index 00000000..90e62af3 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml @@ -0,0 +1,126 @@ +name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 5 decode workers, TP4/EP4, enable_attention_dp=false, max_batch=8 +# Merged concurrencies: batch1(5), batch2(10), batch4(25), batch8(50) + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + gpus_per_decode: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + 
enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 8 + max_num_tokens: 8 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "5x10x25x50" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep32_batch16_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep32_batch16_eplb0_mtp0.yaml new file mode 100644 index 00000000..7a6ece31 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep32_batch16_eplb0_mtp0.yaml @@ -0,0 +1,127 @@ +name: "glm5_nvfp4_ISL8K_OSL1K_ctx5dep4_gen1dep32_batch16_eplb0_mtp0" + +# ctx: 5 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=16 +# concurrency: 615 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 5 + prefill_workers: 5 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + 
MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "615" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + 
interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx8dep4_gen1dep32_batch32_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx8dep4_gen1dep32_batch32_eplb0_mtp0.yaml new file mode 100644 index 00000000..7e34b6d9 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx8dep4_gen1dep32_batch32_eplb0_mtp0.yaml @@ -0,0 +1,129 @@ +name: "glm5_nvfp4_ISL8K_OSL1K_ctx8dep4_gen1dep32_batch32_eplb0_mtp0" + +# ctx: 8 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=32 +# concurrency: 1229 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 8 + prefill_workers: 8 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 
16384 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1229" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen1dep32_batch8_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen1dep32_batch8_eplb0_mtp3.yaml new file mode 100644 index 00000000..80aacc6a --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen1dep32_batch8_eplb0_mtp3.yaml @@ -0,0 +1,132 @@ +name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep2_gen1dep32_batch8_eplb0_mtp3" + +# ctx: 1 prefill worker, TP2/EP2 +# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=8 +# concurrency: 333 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 8 + 
gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + 
type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "333" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch16_allconc_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch16_allconc_eplb0_mtp3.yaml new file mode 100644 index 00000000..648ec949 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch16_allconc_eplb0_mtp3.yaml @@ -0,0 +1,134 @@ +name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep2_gen4tep8_batch16_allconc_eplb0_mtp3" + +# ctx: 1 prefill worker, TP2/EP2 +# gen: 4 decode workers, TP8/EP8, enable_attention_dp=false, max_batch=16 +# concurrencies: 24 (batch4), 44 (batch8), 92 (batch16) + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" 
+ max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + allreduce_strategy: MNNVL + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 64 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "24x44x92" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch32_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch32_eplb0_mtp2.yaml new file mode 100644 index 00000000..823624ac --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch32_eplb0_mtp2.yaml @@ -0,0 +1,136 @@ +name: 
"glm5_nvfp4_ISL1K_OSL1K_ctx1dep2_gen4tep8_batch32_eplb0_mtp2" + +# ctx: 1 prefill worker, TP2/EP2 +# gen: 4 decode workers, TP8/EP8, enable_attention_dp=false, max_batch=32 +# concurrency: 180 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + + decode: + allreduce_strategy: MNNVL + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 32 + max_num_tokens: 96 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + 
cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "180" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml new file mode 100644 index 00000000..64b61b9f --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml @@ -0,0 +1,131 @@ +name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep2_gen5tep4_batch1_eplb0_mtp3" + +# ctx: 1 prefill worker, TP2/EP2 +# gen: 5 decode workers, TP4/EP4, enable_attention_dp=false, max_batch=1 +# concurrency: 10 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 5 + decode_nodes: 5 + gpus_per_decode: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + 
decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 1 + max_num_tokens: 4 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "10" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git 
a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep16_batch64_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep16_batch64_eplb0_mtp2.yaml new file mode 100644 index 00000000..66d211aa --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep16_batch64_eplb0_mtp2.yaml @@ -0,0 +1,139 @@ +name: "glm5_nvfp4_ISL1K_OSL1K_ctx2dep2_gen1dep16_batch64_eplb0_mtp2" + +# ctx: 2 prefill workers, TP2/EP2 +# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=64 +# concurrency: 1229 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 1 + prefill_workers: 2 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + 
num_nextn_predict_layers: 2 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 64 + max_num_tokens: 192 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1229" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep32_batch16_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep32_batch16_eplb0_mtp3.yaml new file mode 100644 index 00000000..fe754372 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep32_batch16_eplb0_mtp3.yaml @@ -0,0 +1,133 @@ +name: "glm5_nvfp4_ISL1K_OSL1K_ctx2dep2_gen1dep32_batch16_eplb0_mtp3" + +# ctx: 2 prefill workers, TP2/EP2 +# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=16 +# concurrency: 666 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + 
gpu_type: "gb300" + + prefill_nodes: 1 + prefill_workers: 2 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 64 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass 
+ - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "666" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx3dep2_gen1dep32_batch32_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx3dep2_gen1dep32_batch32_eplb0_mtp2.yaml new file mode 100644 index 00000000..70821f3e --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx3dep2_gen1dep32_batch32_eplb0_mtp2.yaml @@ -0,0 +1,135 @@ +name: "glm5_nvfp4_ISL1K_OSL1K_ctx3dep2_gen1dep32_batch32_eplb0_mtp2" + +# ctx: 3 prefill workers, TP2/EP2 +# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=32 +# concurrency: 1229 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 2 + prefill_workers: 3 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: 
true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 32 + max_num_tokens: 96 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1229" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx4dep2_gen1dep16_batch256_eplb256_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx4dep2_gen1dep16_batch256_eplb256_mtp1.yaml new file mode 100644 index 00000000..bf3183b7 --- /dev/null +++ 
b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx4dep2_gen1dep16_batch256_eplb256_mtp1.yaml @@ -0,0 +1,166 @@ +name: "glm5_nvfp4_ISL1K_OSL1K_ctx4dep2_gen1dep16_batch256_eplb256_mtp1" + +# ctx: 4 prefill workers, TP2/EP2, EPLB: num_slots=256, max_batch=256 +# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=256 +# concurrency: 4301 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 2 + prefill_workers: 4 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" 
+ max_batch_size: 256 + max_num_tokens: 512 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + layer_updates_per_iter: 1 + num_slots: 256 + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4301" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx5dep2_gen2dep8_batch512_eplb0_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx5dep2_gen2dep8_batch512_eplb0_mtp1.yaml new file mode 100644 index 00000000..1d9f4f10 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx5dep2_gen2dep8_batch512_eplb0_mtp1.yaml @@ -0,0 +1,195 @@ +name: "glm5_nvfp4_ISL1K_OSL1K_ctx5dep2_gen2dep8_batch512_eplb0_mtp1" + +# ctx: 5 prefill workers, TP2/EP2 +# gen: 2 decode workers, TP8/EP8, enable_attention_dp=true, max_batch=512 +# concurrency: 8602 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + 
gpu_type: "gb300" + + prefill_nodes: 3 + prefill_workers: 5 + gpus_per_prefill: 2 + + decode_workers: 2 + decode_nodes: 4 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 512 + max_num_tokens: 1024 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + - 264 + - 272 + - 280 + - 288 + - 296 + - 304 + - 312 + - 
320 + - 328 + - 336 + - 344 + - 352 + - 360 + - 368 + - 376 + - 384 + - 392 + - 400 + - 408 + - 416 + - 424 + - 432 + - 440 + - 448 + - 456 + - 464 + - 472 + - 480 + - 488 + - 496 + - 504 + - 512 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "8602" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx6dep2_gen1dep32_batch128_eplb288_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx6dep2_gen1dep32_batch128_eplb288_mtp1.yaml new file mode 100644 index 00000000..44b81b3c --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx6dep2_gen1dep32_batch128_eplb288_mtp1.yaml @@ -0,0 +1,150 @@ +name: "glm5_nvfp4_ISL1K_OSL1K_ctx6dep2_gen1dep32_batch128_eplb288_mtp1" + +# ctx: 6 prefill workers, TP2/EP2, EPLB: num_slots=288, max_batch=128 +# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=128 +# concurrency: 4301 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 3 + prefill_workers: 6 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + 
NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 128 + max_num_tokens: 256 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + layer_updates_per_iter: 1 + num_slots: 288 + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + 
num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4301" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen1dep32_batch16_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen1dep32_batch16_eplb0_mtp0.yaml new file mode 100644 index 00000000..0410623b --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen1dep32_batch16_eplb0_mtp0.yaml @@ -0,0 +1,127 @@ +name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep2_gen1dep32_batch16_eplb0_mtp0" + +# ctx: 1 prefill worker, TP2/EP2 +# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=16 +# concurrency: 615 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 
16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "615" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen4tep8_batch64_allconc_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen4tep8_batch64_allconc_eplb0_mtp0.yaml new file mode 100644 index 00000000..d967e3b2 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen4tep8_batch64_allconc_eplb0_mtp0.yaml @@ -0,0 +1,134 @@ +name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep2_gen4tep8_batch64_allconc_eplb0_mtp0" + +# ctx: 1 prefill worker, TP2/EP2 +# gen: 4 decode workers, TP8/EP8, enable_attention_dp=false, 
max_batch=64 +# Merged concurrencies: batch16(84), batch32(180), batch64(336) + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + allreduce_strategy: MNNVL + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true 
+ kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "84x180x336" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml new file mode 100644 index 00000000..d9f9ea2f --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml @@ -0,0 +1,125 @@ +name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0" + +# ctx: 1 prefill worker, TP2/EP2 +# gen: 5 decode workers, TP4/EP4, enable_attention_dp=false, max_batch=4 +# Merged concurrencies: batch1(5), batch2(10), batch4(25) + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 5 + decode_nodes: 5 + gpus_per_decode: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + 
TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 4 + max_num_tokens: 4 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "5x10x25" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx2dep2_gen1dep32_batch32_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx2dep2_gen1dep32_batch32_eplb0_mtp0.yaml new file mode 100644 index 00000000..26ddd7b1 --- /dev/null +++ 
b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx2dep2_gen1dep32_batch32_eplb0_mtp0.yaml @@ -0,0 +1,129 @@ +name: "glm5_nvfp4_ISL1K_OSL1K_ctx2dep2_gen1dep32_batch32_eplb0_mtp0" + +# ctx: 2 prefill workers, TP2/EP2, max_batch=32 +# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=32 +# concurrency: 1229 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 1 + prefill_workers: 2 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 2088 + print_iter_log: true + 
stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1229" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx3dep2_gen1dep32_batch64_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx3dep2_gen1dep32_batch64_eplb0_mtp0.yaml new file mode 100644 index 00000000..081e96da --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx3dep2_gen1dep32_batch64_eplb0_mtp0.yaml @@ -0,0 +1,133 @@ +name: "glm5_nvfp4_ISL1K_OSL1K_ctx3dep2_gen1dep32_batch64_eplb0_mtp0" + +# ctx: 3 prefill workers, TP2/EP2, max_batch=64 +# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=64 +# concurrency: 2253 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 2 + prefill_workers: 3 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + 
decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2253" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git 
a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep16_batch512_eplb256_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep16_batch512_eplb256_mtp0.yaml new file mode 100644 index 00000000..dbca4fd5 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep16_batch512_eplb256_mtp0.yaml @@ -0,0 +1,191 @@ +name: "glm5_nvfp4_ISL1K_OSL1K_ctx4dep2_gen1dep16_batch512_eplb256_mtp0" + +# ctx: 4 prefill workers, TP2/EP2 +# gen: 1 decode worker, TP16/EP16, EPLB: num_slots=256, max_batch=512, concurrency: 8192 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 2 + prefill_workers: 4 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 16 + 
moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 512 + max_num_tokens: 512 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + - 264 + - 272 + - 280 + - 288 + - 296 + - 304 + - 312 + - 320 + - 328 + - 336 + - 344 + - 352 + - 360 + - 368 + - 376 + - 384 + - 392 + - 400 + - 408 + - 416 + - 424 + - 432 + - 440 + - 448 + - 456 + - 464 + - 472 + - 480 + - 488 + - 496 + - 504 + - 512 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + layer_updates_per_iter: 1 + num_slots: 256 + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "8192" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep32_batch128_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep32_batch128_eplb0_mtp0.yaml new file mode 100644 index 00000000..1c8d2d78 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep32_batch128_eplb0_mtp0.yaml @@ 
-0,0 +1,141 @@ +name: "glm5_nvfp4_ISL1K_OSL1K_ctx4dep2_gen1dep32_batch128_eplb0_mtp0" + +# ctx: 4 prefill workers, TP2/EP2, max_batch=128 +# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=128 +# concurrency: 4301 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 2 + prefill_workers: 4 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 128 + max_num_tokens: 128 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 
2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4301" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx6dep2_gen1dep32_batch256_eplb288_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx6dep2_gen1dep32_batch256_eplb288_mtp0.yaml new file mode 100644 index 00000000..0d6870ff --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx6dep2_gen1dep32_batch256_eplb288_mtp0.yaml @@ -0,0 +1,160 @@ +name: "glm5_nvfp4_ISL1K_OSL1K_ctx6dep2_gen1dep32_batch256_eplb288_mtp0" + +# ctx: 6 prefill workers, EPLB: num_slots=288, TP2/EP2, max_batch=256 +# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=256 +# concurrency: 8192 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 3 + prefill_workers: 6 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + 
decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 256 + max_num_tokens: 256 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + layer_updates_per_iter: 1 + num_slots: 288 + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "8192" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + 
+frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx12dep2_gen1dep16_batch32_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx12dep2_gen1dep16_batch32_eplb0_mtp2.yaml new file mode 100644 index 00000000..8940ea72 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx12dep2_gen1dep16_batch32_eplb0_mtp2.yaml @@ -0,0 +1,135 @@ +name: "glm5_nvfp4_ISL8K_OSL1K_ctx12dep2_gen1dep16_batch32_eplb0_mtp2" + +# ctx: 12 prefill workers, TP2/EP2 +# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=32 +# concurrency: 666 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 6 + prefill_workers: 12 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: 
false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 32 + max_num_tokens: 96 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "666" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx13dep2_gen1dep8_batch128_eplb0_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx13dep2_gen1dep8_batch128_eplb0_mtp1.yaml new file mode 100644 index 00000000..29eba0b3 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx13dep2_gen1dep8_batch128_eplb0_mtp1.yaml @@ -0,0 +1,147 @@ +name: "glm5_nvfp4_ISL8K_OSL1K_ctx13dep2_gen1dep8_batch128_eplb0_mtp1" + +# ctx: 13 prefill workers, TP2/EP2 +# gen: 1 decode worker, TP8/EP8, enable_attention_dp=true, max_batch=128 +# concurrency: 1076 + +model: + path: 
"nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 7 + prefill_workers: 13 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 128 + max_num_tokens: 256 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + moe_config: + backend: CUTEDSL + 
use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1076" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx15dep2_gen1dep32_batch16_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx15dep2_gen1dep32_batch16_eplb0_mtp3.yaml new file mode 100644 index 00000000..f8fcdac9 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx15dep2_gen1dep32_batch16_eplb0_mtp3.yaml @@ -0,0 +1,133 @@ +name: "glm5_nvfp4_ISL8K_OSL1K_ctx15dep2_gen1dep32_batch16_eplb0_mtp3" + +# ctx: 15 prefill workers, TP2/EP2 +# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=16 +# concurrency: 666 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 8 + prefill_workers: 15 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" 
+ TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 64 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "666" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx18dep2_gen1dep16_batch64_eplb0_mtp1.yaml 
b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx18dep2_gen1dep16_batch64_eplb0_mtp1.yaml new file mode 100644 index 00000000..775fa68f --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx18dep2_gen1dep16_batch64_eplb0_mtp1.yaml @@ -0,0 +1,139 @@ +name: "glm5_nvfp4_ISL8K_OSL1K_ctx18dep2_gen1dep16_batch64_eplb0_mtp1" + +# ctx: 18 prefill workers, TP2/EP2 +# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=64 +# concurrency: 1229 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 9 + prefill_workers: 18 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + 
pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 64 + max_num_tokens: 128 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1229" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen1tep8_batch16_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen1tep8_batch16_eplb0_mtp3.yaml new file mode 100644 index 00000000..c457cce0 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen1tep8_batch16_eplb0_mtp3.yaml @@ -0,0 +1,134 @@ +name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep2_gen1tep8_batch16_eplb0_mtp3" + +# ctx: 1 prefill worker, TP2/EP2 +# gen: 1 decode worker, TP8/EP8 (MNNVL), max_batch=16 +# concurrency: 24 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 2 + 
gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + allreduce_strategy: MNNVL + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 64 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + 
num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "24" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen2tep8_batch8_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen2tep8_batch8_eplb0_mtp3.yaml new file mode 100644 index 00000000..517cf361 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen2tep8_batch8_eplb0_mtp3.yaml @@ -0,0 +1,133 @@ +name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep2_gen2tep8_batch8_eplb0_mtp3" + +# ctx: 1 prefill worker, TP2/EP2 +# gen: 2 decode workers, TP8/EP8 (MNNVL), max_batch=8 +# concurrency: 22 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 2 + decode_nodes: 4 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + 
max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + allreduce_strategy: MNNVL + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "22" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen4tep8_batch4_allconc_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen4tep8_batch4_allconc_eplb0_mtp3.yaml new file mode 100644 index 00000000..20599c3f --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen4tep8_batch4_allconc_eplb0_mtp3.yaml @@ -0,0 +1,132 @@ +name: 
"glm5_nvfp4_ISL8K_OSL1K_ctx1dep2_gen4tep8_batch4_allconc_eplb0_mtp3" + +# ctx: 1 prefill worker, TP2/EP2 +# gen: 4 decode workers, TP8/EP8 (MNNVL), max_batch=4 +# concurrencies: 4 (batch1), 24 (batch4) + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + allreduce_strategy: MNNVL + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 4 + max_num_tokens: 16 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + 
cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x24" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml new file mode 100644 index 00000000..0037f722 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml @@ -0,0 +1,131 @@ +name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep2_gen5tep4_batch1_eplb0_mtp3" + +# ctx: 1 prefill worker, TP2/EP2 +# gen: 5 decode workers, TP4/EP4, max_batch=1 +# concurrency: 5 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 5 + decode_nodes: 5 + gpus_per_decode: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + 
MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 1 + max_num_tokens: 4 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "5" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git 
a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx5dep2_gen1dep32_batch4_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx5dep2_gen1dep32_batch4_eplb0_mtp3.yaml new file mode 100644 index 00000000..6e233408 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx5dep2_gen1dep32_batch4_eplb0_mtp3.yaml @@ -0,0 +1,131 @@ +name: "glm5_nvfp4_ISL8K_OSL1K_ctx5dep2_gen1dep32_batch4_eplb0_mtp3" + +# ctx: 5 prefill workers, TP2/EP2 +# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, enable_lm_head_tp_in_adp=true, max_batch=4 +# concurrency: 180 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 3 + prefill_workers: 5 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + 
decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 4 + max_num_tokens: 16 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "180" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx9dep2_gen1dep32_batch8_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx9dep2_gen1dep32_batch8_eplb0_mtp3.yaml new file mode 100644 index 00000000..bd1cb583 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx9dep2_gen1dep32_batch8_eplb0_mtp3.yaml @@ -0,0 +1,132 @@ +name: "glm5_nvfp4_ISL8K_OSL1K_ctx9dep2_gen1dep32_batch8_eplb0_mtp3" + +# ctx: 9 prefill workers, TP2/EP2 +# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, enable_lm_head_tp_in_adp=true, max_batch=8 +# concurrency: 333 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 5 + 
prefill_workers: 9 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + 
speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "333" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx12dep2_gen1dep16_batch64_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx12dep2_gen1dep16_batch64_eplb0_mtp0.yaml new file mode 100644 index 00000000..611aebb6 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx12dep2_gen1dep16_batch64_eplb0_mtp0.yaml @@ -0,0 +1,133 @@ +name: "glm5_nvfp4_ISL8K_OSL1K_ctx12dep2_gen1dep16_batch64_eplb0_mtp0" + +# ctx: 12 prefill workers, TP2/EP2 +# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=64 +# concurrency: 1127 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 6 + prefill_workers: 12 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + 
custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1127" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx15dep2_gen1dep32_batch32_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx15dep2_gen1dep32_batch32_eplb0_mtp0.yaml new file mode 100644 index 00000000..831e703d --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx15dep2_gen1dep32_batch32_eplb0_mtp0.yaml @@ -0,0 +1,129 @@ +name: "glm5_nvfp4_ISL8K_OSL1K_ctx15dep2_gen1dep32_batch32_eplb0_mtp0" + +# ctx: 15 prefill workers, TP2/EP2 
+# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=32 +# concurrency: 1229 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 8 + prefill_workers: 15 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + 
enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1229" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen2tep8_batch16_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen2tep8_batch16_eplb0_mtp0.yaml new file mode 100644 index 00000000..8ff2f420 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen2tep8_batch16_eplb0_mtp0.yaml @@ -0,0 +1,127 @@ +name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep2_gen2tep8_batch16_eplb0_mtp0" + +# ctx: 1 prefill worker, TP2/EP2 +# gen: 2 decode workers, TP8/EP8 (MNNVL), max_batch=16 +# concurrency: 42 +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 2 + decode_nodes: 4 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 
+ pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + allreduce_strategy: MNNVL + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "42" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen4tep8_batch1_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen4tep8_batch1_eplb0_mtp0.yaml new file mode 100644 index 00000000..cc8faa11 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen4tep8_batch1_eplb0_mtp0.yaml @@ -0,0 +1,126 @@ +name: 
"glm5_nvfp4_ISL8K_OSL1K_ctx1dep2_gen4tep8_batch1_eplb0_mtp0" + +# ctx: 1 prefill worker, TP2/EP2 +# gen: 4 decode workers, TP8/EP8 (MNNVL), max_batch=1 +# concurrency: 4 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + allreduce_strategy: MNNVL + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + moe_config: + backend: TRTLLM + 
use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml new file mode 100644 index 00000000..06d02024 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml @@ -0,0 +1,125 @@ +name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0" + +# ctx: 1 prefill worker, TP2/EP2 +# gen: 5 decode workers, TP4/EP4, max_batch=4 +# concurrencies: 5 (batch1), 10 (batch2), 25 (batch4) — merged as 5x10x25 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 5 + decode_nodes: 5 + gpus_per_decode: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + 
TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 4 + max_num_tokens: 4 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "5x10x25" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx20dep2_gen1dep16_batch128_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx20dep2_gen1dep16_batch128_eplb0_mtp0.yaml new file mode 100644 index 00000000..ead937c9 --- /dev/null +++ 
b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx20dep2_gen1dep16_batch128_eplb0_mtp0.yaml @@ -0,0 +1,141 @@ +name: "glm5_nvfp4_ISL8K_OSL1K_ctx20dep2_gen1dep16_batch128_eplb0_mtp0" + +# ctx: 20 prefill workers, TP2/EP2 +# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=128 +# concurrency: 2151 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 10 + prefill_workers: 20 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 128 + max_num_tokens: 128 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 
100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2151" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx2dep2_gen3tep8_batch32_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx2dep2_gen3tep8_batch32_eplb0_mtp0.yaml new file mode 100644 index 00000000..e06ea268 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx2dep2_gen3tep8_batch32_eplb0_mtp0.yaml @@ -0,0 +1,130 @@ +name: "glm5_nvfp4_ISL8K_OSL1K_ctx2dep2_gen3tep8_batch32_eplb0_mtp0" + +# ctx: 2 prefill workers, TP2/EP2 +# gen: 3 decode workers, TP8/EP8 (MNNVL), max_batch=32 +# concurrency: 117 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 1 + prefill_workers: 2 + gpus_per_prefill: 2 + + decode_workers: 3 + decode_nodes: 6 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + 
TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + allreduce_strategy: MNNVL + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "117" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git 
a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx4dep2_gen3tep8_batch64_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx4dep2_gen3tep8_batch64_eplb0_mtp0.yaml new file mode 100644 index 00000000..f4b3cc09 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx4dep2_gen3tep8_batch64_eplb0_mtp0.yaml @@ -0,0 +1,134 @@ +name: "glm5_nvfp4_ISL8K_OSL1K_ctx4dep2_gen3tep8_batch64_eplb0_mtp0" + +# ctx: 4 prefill workers, TP2/EP2 +# gen: 3 decode workers, TP8/EP8 (MNNVL), max_batch=64 +# concurrency: 231 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 2 + prefill_workers: 4 + gpus_per_prefill: 2 + + decode_workers: 3 + decode_nodes: 6 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + allreduce_strategy: MNNVL + tensor_parallel_size: 8 + 
moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "231" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx9dep2_gen1dep32_batch16_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx9dep2_gen1dep32_batch16_eplb0_mtp0.yaml new file mode 100644 index 00000000..75f56785 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx9dep2_gen1dep32_batch16_eplb0_mtp0.yaml @@ -0,0 +1,127 @@ +name: "glm5_nvfp4_ISL8K_OSL1K_ctx9dep2_gen1dep32_batch16_eplb0_mtp0" + +# ctx: 9 prefill workers, TP2/EP2 +# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=16 +# concurrency: 615 + +model: + path: "nvidia/GLM5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" + precision: "fp4" + +resources: + gpu_type: "gb300" + + prefill_nodes: 5 + prefill_workers: 9 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + 
gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + MIMALLOC_PURGE_DELAY: "0" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "615" + req_rate: "inf" + custom_tokenizer: "glm_moe_dsa" + use_chat_template: false + +frontend: + type: 
"dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false From b85ec1d7d269ced342f090c4bcecad797b289cbd Mon Sep 17 00:00:00 2001 From: Richard Huo Date: Mon, 27 Apr 2026 09:25:17 -0700 Subject: [PATCH 08/14] fix: add chat template to the glm5 tokenizer --- .../scripts/sa-bench/backend_request_func.py | 9 +++- .../scripts/sa-bench/benchmark_serving.py | 48 +++++++++++-------- 2 files changed, 36 insertions(+), 21 deletions(-) diff --git a/src/srtctl/benchmarks/scripts/sa-bench/backend_request_func.py b/src/srtctl/benchmarks/scripts/sa-bench/backend_request_func.py index 87f3f9ef..6590cc2a 100644 --- a/src/srtctl/benchmarks/scripts/sa-bench/backend_request_func.py +++ b/src/srtctl/benchmarks/scripts/sa-bench/backend_request_func.py @@ -603,7 +603,14 @@ def _load_glm_moe_dsa_tokenizer(pretrained_model_name_or_path: str) -> "PreTrain if "extra_special_tokens" in config: init_kwargs["additional_special_tokens"] = config["extra_special_tokens"] - return PreTrainedTokenizerFast(tokenizer_object=rust_tok, **init_kwargs) + tok = PreTrainedTokenizerFast(tokenizer_object=rust_tok, **init_kwargs) + + jinja_path = path / "chat_template.jinja" + if jinja_path.exists(): + tok.chat_template = jinja_path.read_text(encoding="utf-8") + print(f"[sa-bench] Loaded chat template from {jinja_path}", flush=True) + + return tok def get_tokenizer( diff --git a/src/srtctl/benchmarks/scripts/sa-bench/benchmark_serving.py b/src/srtctl/benchmarks/scripts/sa-bench/benchmark_serving.py index a5ea6490..a2d6251b 100644 --- a/src/srtctl/benchmarks/scripts/sa-bench/benchmark_serving.py +++ b/src/srtctl/benchmarks/scripts/sa-bench/benchmark_serving.py @@ -341,44 +341,52 @@ def sample_random_requests( tokenizer: PreTrainedTokenizerBase, use_chat_template: bool = False, ) -> list[tuple[str, int, int]]: - prefix_token_ids = np.random.randint(0, tokenizer.vocab_size, size=prefix_len).tolist() if use_chat_template: - 
chat_template_dummy = tokenizer.apply_chat_template( - [{"role": "user", "content": "a"}], - add_generation_prompt=True, - tokenize=False, - ) - tokenized_chat_template_dummy = tokenizer.encode(chat_template_dummy, add_special_tokens=False) - chat_template_len = len(tokenized_chat_template_dummy) - 1 + chat_template_len = len(tokenizer.encode( + tokenizer.apply_chat_template( + [{"role": "user", "content": "a"}], + add_generation_prompt=True, + tokenize=False, + ), add_special_tokens=False, + )) - 1 input_len = input_len - chat_template_len input_lens = np.random.randint( - int(input_len * range_ratio), + int(input_len * range_ratio) if input_len > 1 else 1, input_len + 1, size=num_prompts, ) output_lens = np.random.randint( - int(output_len * range_ratio), + int(output_len * range_ratio) if output_len > 1 else 1, output_len + 1, size=num_prompts, ) offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts) input_requests = [] - for i in range(num_prompts): - prompt = tokenizer.decode( - prefix_token_ids + [(offsets[i] + i + j) % tokenizer.vocab_size for j in range(input_lens[i])] - ) - re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[: (prefix_len + input_lens[i])] - prompt = tokenizer.decode(re_encoded_sequence) - if use_chat_template: + + if use_chat_template: + for i in range(num_prompts): + origin_text = tokenizer.decode( + [(offsets[i] + i + j) % tokenizer.vocab_size for j in range(int(input_lens[i] * 1.5))] + ) + re_encoded_sequence = tokenizer.encode(origin_text, add_special_tokens=False)[: input_lens[i]] + prompt_text = tokenizer.decode(re_encoded_sequence) prompt = tokenizer.apply_chat_template( - [{"role": "user", "content": prompt}], + [{"role": "user", "content": prompt_text}], add_generation_prompt=True, tokenize=False, ) input_lens[i] += chat_template_len - - input_requests.append((prompt, int(prefix_len + input_lens[i]), int(output_lens[i]), None)) + input_requests.append((prompt, int(input_lens[i]), 
int(output_lens[i]), None)) + else: + prefix_token_ids = np.random.randint(0, tokenizer.vocab_size, size=prefix_len).tolist() + for i in range(num_prompts): + prompt = tokenizer.decode( + prefix_token_ids + [(offsets[i] + i + j) % tokenizer.vocab_size for j in range(input_lens[i])] + ) + re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[: (prefix_len + input_lens[i])] + prompt = tokenizer.decode(re_encoded_sequence) + input_requests.append((prompt, int(prefix_len + input_lens[i]), int(output_lens[i]), None)) return input_requests From 4e698b33c6e6c29e7144eaa4067eab47ab4d000e Mon Sep 17 00:00:00 2001 From: Yeswanth koti Date: Tue, 28 Apr 2026 19:44:43 -0400 Subject: [PATCH 09/14] Add GLM5 GB200 NVFP4 Apr-09 disagg recipes. (#61) * Add GLM5 GB200 NVFP4 Apr-09 disagg recipes. Include the updated 1K/1K and 8K/1K STP and MTP TensorRT-LLM Dynamo configs so submission testing can run on the latest GB200 parameter set. * Keep only Apr-09 GB200 configs and align YAML quoting. Remove legacy GB200 trtllm_dynamo recipes inherited from the submission base branch, and normalize concurrencies/custom_tokenizer fields to double-quoted style for consistency with existing GB300 recipes. * fix: enable chat template and 16x rounds for GB200 GLM5 configs Update GB200 GLM5 trtllm_dynamo recipes to set use_chat_template=true and num_prompts_mult=16 so sa-bench runs align with current submission benchmarking methodology. 
--- ..._gen1dep32_batch16_allconc_eplb0_mtp3.yaml | 85 +++----- ...4_gen4tep8_batch32_allconc_eplb0_mtp3.yaml | 91 ++++----- .../ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml | 77 +++---- ...2dep4_gen1dep16_batch256_eplb256_mtp1.yaml | 144 ++++++------- ...tx2dep4_gen1dep32_batch32_eplb0_mtp2.yaml} | 103 ++++------ ...3dep4_gen1dep32_batch128_eplb288_mtp1.yaml | 112 +++++----- ...tx3dep4_gen1dep32_batch64_eplb0_mtp1.yaml} | 106 +++++----- ...ctx1dep4_gen1dep32_batch32_eplb0_mtp0.yaml | 87 ++++---- ...ctx1dep4_gen1dep8_batch512_eplb0_mtp0.yaml | 172 ++++++++++++++++ ..._gen4tep8_batch128_allconc_eplb0_mtp0.yaml | 115 +++++------ ...4_gen5tep4_batch4_allconc_eplb0_mtp0.yaml} | 86 +++----- ...2dep4_gen1dep16_batch512_eplb256_mtp0.yaml | 193 ------------------ ...ctx2dep4_gen1dep32_batch64_eplb0_mtp0.yaml | 95 ++++----- ...x3dep4_gen1dep32_batch128_eplb0_mtp0.yaml} | 127 +++++------- ...4dep4_gen1dep32_batch256_eplb288_mtp0.yaml | 144 ++++++------- ...tx10dep4_gen1dep16_batch64_eplb0_mtp1.yaml | 95 ++++----- .../ctx1dep4_gen2tep8_batch16_eplb0_mtp3.yaml | 84 +++----- .../ctx1dep4_gen4tep8_batch1_eplb0_mtp3.yaml} | 101 ++++----- ... ctx1dep4_gen4tep8_batch8_eplb0_mtp3.yaml} | 85 +++----- ... ctx1dep4_gen5tep4_batch2_eplb0_mtp3.yaml} | 83 +++----- .../ctx2dep4_gen3tep8_batch32_eplb0_mtp3.yaml | 119 +++++++++++ ...tx5dep4_gen1dep16_batch16_eplb0_mtp3.yaml} | 113 ++++------ .../ctx5dep4_gen1dep32_batch8_eplb0_mtp3.yaml | 81 +++----- ...ctx7dep4_gen1dep16_batch32_eplb0_mtp2.yaml | 87 ++++---- ...ctx7dep4_gen1dep8_batch128_eplb0_mtp1.yaml | 113 +++++----- ...11dep4_gen1dep16_batch128_eplb0_mtp0.yaml} | 115 +++++------ .../ctx1dep4_gen2tep8_batch32_eplb0_mtp0.yaml | 91 ++++----- .../ctx1dep4_gen4tep8_batch16_eplb0_mtp0.yaml | 111 ++++++++++ .../ctx1dep4_gen4tep8_batch1_eplb0_mtp0.yaml | 107 ++++++++++ ... ctx1dep4_gen5tep4_batch1_eplb0_mtp0.yaml} | 84 +++----- ... 
ctx1dep4_gen5tep4_batch2_eplb0_mtp0.yaml} | 92 ++++----- .../ctx1dep4_gen5tep4_batch4_eplb0_mtp0.yaml | 108 ++++++++++ .../ctx2dep4_gen3tep8_batch64_eplb0_mtp0.yaml | 117 +++++++++++ ...tx4dep4_gen1dep16_batch32_eplb0_mtp0.yaml} | 103 ++++------ ...ctx5dep4_gen1dep32_batch16_eplb0_mtp0.yaml | 83 +++----- ...ctx7dep4_gen1dep16_batch64_eplb0_mtp0.yaml | 116 +++++++++++ 36 files changed, 1996 insertions(+), 1829 deletions(-) rename recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/{ctx1dep4_gen1dep16_batch32_eplb0_mtp2.yaml => ctx2dep4_gen1dep32_batch32_eplb0_mtp2.yaml} (60%) rename recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/{ISL8K_OSL1K/MTP/ctx3dep4_gen1dep32_batch4_eplb0_mtp3.yaml => ISL1K_OSL1K/MTP/ctx3dep4_gen1dep32_batch64_eplb0_mtp1.yaml} (60%) create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch512_eplb0_mtp0.yaml rename recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/{ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml => ctx1dep4_gen5tep4_batch4_allconc_eplb0_mtp0.yaml} (61%) delete mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch512_eplb256_mtp0.yaml rename recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/{ctx2dep4_gen1dep16_batch128_eplb0_mtp0.yaml => ctx3dep4_gen1dep32_batch128_eplb0_mtp0.yaml} (54%) rename recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/{ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch16_eplb0_mtp2.yaml => ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch1_eplb0_mtp3.yaml} (59%) rename recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/{ctx1dep4_gen4tep8_batch8_allconc_eplb0_mtp3.yaml => ctx1dep4_gen4tep8_batch8_eplb0_mtp3.yaml} (67%) rename recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/{ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml => ctx1dep4_gen5tep4_batch2_eplb0_mtp3.yaml} (66%) create mode 100644 
recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx2dep4_gen3tep8_batch32_eplb0_mtp3.yaml rename recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/{ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch64_eplb0_mtp1.yaml => ISL8K_OSL1K/MTP/ctx5dep4_gen1dep16_batch16_eplb0_mtp3.yaml} (56%) rename recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/{ctx10dep4_gen1dep16_batch128_eplb0_mtp0.yaml => ctx11dep4_gen1dep16_batch128_eplb0_mtp0.yaml} (59%) create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch16_eplb0_mtp0.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_eplb0_mtp0.yaml rename recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/{ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml => ctx1dep4_gen5tep4_batch1_eplb0_mtp0.yaml} (61%) rename recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/{ctx1dep4_gen3tep4_batch32_eplb0_mtp0.yaml => ctx1dep4_gen5tep4_batch2_eplb0_mtp0.yaml} (60%) create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch4_eplb0_mtp0.yaml create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen3tep8_batch64_eplb0_mtp0.yaml rename recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/{ctx8dep4_gen1dep32_batch32_eplb0_mtp0.yaml => ctx4dep4_gen1dep16_batch32_eplb0_mtp0.yaml} (57%) create mode 100644 recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch64_eplb0_mtp0.yaml diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch16_allconc_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch16_allconc_eplb0_mtp3.yaml index 68af65ee..ea57cd6e 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch16_allconc_eplb0_mtp3.yaml +++ 
b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch16_allconc_eplb0_mtp3.yaml @@ -1,48 +1,35 @@ -name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep32_batch16_allconc_eplb0_mtp3" - -# ctx: 1 prefill worker, TP4/EP4 -# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=16 -# concurrencies: 333 (batch8), 666 (batch16) - +name: glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep32_batch16_allconc_eplb0_mtp3 model: - path: "nvidia/GLM5-NVFP4" - container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" - precision: "fp4" - + path: nvidia/GLM5-NVFP4 + container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 resources: - gpu_type: "gb200" - + gpu_type: gb200 prefill_nodes: 1 prefill_workers: 1 gpus_per_prefill: 4 - decode_workers: 1 decode_nodes: 8 gpus_per_decode: 32 - gpus_per_node: 4 - backend: type: trtllm - prefill_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' decode_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' trtllm_config: prefill: tensor_parallel_size: 4 @@ -69,7 +56,6 @@ backend: speculative_config: decoding_type: MTP num_nextn_predict_layers: 3 - decode: tensor_parallel_size: 32 moe_expert_parallel_size: 32 @@ -87,11 +73,11 @@ backend: cuda_graph_config: 
enable_padding: true batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 + - 1 + - 2 + - 4 + - 8 + - 16 moe_config: backend: CUTEDSL use_low_precision_moe_combine: true @@ -104,30 +90,27 @@ backend: max_tokens_in_buffer: 16384 nvfp4_gemm_config: allowed_backends: - - cutlass - - cublaslt - - cutedsl - - cuda_core + - cutlass + - cublaslt + - cutedsl + - cuda_core speculative_config: decoding_type: MTP num_nextn_predict_layers: 3 - benchmark: - type: "sa-bench" + type: sa-bench isl: 1024 osl: 1024 - concurrencies: "333x666" - req_rate: "inf" + concurrencies: "333x615" + req_rate: inf + num_prompts_mult: 16 + use_chat_template: true custom_tokenizer: "glm_moe_dsa" - use_chat_template: false - frontend: - type: "dynamo" + type: dynamo enable_multiple_frontends: false - health_check: max_attempts: 360 interval_seconds: 10 - dynamo: install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch32_allconc_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch32_allconc_eplb0_mtp3.yaml index da187faf..f0eb3d82 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch32_allconc_eplb0_mtp3.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch32_allconc_eplb0_mtp3.yaml @@ -1,48 +1,35 @@ -name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen4tep8_batch32_allconc_eplb0_mtp3" - -# ctx: 1 prefill worker, TP4/EP4 -# gen: 4 decode workers, TP8/EP8, enable_attention_dp=false, max_batch=32 -# concurrencies: 8 (batch1), 44 (batch8), 192 (batch32) - +name: glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen4tep8_batch32_allconc_eplb0_mtp3 model: - path: "nvidia/GLM5-NVFP4" - container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" - precision: "fp4" - + path: nvidia/GLM5-NVFP4 + container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 resources: - gpu_type: "gb200" - + gpu_type: gb200 
prefill_nodes: 1 prefill_workers: 1 gpus_per_prefill: 4 - decode_workers: 4 decode_nodes: 8 gpus_per_decode: 8 - gpus_per_node: 4 - backend: type: trtllm - prefill_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' decode_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' trtllm_config: prefill: tensor_parallel_size: 4 @@ -69,9 +56,7 @@ backend: speculative_config: decoding_type: MTP num_nextn_predict_layers: 3 - decode: - allreduce_strategy: MNNVL tensor_parallel_size: 8 moe_expert_parallel_size: 8 pipeline_parallel_size: 1 @@ -88,13 +73,13 @@ backend: cuda_graph_config: enable_padding: true batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 24 - - 32 + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 moe_config: backend: TRTLLM use_low_precision_moe_combine: true @@ -107,30 +92,28 @@ backend: max_tokens_in_buffer: 16384 nvfp4_gemm_config: allowed_backends: - - cutlass - - cublaslt - - cutedsl - - cuda_core + - cutlass + - cublaslt + - cutedsl + - cuda_core speculative_config: decoding_type: MTP num_nextn_predict_layers: 3 - + allreduce_strategy: MNNVL benchmark: - type: "sa-bench" + type: sa-bench isl: 1024 osl: 1024 - concurrencies: "8x44x192" - req_rate: "inf" + concurrencies: "24x36x96x192" + req_rate: inf + num_prompts_mult: 16 + use_chat_template: true 
custom_tokenizer: "glm_moe_dsa" - use_chat_template: false - frontend: - type: "dynamo" + type: dynamo enable_multiple_frontends: false - health_check: max_attempts: 360 interval_seconds: 10 - dynamo: install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml index a6121cd0..3f5f7d0e 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml @@ -1,48 +1,35 @@ -name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen5tep4_batch1_eplb0_mtp3" - -# ctx: 1 prefill worker, TP4/EP4 -# gen: 5 decode workers, TP4/EP4, enable_attention_dp=false, max_batch=1 -# concurrency: 10 - +name: glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen5tep4_batch1_eplb0_mtp3 model: - path: "nvidia/GLM5-NVFP4" - container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" - precision: "fp4" - + path: nvidia/GLM5-NVFP4 + container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 resources: - gpu_type: "gb200" - + gpu_type: gb200 prefill_nodes: 1 prefill_workers: 1 gpus_per_prefill: 4 - decode_workers: 5 decode_nodes: 5 gpus_per_decode: 4 - gpus_per_node: 4 - backend: type: trtllm - prefill_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' decode_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - 
TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' trtllm_config: prefill: tensor_parallel_size: 4 @@ -69,7 +56,6 @@ backend: speculative_config: decoding_type: MTP num_nextn_predict_layers: 3 - decode: tensor_parallel_size: 4 moe_expert_parallel_size: 4 @@ -87,9 +73,7 @@ backend: cuda_graph_config: enable_padding: true batch_sizes: - - 1 - - 2 - - 4 + - 1 moe_config: backend: TRTLLM use_low_precision_moe_combine: true @@ -102,30 +86,27 @@ backend: max_tokens_in_buffer: 16384 nvfp4_gemm_config: allowed_backends: - - cutlass - - cublaslt - - cutedsl - - cuda_core + - cutlass + - cublaslt + - cutedsl + - cuda_core speculative_config: decoding_type: MTP num_nextn_predict_layers: 3 - benchmark: - type: "sa-bench" + type: sa-bench isl: 1024 osl: 1024 concurrencies: "10" - req_rate: "inf" + req_rate: inf + num_prompts_mult: 16 + use_chat_template: true custom_tokenizer: "glm_moe_dsa" - use_chat_template: false - frontend: - type: "dynamo" + type: dynamo enable_multiple_frontends: false - health_check: max_attempts: 360 interval_seconds: 10 - dynamo: install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep16_batch256_eplb256_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep16_batch256_eplb256_mtp1.yaml index dc176b2d..c65ccf78 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep16_batch256_eplb256_mtp1.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep16_batch256_eplb256_mtp1.yaml @@ -1,49 +1,35 @@ -name: "glm5_nvfp4_ISL1K_OSL1K_ctx2dep4_gen1dep16_batch256_eplb256_mtp1" - -# ctx: 2 prefill workers, TP4/EP4 -# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=256 -# EPLB: 
num_slots=256 -# concurrency: 4301 - +name: glm5_nvfp4_ISL1K_OSL1K_ctx2dep4_gen1dep16_batch256_eplb256_mtp1 model: - path: "nvidia/GLM5-NVFP4" - container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" - precision: "fp4" - + path: nvidia/GLM5-NVFP4 + container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 resources: - gpu_type: "gb200" - + gpu_type: gb200 prefill_nodes: 2 prefill_workers: 2 gpus_per_prefill: 4 - decode_workers: 1 decode_nodes: 4 gpus_per_decode: 16 - gpus_per_node: 4 - backend: type: trtllm - prefill_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' decode_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' trtllm_config: prefill: tensor_parallel_size: 4 @@ -70,7 +56,6 @@ backend: speculative_config: decoding_type: MTP num_nextn_predict_layers: 1 - decode: tensor_parallel_size: 16 moe_expert_parallel_size: 16 @@ -88,41 +73,41 @@ backend: cuda_graph_config: enable_padding: true batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 24 - - 32 - - 40 - - 48 - - 56 - - 64 - - 72 - - 80 - - 88 - - 96 - - 104 - - 112 - - 120 - - 128 - - 136 - - 144 - - 152 - - 160 - - 168 - - 176 - - 184 - - 192 - - 200 - - 208 - - 216 - - 224 - - 232 - - 240 - - 248 - - 256 + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 
40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 moe_config: backend: CUTEDSL use_low_precision_moe_combine: true @@ -138,30 +123,27 @@ backend: max_tokens_in_buffer: 16384 nvfp4_gemm_config: allowed_backends: - - cutlass - - cublaslt - - cutedsl - - cuda_core + - cutlass + - cublaslt + - cutedsl + - cuda_core speculative_config: decoding_type: MTP num_nextn_predict_layers: 1 - benchmark: - type: "sa-bench" + type: sa-bench isl: 1024 osl: 1024 concurrencies: "4301" - req_rate: "inf" + req_rate: inf + num_prompts_mult: 16 + use_chat_template: true custom_tokenizer: "glm_moe_dsa" - use_chat_template: false - frontend: - type: "dynamo" + type: dynamo enable_multiple_frontends: false - health_check: max_attempts: 360 interval_seconds: 10 - dynamo: install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch32_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep32_batch32_eplb0_mtp2.yaml similarity index 60% rename from recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch32_eplb0_mtp2.yaml rename to recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep32_batch32_eplb0_mtp2.yaml index 21edc148..a2c2bbe5 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch32_eplb0_mtp2.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep32_batch32_eplb0_mtp2.yaml @@ -1,48 +1,35 @@ -name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep16_batch32_eplb0_mtp2" - -# ctx: 1 prefill worker, TP4/EP4 -# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=32 -# concurrency: 666 - +name: glm5_nvfp4_ISL1K_OSL1K_ctx2dep4_gen1dep32_batch32_eplb0_mtp2 model: - path: "nvidia/GLM5-NVFP4" - 
container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" - precision: "fp4" - + path: nvidia/GLM5-NVFP4 + container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 resources: - gpu_type: "gb200" - - prefill_nodes: 1 - prefill_workers: 1 + gpu_type: gb200 + prefill_nodes: 2 + prefill_workers: 2 gpus_per_prefill: 4 - decode_workers: 1 - decode_nodes: 4 - gpus_per_decode: 16 - + decode_nodes: 8 + gpus_per_decode: 32 gpus_per_node: 4 - backend: type: trtllm - prefill_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' decode_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' trtllm_config: prefill: tensor_parallel_size: 4 @@ -69,10 +56,9 @@ backend: speculative_config: decoding_type: MTP num_nextn_predict_layers: 2 - decode: - tensor_parallel_size: 16 - moe_expert_parallel_size: 16 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 pipeline_parallel_size: 1 enable_attention_dp: true enable_lm_head_tp_in_adp: true @@ -87,49 +73,46 @@ backend: cuda_graph_config: enable_padding: true batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 24 - - 32 + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 moe_config: backend: CUTEDSL use_low_precision_moe_combine: true kv_cache_config: dtype: fp8 enable_block_reuse: false - 
free_gpu_memory_fraction: 0.7 + free_gpu_memory_fraction: 0.6 cache_transceiver_config: backend: UCX max_tokens_in_buffer: 16384 nvfp4_gemm_config: allowed_backends: - - cutlass - - cublaslt - - cutedsl - - cuda_core + - cutlass + - cublaslt + - cutedsl + - cuda_core speculative_config: decoding_type: MTP num_nextn_predict_layers: 2 - benchmark: - type: "sa-bench" + type: sa-bench isl: 1024 osl: 1024 - concurrencies: "666" - req_rate: "inf" + concurrencies: "1229" + req_rate: inf + num_prompts_mult: 16 + use_chat_template: true custom_tokenizer: "glm_moe_dsa" - use_chat_template: false - frontend: - type: "dynamo" + type: dynamo enable_multiple_frontends: false - health_check: max_attempts: 360 interval_seconds: 10 - dynamo: install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx3dep4_gen1dep32_batch128_eplb288_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx3dep4_gen1dep32_batch128_eplb288_mtp1.yaml index a7a1c790..9a180818 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx3dep4_gen1dep32_batch128_eplb288_mtp1.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx3dep4_gen1dep32_batch128_eplb288_mtp1.yaml @@ -1,49 +1,35 @@ -name: "glm5_nvfp4_ISL1K_OSL1K_ctx3dep4_gen1dep32_batch128_eplb288_mtp1" - -# ctx: 3 prefill workers, TP4/EP4 -# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=128 -# EPLB: num_slots=288 -# concurrency: 4301 - +name: glm5_nvfp4_ISL1K_OSL1K_ctx3dep4_gen1dep32_batch128_eplb288_mtp1 model: - path: "nvidia/GLM5-NVFP4" - container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" - precision: "fp4" - + path: nvidia/GLM5-NVFP4 + container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 resources: - gpu_type: "gb200" - + gpu_type: gb200 prefill_nodes: 3 prefill_workers: 3 gpus_per_prefill: 4 - decode_workers: 1 decode_nodes: 8 gpus_per_decode: 32 - gpus_per_node: 4 - backend: 
type: trtllm - prefill_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' decode_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' trtllm_config: prefill: tensor_parallel_size: 4 @@ -70,7 +56,6 @@ backend: speculative_config: decoding_type: MTP num_nextn_predict_layers: 1 - decode: tensor_parallel_size: 32 moe_expert_parallel_size: 32 @@ -88,25 +73,25 @@ backend: cuda_graph_config: enable_padding: true batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 24 - - 32 - - 40 - - 48 - - 56 - - 64 - - 72 - - 80 - - 88 - - 96 - - 104 - - 112 - - 120 - - 128 + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 moe_config: backend: CUTEDSL use_low_precision_moe_combine: true @@ -122,30 +107,27 @@ backend: max_tokens_in_buffer: 16384 nvfp4_gemm_config: allowed_backends: - - cutlass - - cublaslt - - cutedsl - - cuda_core + - cutlass + - cublaslt + - cutedsl + - cuda_core speculative_config: decoding_type: MTP num_nextn_predict_layers: 1 - benchmark: - type: "sa-bench" + type: sa-bench isl: 1024 osl: 1024 concurrencies: "4301" - req_rate: "inf" + req_rate: inf + num_prompts_mult: 16 + use_chat_template: true custom_tokenizer: "glm_moe_dsa" - use_chat_template: false - frontend: - type: 
"dynamo" + type: dynamo enable_multiple_frontends: false - health_check: max_attempts: 360 interval_seconds: 10 - dynamo: install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx3dep4_gen1dep32_batch4_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx3dep4_gen1dep32_batch64_eplb0_mtp1.yaml similarity index 60% rename from recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx3dep4_gen1dep32_batch4_eplb0_mtp3.yaml rename to recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx3dep4_gen1dep32_batch64_eplb0_mtp1.yaml index 440a4f73..05cf4ff8 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx3dep4_gen1dep32_batch4_eplb0_mtp3.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx3dep4_gen1dep32_batch64_eplb0_mtp1.yaml @@ -1,47 +1,35 @@ -name: "glm5_nvfp4_ISL8K_OSL1K_ctx3dep4_gen1dep32_batch4_eplb0_mtp3" - -# ctx: 3 prefill workers, TP4/EP4 -# gen: 1 decode worker, TP32/EP32, max_batch=4, concurrency: 167 - +name: glm5_nvfp4_ISL1K_OSL1K_ctx3dep4_gen1dep32_batch64_eplb0_mtp1 model: - path: "nvidia/GLM5-NVFP4" - container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" - precision: "fp4" - + path: nvidia/GLM5-NVFP4 + container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 resources: - gpu_type: "gb200" - + gpu_type: gb200 prefill_nodes: 3 prefill_workers: 3 gpus_per_prefill: 4 - decode_workers: 1 decode_nodes: 8 gpus_per_decode: 32 - gpus_per_node: 4 - backend: type: trtllm - prefill_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' 
decode_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' trtllm_config: prefill: tensor_parallel_size: 4 @@ -51,9 +39,9 @@ backend: disable_overlap_scheduler: true trust_remote_code: true custom_tokenizer: "glm_moe_dsa" - max_batch_size: 2 - max_num_tokens: 16640 - max_seq_len: 8232 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 print_iter_log: true cuda_graph_config: null moe_config: @@ -67,8 +55,7 @@ backend: max_tokens_in_buffer: 16384 speculative_config: decoding_type: MTP - num_nextn_predict_layers: 3 - + num_nextn_predict_layers: 1 decode: tensor_parallel_size: 32 moe_expert_parallel_size: 32 @@ -77,18 +64,26 @@ backend: enable_lm_head_tp_in_adp: true trust_remote_code: true custom_tokenizer: "glm_moe_dsa" - max_batch_size: 4 - max_num_tokens: 16 - max_seq_len: 9256 + max_batch_size: 64 + max_num_tokens: 128 + max_seq_len: 2088 print_iter_log: true stream_interval: 100 num_postprocess_workers: 4 cuda_graph_config: enable_padding: true batch_sizes: - - 1 - - 2 - - 4 + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 moe_config: backend: CUTEDSL use_low_precision_moe_combine: true @@ -101,30 +96,27 @@ backend: max_tokens_in_buffer: 16384 nvfp4_gemm_config: allowed_backends: - - cutlass - - cublaslt - - cutedsl - - cuda_core + - cutlass + - cublaslt + - cutedsl + - cuda_core speculative_config: decoding_type: MTP - num_nextn_predict_layers: 3 - + num_nextn_predict_layers: 1 benchmark: - type: "sa-bench" - isl: 8192 + type: sa-bench + isl: 1024 osl: 1024 - concurrencies: "167" - req_rate: "inf" + concurrencies: "2151" + req_rate: inf + num_prompts_mult: 16 + 
use_chat_template: true custom_tokenizer: "glm_moe_dsa" - use_chat_template: false - frontend: - type: "dynamo" + type: dynamo enable_multiple_frontends: false - health_check: max_attempts: 360 interval_seconds: 10 - dynamo: install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch32_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch32_eplb0_mtp0.yaml index 7412a109..27dc86c3 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch32_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch32_eplb0_mtp0.yaml @@ -1,48 +1,35 @@ -name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep32_batch32_eplb0_mtp0" - -# ctx: 1 prefill worker, TP4/EP4 -# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=32 -# concurrency: 1229 - +name: glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep32_batch32_eplb0_mtp0 model: - path: "nvidia/GLM5-NVFP4" - container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" - precision: "fp4" - + path: nvidia/GLM5-NVFP4 + container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 resources: - gpu_type: "gb200" - + gpu_type: gb200 prefill_nodes: 1 prefill_workers: 1 gpus_per_prefill: 4 - decode_workers: 1 decode_nodes: 8 gpus_per_decode: 32 - gpus_per_node: 4 - backend: type: trtllm - prefill_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' decode_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - 
TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' trtllm_config: prefill: tensor_parallel_size: 4 @@ -66,7 +53,6 @@ backend: cache_transceiver_config: backend: UCX max_tokens_in_buffer: 16384 - decode: tensor_parallel_size: 32 moe_expert_parallel_size: 32 @@ -84,13 +70,13 @@ backend: cuda_graph_config: enable_padding: true batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 24 - - 32 + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 moe_config: backend: CUTEDSL use_low_precision_moe_combine: true @@ -103,27 +89,24 @@ backend: max_tokens_in_buffer: 16384 nvfp4_gemm_config: allowed_backends: - - cutlass - - cublaslt - - cutedsl - - cuda_core - + - cutlass + - cublaslt + - cutedsl + - cuda_core benchmark: - type: "sa-bench" + type: sa-bench isl: 1024 osl: 1024 concurrencies: "1229" - req_rate: "inf" + req_rate: inf + num_prompts_mult: 16 + use_chat_template: true custom_tokenizer: "glm_moe_dsa" - use_chat_template: false - frontend: - type: "dynamo" + type: dynamo enable_multiple_frontends: false - health_check: max_attempts: 360 interval_seconds: 10 - dynamo: install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch512_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch512_eplb0_mtp0.yaml new file mode 100644 index 00000000..ddf38c05 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch512_eplb0_mtp0.yaml @@ -0,0 +1,172 @@ +name: glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep8_batch512_eplb0_mtp0 +model: + path: nvidia/GLM5-NVFP4 + container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 +resources: + gpu_type: gb200 + prefill_nodes: 1 + 
prefill_workers: 1 + gpus_per_prefill: 4 + decode_workers: 1 + decode_nodes: 2 + gpus_per_decode: 8 + gpus_per_node: 4 +backend: + type: trtllm + prefill_environment: + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + decode_environment: + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 512 + max_num_tokens: 512 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + - 264 + - 272 + - 280 + - 288 + - 296 + - 304 + - 312 + - 320 + - 328 + - 336 + - 344 + - 352 + - 360 + - 368 + - 376 + - 384 + - 392 + - 400 + - 408 + - 416 + - 424 + - 432 + - 440 + - 448 
+ - 456 + - 464 + - 472 + - 480 + - 488 + - 496 + - 504 + - 512 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: "4301" + req_rate: inf + num_prompts_mult: 16 + use_chat_template: true + custom_tokenizer: "glm_moe_dsa" +frontend: + type: dynamo + enable_multiple_frontends: false +health_check: + max_attempts: 360 + interval_seconds: 10 +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml index e969c07d..7d26a743 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml @@ -1,48 +1,35 @@ -name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0" - -# ctx: 1 prefill worker, TP4/EP4 -# gen: 4 decode workers, TP8/EP8, enable_attention_dp=false, max_batch=128 -# Merged concurrencies: batch1(4), batch32(180), batch64(360), batch128(616) - +name: glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0 model: - path: "nvidia/GLM5-NVFP4" - container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" - precision: "fp4" - + path: nvidia/GLM5-NVFP4 + container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 resources: - gpu_type: "gb200" - + gpu_type: gb200 prefill_nodes: 1 prefill_workers: 1 gpus_per_prefill: 4 - decode_workers: 4 decode_nodes: 8 gpus_per_decode: 8 - 
gpus_per_node: 4 - backend: type: trtllm - prefill_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' decode_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' trtllm_config: prefill: tensor_parallel_size: 4 @@ -66,9 +53,7 @@ backend: cache_transceiver_config: backend: UCX max_tokens_in_buffer: 16384 - decode: - allreduce_strategy: MNNVL tensor_parallel_size: 8 moe_expert_parallel_size: 8 pipeline_parallel_size: 1 @@ -85,25 +70,25 @@ backend: cuda_graph_config: enable_padding: true batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 24 - - 32 - - 40 - - 48 - - 56 - - 64 - - 72 - - 80 - - 88 - - 96 - - 104 - - 112 - - 120 - - 128 + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 moe_config: backend: TRTLLM use_low_precision_moe_combine: true @@ -116,27 +101,25 @@ backend: max_tokens_in_buffer: 16384 nvfp4_gemm_config: allowed_backends: - - cutlass - - cublaslt - - cutedsl - - cuda_core - + - cutlass + - cublaslt + - cutedsl + - cuda_core + allreduce_strategy: MNNVL benchmark: - type: "sa-bench" + type: sa-bench isl: 1024 osl: 1024 - concurrencies: "4x180x360x616" - req_rate: "inf" + concurrencies: "84x168x336x616" + req_rate: inf + num_prompts_mult: 16 + use_chat_template: 
true custom_tokenizer: "glm_moe_dsa" - use_chat_template: false - frontend: - type: "dynamo" + type: dynamo enable_multiple_frontends: false - health_check: max_attempts: 360 interval_seconds: 10 - dynamo: install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch4_allconc_eplb0_mtp0.yaml similarity index 61% rename from recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml rename to recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch4_allconc_eplb0_mtp0.yaml index fb583747..0a19b8b4 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch4_allconc_eplb0_mtp0.yaml @@ -1,48 +1,35 @@ -name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0" - -# ctx: 1 prefill worker, TP4/EP4 -# gen: 5 decode workers, TP4/EP4, enable_attention_dp=false, max_batch=8 -# Merged concurrencies: batch1(5), batch2(15), batch4(30), batch8(50) - +name: glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen5tep4_batch4_allconc_eplb0_mtp0 model: - path: "nvidia/GLM5-NVFP4" - container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" - precision: "fp4" - + path: nvidia/GLM5-NVFP4 + container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 resources: - gpu_type: "gb200" - + gpu_type: gb200 prefill_nodes: 1 prefill_workers: 1 gpus_per_prefill: 4 - decode_workers: 5 decode_nodes: 5 gpus_per_decode: 4 - gpus_per_node: 4 - backend: type: trtllm - prefill_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - 
TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' decode_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' trtllm_config: prefill: tensor_parallel_size: 4 @@ -66,7 +53,6 @@ backend: cache_transceiver_config: backend: UCX max_tokens_in_buffer: 16384 - decode: tensor_parallel_size: 4 moe_expert_parallel_size: 4 @@ -75,8 +61,8 @@ backend: enable_lm_head_tp_in_adp: false trust_remote_code: true custom_tokenizer: "glm_moe_dsa" - max_batch_size: 8 - max_num_tokens: 8 + max_batch_size: 4 + max_num_tokens: 4 max_seq_len: 2088 print_iter_log: true stream_interval: 100 @@ -84,10 +70,9 @@ backend: cuda_graph_config: enable_padding: true batch_sizes: - - 1 - - 2 - - 4 - - 8 + - 1 + - 2 + - 4 moe_config: backend: TRTLLM use_low_precision_moe_combine: true @@ -100,27 +85,24 @@ backend: max_tokens_in_buffer: 16384 nvfp4_gemm_config: allowed_backends: - - cutlass - - cublaslt - - cutedsl - - cuda_core - + - cutlass + - cublaslt + - cutedsl + - cuda_core benchmark: - type: "sa-bench" + type: sa-bench isl: 1024 osl: 1024 - concurrencies: "5x15x30x50" - req_rate: "inf" + concurrencies: "5x15x25" + req_rate: inf + num_prompts_mult: 16 + use_chat_template: true custom_tokenizer: "glm_moe_dsa" - use_chat_template: false - frontend: - type: "dynamo" + type: dynamo enable_multiple_frontends: false - health_check: max_attempts: 360 interval_seconds: 10 - dynamo: install: false diff --git 
a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch512_eplb256_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch512_eplb256_mtp0.yaml deleted file mode 100644 index d221dde2..00000000 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch512_eplb256_mtp0.yaml +++ /dev/null @@ -1,193 +0,0 @@ -name: "glm5_nvfp4_ISL1K_OSL1K_ctx2dep4_gen1dep16_batch512_eplb256_mtp0" - -# ctx: 2 prefill workers, TP4/EP4 -# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=512 -# EPLB: num_slots=256 -# concurrency: 8192 - -model: - path: "nvidia/GLM5-NVFP4" - container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" - precision: "fp4" - -resources: - gpu_type: "gb200" - - prefill_nodes: 2 - prefill_workers: 2 - gpus_per_prefill: 4 - - decode_workers: 1 - decode_nodes: 4 - gpus_per_decode: 16 - - gpus_per_node: 4 - -backend: - type: trtllm - - prefill_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - - decode_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - - trtllm_config: - prefill: - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - pipeline_parallel_size: 1 - enable_attention_dp: true - disable_overlap_scheduler: true - trust_remote_code: true - custom_tokenizer: "glm_moe_dsa" - max_batch_size: 16 - max_num_tokens: 16384 - max_seq_len: 1064 - print_iter_log: true - cuda_graph_config: null - moe_config: - backend: CUTEDSL - kv_cache_config: - dtype: fp8 - enable_block_reuse: false - free_gpu_memory_fraction: 0.6 - cache_transceiver_config: - backend: UCX - max_tokens_in_buffer: 16384 - - decode: - 
tensor_parallel_size: 16 - moe_expert_parallel_size: 16 - pipeline_parallel_size: 1 - enable_attention_dp: true - enable_lm_head_tp_in_adp: false - trust_remote_code: true - custom_tokenizer: "glm_moe_dsa" - max_batch_size: 512 - max_num_tokens: 512 - max_seq_len: 2088 - print_iter_log: true - stream_interval: 100 - num_postprocess_workers: 4 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 24 - - 32 - - 40 - - 48 - - 56 - - 64 - - 72 - - 80 - - 88 - - 96 - - 104 - - 112 - - 120 - - 128 - - 136 - - 144 - - 152 - - 160 - - 168 - - 176 - - 184 - - 192 - - 200 - - 208 - - 216 - - 224 - - 232 - - 240 - - 248 - - 256 - - 264 - - 272 - - 280 - - 288 - - 296 - - 304 - - 312 - - 320 - - 328 - - 336 - - 344 - - 352 - - 360 - - 368 - - 376 - - 384 - - 392 - - 400 - - 408 - - 416 - - 424 - - 432 - - 440 - - 448 - - 456 - - 464 - - 472 - - 480 - - 488 - - 496 - - 504 - - 512 - moe_config: - backend: CUTEDSL - use_low_precision_moe_combine: true - load_balancer: - layer_updates_per_iter: 1 - num_slots: 256 - kv_cache_config: - dtype: fp8 - enable_block_reuse: false - free_gpu_memory_fraction: 0.75 - cache_transceiver_config: - backend: UCX - max_tokens_in_buffer: 16384 - nvfp4_gemm_config: - allowed_backends: - - cutlass - - cublaslt - - cutedsl - - cuda_core - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "8192" - req_rate: "inf" - custom_tokenizer: "glm_moe_dsa" - use_chat_template: false - -frontend: - type: "dynamo" - enable_multiple_frontends: false - -health_check: - max_attempts: 360 - interval_seconds: 10 - -dynamo: - install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch64_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch64_eplb0_mtp0.yaml index bbad79c1..cfecb846 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch64_eplb0_mtp0.yaml +++ 
b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch64_eplb0_mtp0.yaml @@ -1,48 +1,35 @@ -name: "glm5_nvfp4_ISL1K_OSL1K_ctx2dep4_gen1dep32_batch64_eplb0_mtp0" - -# ctx: 2 prefill workers, TP4/EP4 -# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=64 -# concurrency: 2253 - +name: glm5_nvfp4_ISL1K_OSL1K_ctx2dep4_gen1dep32_batch64_eplb0_mtp0 model: - path: "nvidia/GLM5-NVFP4" - container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" - precision: "fp4" - + path: nvidia/GLM5-NVFP4 + container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 resources: - gpu_type: "gb200" - + gpu_type: gb200 prefill_nodes: 2 prefill_workers: 2 gpus_per_prefill: 4 - decode_workers: 1 decode_nodes: 8 gpus_per_decode: 32 - gpus_per_node: 4 - backend: type: trtllm - prefill_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' decode_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' trtllm_config: prefill: tensor_parallel_size: 4 @@ -66,7 +53,6 @@ backend: cache_transceiver_config: backend: UCX max_tokens_in_buffer: 16384 - decode: tensor_parallel_size: 32 moe_expert_parallel_size: 32 @@ -84,17 +70,17 @@ backend: cuda_graph_config: enable_padding: true batch_sizes: - - 1 - - 2 - - 4 - 
- 8 - - 16 - - 24 - - 32 - - 40 - - 48 - - 56 - - 64 + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 moe_config: backend: CUTEDSL use_low_precision_moe_combine: true @@ -107,27 +93,24 @@ backend: max_tokens_in_buffer: 16384 nvfp4_gemm_config: allowed_backends: - - cutlass - - cublaslt - - cutedsl - - cuda_core - + - cutlass + - cublaslt + - cutedsl + - cuda_core benchmark: - type: "sa-bench" + type: sa-bench isl: 1024 osl: 1024 concurrencies: "2253" - req_rate: "inf" + req_rate: inf + num_prompts_mult: 16 + use_chat_template: true custom_tokenizer: "glm_moe_dsa" - use_chat_template: false - frontend: - type: "dynamo" + type: dynamo enable_multiple_frontends: false - health_check: max_attempts: 360 interval_seconds: 10 - dynamo: install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch128_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx3dep4_gen1dep32_batch128_eplb0_mtp0.yaml similarity index 54% rename from recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch128_eplb0_mtp0.yaml rename to recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx3dep4_gen1dep32_batch128_eplb0_mtp0.yaml index e057ce05..7430fdb3 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch128_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx3dep4_gen1dep32_batch128_eplb0_mtp0.yaml @@ -1,48 +1,35 @@ -name: "glm5_nvfp4_ISL1K_OSL1K_ctx2dep4_gen1dep16_batch128_eplb0_mtp0" - -# ctx: 2 prefill workers, TP4/EP4 -# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=128 -# concurrency: 2253 - +name: glm5_nvfp4_ISL1K_OSL1K_ctx3dep4_gen1dep32_batch128_eplb0_mtp0 model: - path: "nvidia/GLM5-NVFP4" - container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" - precision: "fp4" - + path: nvidia/GLM5-NVFP4 + container: 
nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 resources: - gpu_type: "gb200" - - prefill_nodes: 2 - prefill_workers: 2 + gpu_type: gb200 + prefill_nodes: 3 + prefill_workers: 3 gpus_per_prefill: 4 - decode_workers: 1 - decode_nodes: 4 - gpus_per_decode: 16 - + decode_nodes: 8 + gpus_per_decode: 32 gpus_per_node: 4 - backend: type: trtllm - prefill_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' decode_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' trtllm_config: prefill: tensor_parallel_size: 4 @@ -66,10 +53,9 @@ backend: cache_transceiver_config: backend: UCX max_tokens_in_buffer: 16384 - decode: - tensor_parallel_size: 16 - moe_expert_parallel_size: 16 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 pipeline_parallel_size: 1 enable_attention_dp: true enable_lm_head_tp_in_adp: false @@ -84,58 +70,55 @@ backend: cuda_graph_config: enable_padding: true batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 24 - - 32 - - 40 - - 48 - - 56 - - 64 - - 72 - - 80 - - 88 - - 96 - - 104 - - 112 - - 120 - - 128 + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 moe_config: backend: CUTEDSL use_low_precision_moe_combine: true kv_cache_config: 
dtype: fp8 enable_block_reuse: false - free_gpu_memory_fraction: 0.75 + free_gpu_memory_fraction: 0.7 cache_transceiver_config: backend: UCX max_tokens_in_buffer: 16384 nvfp4_gemm_config: allowed_backends: - - cutlass - - cublaslt - - cutedsl - - cuda_core - + - cutlass + - cublaslt + - cutedsl + - cuda_core benchmark: - type: "sa-bench" + type: sa-bench isl: 1024 osl: 1024 - concurrencies: "2253" - req_rate: "inf" + concurrencies: "4301" + req_rate: inf + num_prompts_mult: 16 + use_chat_template: true custom_tokenizer: "glm_moe_dsa" - use_chat_template: false - frontend: - type: "dynamo" + type: dynamo enable_multiple_frontends: false - health_check: max_attempts: 360 interval_seconds: 10 - dynamo: install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx4dep4_gen1dep32_batch256_eplb288_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx4dep4_gen1dep32_batch256_eplb288_mtp0.yaml index 26d2d29e..4d136e16 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx4dep4_gen1dep32_batch256_eplb288_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx4dep4_gen1dep32_batch256_eplb288_mtp0.yaml @@ -1,49 +1,35 @@ -name: "glm5_nvfp4_ISL1K_OSL1K_ctx4dep4_gen1dep32_batch256_eplb288_mtp0" - -# ctx: 4 prefill workers, TP4/EP4 -# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=256 -# EPLB: num_slots=288 -# concurrency: 8192 - +name: glm5_nvfp4_ISL1K_OSL1K_ctx4dep4_gen1dep32_batch256_eplb288_mtp0 model: - path: "nvidia/GLM5-NVFP4" - container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" - precision: "fp4" - + path: nvidia/GLM5-NVFP4 + container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 resources: - gpu_type: "gb200" - + gpu_type: gb200 prefill_nodes: 4 prefill_workers: 4 gpus_per_prefill: 4 - decode_workers: 1 decode_nodes: 8 gpus_per_decode: 32 - gpus_per_node: 4 - backend: type: trtllm - 
prefill_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' decode_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' trtllm_config: prefill: tensor_parallel_size: 4 @@ -67,7 +53,6 @@ backend: cache_transceiver_config: backend: UCX max_tokens_in_buffer: 16384 - decode: tensor_parallel_size: 32 moe_expert_parallel_size: 32 @@ -85,41 +70,41 @@ backend: cuda_graph_config: enable_padding: true batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 24 - - 32 - - 40 - - 48 - - 56 - - 64 - - 72 - - 80 - - 88 - - 96 - - 104 - - 112 - - 120 - - 128 - - 136 - - 144 - - 152 - - 160 - - 168 - - 176 - - 184 - - 192 - - 200 - - 208 - - 216 - - 224 - - 232 - - 240 - - 248 - - 256 + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 moe_config: backend: CUTEDSL use_low_precision_moe_combine: true @@ -135,27 +120,24 @@ backend: max_tokens_in_buffer: 16384 nvfp4_gemm_config: allowed_backends: - - cutlass - - cublaslt - - cutedsl - - cuda_core - + - cutlass + - cublaslt + - cutedsl + - cuda_core benchmark: - type: "sa-bench" + type: sa-bench isl: 1024 osl: 1024 
concurrencies: "8192" - req_rate: "inf" + req_rate: inf + num_prompts_mult: 16 + use_chat_template: true custom_tokenizer: "glm_moe_dsa" - use_chat_template: false - frontend: - type: "dynamo" + type: dynamo enable_multiple_frontends: false - health_check: max_attempts: 360 interval_seconds: 10 - dynamo: install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx10dep4_gen1dep16_batch64_eplb0_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx10dep4_gen1dep16_batch64_eplb0_mtp1.yaml index 420192c2..55ccb8ce 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx10dep4_gen1dep16_batch64_eplb0_mtp1.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx10dep4_gen1dep16_batch64_eplb0_mtp1.yaml @@ -1,48 +1,35 @@ -name: "glm5_nvfp4_ISL8K_OSL1K_ctx10dep4_gen1dep16_batch64_eplb0_mtp1" - -# ctx: 10 prefill workers, TP4/EP4 -# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=64 -# concurrency: 1229 - +name: glm5_nvfp4_ISL8K_OSL1K_ctx10dep4_gen1dep16_batch64_eplb0_mtp1 model: - path: "nvidia/GLM5-NVFP4" - container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" - precision: "fp4" - + path: nvidia/GLM5-NVFP4 + container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 resources: - gpu_type: "gb200" - + gpu_type: gb200 prefill_nodes: 10 prefill_workers: 10 gpus_per_prefill: 4 - decode_workers: 1 decode_nodes: 4 gpus_per_decode: 16 - gpus_per_node: 4 - backend: type: trtllm - prefill_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' decode_environment: - 
ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' trtllm_config: prefill: tensor_parallel_size: 4 @@ -69,7 +56,6 @@ backend: speculative_config: decoding_type: MTP num_nextn_predict_layers: 1 - decode: tensor_parallel_size: 16 moe_expert_parallel_size: 16 @@ -87,17 +73,17 @@ backend: cuda_graph_config: enable_padding: true batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 24 - - 32 - - 40 - - 48 - - 56 - - 64 + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 moe_config: backend: CUTEDSL use_low_precision_moe_combine: true @@ -110,30 +96,27 @@ backend: max_tokens_in_buffer: 16384 nvfp4_gemm_config: allowed_backends: - - cutlass - - cublaslt - - cutedsl - - cuda_core + - cutlass + - cublaslt + - cutedsl + - cuda_core speculative_config: decoding_type: MTP num_nextn_predict_layers: 1 - benchmark: - type: "sa-bench" + type: sa-bench isl: 8192 osl: 1024 concurrencies: "1229" - req_rate: "inf" + req_rate: inf + num_prompts_mult: 16 + use_chat_template: true custom_tokenizer: "glm_moe_dsa" - use_chat_template: false - frontend: - type: "dynamo" + type: dynamo enable_multiple_frontends: false - health_check: max_attempts: 360 interval_seconds: 10 - dynamo: install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen2tep8_batch16_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen2tep8_batch16_eplb0_mtp3.yaml index da3186e5..e585b7d7 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen2tep8_batch16_eplb0_mtp3.yaml +++ 
b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen2tep8_batch16_eplb0_mtp3.yaml @@ -1,47 +1,35 @@ -name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen2tep8_batch16_eplb0_mtp3" - -# ctx: 1 prefill worker, TP4/EP4 -# gen: 2 decode workers, TP8/EP8, max_batch=16, concurrency: 46 - +name: glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen2tep8_batch16_eplb0_mtp3 model: - path: "nvidia/GLM5-NVFP4" - container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" - precision: "fp4" - + path: nvidia/GLM5-NVFP4 + container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 resources: - gpu_type: "gb200" - + gpu_type: gb200 prefill_nodes: 1 prefill_workers: 1 gpus_per_prefill: 4 - decode_workers: 2 decode_nodes: 4 gpus_per_decode: 8 - gpus_per_node: 4 - backend: type: trtllm - prefill_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' decode_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' trtllm_config: prefill: tensor_parallel_size: 4 @@ -68,9 +56,7 @@ backend: speculative_config: decoding_type: MTP num_nextn_predict_layers: 3 - decode: - allreduce_strategy: MNNVL tensor_parallel_size: 8 moe_expert_parallel_size: 8 pipeline_parallel_size: 1 @@ -87,11 +73,11 @@ backend: cuda_graph_config: enable_padding: true batch_sizes: - - 
1 - - 2 - - 4 - - 8 - - 16 + - 1 + - 2 + - 4 + - 8 + - 16 moe_config: backend: TRTLLM use_low_precision_moe_combine: true @@ -104,30 +90,28 @@ backend: max_tokens_in_buffer: 16384 nvfp4_gemm_config: allowed_backends: - - cutlass - - cublaslt - - cutedsl - - cuda_core + - cutlass + - cublaslt + - cutedsl + - cuda_core speculative_config: decoding_type: MTP num_nextn_predict_layers: 3 - + allreduce_strategy: MNNVL benchmark: - type: "sa-bench" + type: sa-bench isl: 8192 osl: 1024 concurrencies: "46" - req_rate: "inf" + req_rate: inf + num_prompts_mult: 16 + use_chat_template: true custom_tokenizer: "glm_moe_dsa" - use_chat_template: false - frontend: - type: "dynamo" + type: dynamo enable_multiple_frontends: false - health_check: max_attempts: 360 interval_seconds: 10 - dynamo: install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch16_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch1_eplb0_mtp3.yaml similarity index 59% rename from recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch16_eplb0_mtp2.yaml rename to recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch1_eplb0_mtp3.yaml index d6d3dcf1..9d93a18c 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch16_eplb0_mtp2.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch1_eplb0_mtp3.yaml @@ -1,48 +1,35 @@ -name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen4tep8_batch16_eplb0_mtp2" - -# ctx: 1 prefill worker, TP4/EP4 -# gen: 4 decode workers, TP8/EP8, enable_attention_dp=false, max_batch=16 -# concurrency: 96 - +name: glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen4tep8_batch1_eplb0_mtp3 model: - path: "nvidia/GLM5-NVFP4" - container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" - precision: "fp4" - + path: nvidia/GLM5-NVFP4 + container: 
nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 resources: - gpu_type: "gb200" - + gpu_type: gb200 prefill_nodes: 1 prefill_workers: 1 gpus_per_prefill: 4 - decode_workers: 4 decode_nodes: 8 gpus_per_decode: 8 - gpus_per_node: 4 - backend: type: trtllm - prefill_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' decode_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' trtllm_config: prefill: tensor_parallel_size: 4 @@ -52,9 +39,9 @@ backend: disable_overlap_scheduler: true trust_remote_code: true custom_tokenizer: "glm_moe_dsa" - max_batch_size: 16 - max_num_tokens: 16384 - max_seq_len: 1064 + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 print_iter_log: true cuda_graph_config: null moe_config: @@ -68,10 +55,8 @@ backend: max_tokens_in_buffer: 16384 speculative_config: decoding_type: MTP - num_nextn_predict_layers: 2 - + num_nextn_predict_layers: 3 decode: - allreduce_strategy: MNNVL tensor_parallel_size: 8 moe_expert_parallel_size: 8 pipeline_parallel_size: 1 @@ -79,20 +64,16 @@ backend: enable_lm_head_tp_in_adp: false trust_remote_code: true custom_tokenizer: "glm_moe_dsa" - max_batch_size: 16 - max_num_tokens: 48 - max_seq_len: 2088 + max_batch_size: 1 + max_num_tokens: 4 + max_seq_len: 9256 
print_iter_log: true stream_interval: 100 num_postprocess_workers: 4 cuda_graph_config: enable_padding: true batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 + - 1 moe_config: backend: TRTLLM use_low_precision_moe_combine: true @@ -105,30 +86,28 @@ backend: max_tokens_in_buffer: 16384 nvfp4_gemm_config: allowed_backends: - - cutlass - - cublaslt - - cutedsl - - cuda_core + - cutlass + - cublaslt + - cutedsl + - cuda_core speculative_config: decoding_type: MTP - num_nextn_predict_layers: 2 - + num_nextn_predict_layers: 3 + allreduce_strategy: MNNVL benchmark: - type: "sa-bench" - isl: 1024 + type: sa-bench + isl: 8192 osl: 1024 - concurrencies: "96" - req_rate: "inf" + concurrencies: "8" + req_rate: inf + num_prompts_mult: 16 + use_chat_template: true custom_tokenizer: "glm_moe_dsa" - use_chat_template: false - frontend: - type: "dynamo" + type: dynamo enable_multiple_frontends: false - health_check: max_attempts: 360 interval_seconds: 10 - dynamo: install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch8_allconc_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch8_eplb0_mtp3.yaml similarity index 67% rename from recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch8_allconc_eplb0_mtp3.yaml rename to recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch8_eplb0_mtp3.yaml index fb94a549..7326e4bd 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch8_allconc_eplb0_mtp3.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch8_eplb0_mtp3.yaml @@ -1,48 +1,35 @@ -name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen4tep8_batch8_allconc_eplb0_mtp3" - -# ctx: 1 prefill worker, TP4/EP4 -# gen: 4 decode workers, TP8/EP8, max_batch=8 -# concurrencies: 4 (batch1), 48 (batch8) - +name: 
glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen4tep8_batch8_eplb0_mtp3 model: - path: "nvidia/GLM5-NVFP4" - container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" - precision: "fp4" - + path: nvidia/GLM5-NVFP4 + container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 resources: - gpu_type: "gb200" - + gpu_type: gb200 prefill_nodes: 1 prefill_workers: 1 gpus_per_prefill: 4 - decode_workers: 4 decode_nodes: 8 gpus_per_decode: 8 - gpus_per_node: 4 - backend: type: trtllm - prefill_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' decode_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' trtllm_config: prefill: tensor_parallel_size: 4 @@ -69,9 +56,7 @@ backend: speculative_config: decoding_type: MTP num_nextn_predict_layers: 3 - decode: - allreduce_strategy: MNNVL tensor_parallel_size: 8 moe_expert_parallel_size: 8 pipeline_parallel_size: 1 @@ -88,10 +73,10 @@ backend: cuda_graph_config: enable_padding: true batch_sizes: - - 1 - - 2 - - 4 - - 8 + - 1 + - 2 + - 4 + - 8 moe_config: backend: TRTLLM use_low_precision_moe_combine: true @@ -104,30 +89,28 @@ backend: max_tokens_in_buffer: 16384 nvfp4_gemm_config: allowed_backends: - - cutlass - - cublaslt - - cutedsl - - cuda_core + - cutlass + - cublaslt + - cutedsl + - 
cuda_core speculative_config: decoding_type: MTP num_nextn_predict_layers: 3 - + allreduce_strategy: MNNVL benchmark: - type: "sa-bench" + type: sa-bench isl: 8192 osl: 1024 - concurrencies: "4x48" - req_rate: "inf" + concurrencies: "48" + req_rate: inf + num_prompts_mult: 16 + use_chat_template: true custom_tokenizer: "glm_moe_dsa" - use_chat_template: false - frontend: - type: "dynamo" + type: dynamo enable_multiple_frontends: false - health_check: max_attempts: 360 interval_seconds: 10 - dynamo: install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch2_eplb0_mtp3.yaml similarity index 66% rename from recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml rename to recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch2_eplb0_mtp3.yaml index 0a13cce4..8d33decc 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch1_eplb0_mtp3.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch2_eplb0_mtp3.yaml @@ -1,47 +1,35 @@ -name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen5tep4_batch1_eplb0_mtp3" - -# ctx: 1 prefill worker, TP4/EP4 -# gen: 5 decode workers, TP4/EP4, max_batch=1, concurrency: 5 - +name: glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen5tep4_batch2_eplb0_mtp3 model: - path: "nvidia/GLM5-NVFP4" - container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" - precision: "fp4" - + path: nvidia/GLM5-NVFP4 + container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 resources: - gpu_type: "gb200" - + gpu_type: gb200 prefill_nodes: 1 prefill_workers: 1 gpus_per_prefill: 4 - decode_workers: 5 decode_nodes: 5 gpus_per_decode: 4 - gpus_per_node: 4 - backend: type: trtllm - prefill_environment: - ENROOT_ALLOW_DEV: "yes" - 
MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' decode_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' trtllm_config: prefill: tensor_parallel_size: 4 @@ -68,7 +56,6 @@ backend: speculative_config: decoding_type: MTP num_nextn_predict_layers: 3 - decode: tensor_parallel_size: 4 moe_expert_parallel_size: 4 @@ -77,8 +64,8 @@ backend: enable_lm_head_tp_in_adp: false trust_remote_code: true custom_tokenizer: "glm_moe_dsa" - max_batch_size: 1 - max_num_tokens: 4 + max_batch_size: 2 + max_num_tokens: 8 max_seq_len: 9256 print_iter_log: true stream_interval: 100 @@ -86,9 +73,8 @@ backend: cuda_graph_config: enable_padding: true batch_sizes: - - 1 - - 2 - - 4 + - 1 + - 2 moe_config: backend: TRTLLM use_low_precision_moe_combine: true @@ -101,30 +87,27 @@ backend: max_tokens_in_buffer: 16384 nvfp4_gemm_config: allowed_backends: - - cutlass - - cublaslt - - cutedsl - - cuda_core + - cutlass + - cublaslt + - cutedsl + - cuda_core speculative_config: decoding_type: MTP num_nextn_predict_layers: 3 - benchmark: - type: "sa-bench" + type: sa-bench isl: 8192 osl: 1024 - concurrencies: "5" - req_rate: "inf" + concurrencies: "15" + req_rate: inf + num_prompts_mult: 16 + use_chat_template: true custom_tokenizer: "glm_moe_dsa" - use_chat_template: false - frontend: - type: "dynamo" + type: dynamo 
enable_multiple_frontends: false - health_check: max_attempts: 360 interval_seconds: 10 - dynamo: install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx2dep4_gen3tep8_batch32_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx2dep4_gen3tep8_batch32_eplb0_mtp3.yaml new file mode 100644 index 00000000..b27d7ddd --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx2dep4_gen3tep8_batch32_eplb0_mtp3.yaml @@ -0,0 +1,119 @@ +name: glm5_nvfp4_ISL8K_OSL1K_ctx2dep4_gen3tep8_batch32_eplb0_mtp3 +model: + path: nvidia/GLM5-NVFP4 + container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 +resources: + gpu_type: gb200 + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + decode_workers: 3 + decode_nodes: 6 + gpus_per_decode: 8 + gpus_per_node: 4 +backend: + type: trtllm + prefill_environment: + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + decode_environment: + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + 
tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + allreduce_strategy: MNNVL +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: "144" + req_rate: inf + num_prompts_mult: 16 + use_chat_template: true + custom_tokenizer: "glm_moe_dsa" +frontend: + type: dynamo + enable_multiple_frontends: false +health_check: + max_attempts: 360 + interval_seconds: 10 +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch64_eplb0_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep16_batch16_eplb0_mtp3.yaml similarity index 56% rename from recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch64_eplb0_mtp1.yaml rename to recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep16_batch16_eplb0_mtp3.yaml index ebcd45d1..9f1be846 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch64_eplb0_mtp1.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep16_batch16_eplb0_mtp3.yaml @@ -1,48 +1,35 @@ -name: 
"glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep16_batch64_eplb0_mtp1" - -# ctx: 1 prefill worker, TP4/EP4 -# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=64 -# concurrency: 1229 - +name: glm5_nvfp4_ISL8K_OSL1K_ctx5dep4_gen1dep16_batch16_eplb0_mtp3 model: - path: "nvidia/GLM5-NVFP4" - container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" - precision: "fp4" - + path: nvidia/GLM5-NVFP4 + container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 resources: - gpu_type: "gb200" - - prefill_nodes: 1 - prefill_workers: 1 + gpu_type: gb200 + prefill_nodes: 5 + prefill_workers: 5 gpus_per_prefill: 4 - decode_workers: 1 decode_nodes: 4 gpus_per_decode: 16 - gpus_per_node: 4 - backend: type: trtllm - prefill_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' decode_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' trtllm_config: prefill: tensor_parallel_size: 4 @@ -52,9 +39,9 @@ backend: disable_overlap_scheduler: true trust_remote_code: true custom_tokenizer: "glm_moe_dsa" - max_batch_size: 16 - max_num_tokens: 16384 - max_seq_len: 1064 + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 print_iter_log: true cuda_graph_config: null moe_config: @@ -68,8 +55,7 @@ backend: 
max_tokens_in_buffer: 16384 speculative_config: decoding_type: MTP - num_nextn_predict_layers: 1 - + num_nextn_predict_layers: 3 decode: tensor_parallel_size: 16 moe_expert_parallel_size: 16 @@ -78,26 +64,20 @@ backend: enable_lm_head_tp_in_adp: true trust_remote_code: true custom_tokenizer: "glm_moe_dsa" - max_batch_size: 64 - max_num_tokens: 128 - max_seq_len: 2088 + max_batch_size: 16 + max_num_tokens: 64 + max_seq_len: 9256 print_iter_log: true stream_interval: 100 num_postprocess_workers: 4 cuda_graph_config: enable_padding: true batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 24 - - 32 - - 40 - - 48 - - 56 - - 64 + - 1 + - 2 + - 4 + - 8 + - 16 moe_config: backend: CUTEDSL use_low_precision_moe_combine: true @@ -110,30 +90,27 @@ backend: max_tokens_in_buffer: 16384 nvfp4_gemm_config: allowed_backends: - - cutlass - - cublaslt - - cutedsl - - cuda_core + - cutlass + - cublaslt + - cutedsl + - cuda_core speculative_config: decoding_type: MTP - num_nextn_predict_layers: 1 - + num_nextn_predict_layers: 3 benchmark: - type: "sa-bench" - isl: 1024 + type: sa-bench + isl: 8192 osl: 1024 - concurrencies: "1229" - req_rate: "inf" + concurrencies: "333" + req_rate: inf + num_prompts_mult: 16 + use_chat_template: true custom_tokenizer: "glm_moe_dsa" - use_chat_template: false - frontend: - type: "dynamo" + type: dynamo enable_multiple_frontends: false - health_check: max_attempts: 360 interval_seconds: 10 - dynamo: install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep32_batch8_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep32_batch8_eplb0_mtp3.yaml index 492f1b4c..54842280 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep32_batch8_eplb0_mtp3.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep32_batch8_eplb0_mtp3.yaml @@ -1,48 +1,35 @@ -name: 
"glm5_nvfp4_ISL8K_OSL1K_ctx5dep4_gen1dep32_batch8_eplb0_mtp3" - -# ctx: 5 prefill workers, TP4/EP4 -# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=8 -# concurrency: 333 - +name: glm5_nvfp4_ISL8K_OSL1K_ctx5dep4_gen1dep32_batch8_eplb0_mtp3 model: - path: "nvidia/GLM5-NVFP4" - container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" - precision: "fp4" - + path: nvidia/GLM5-NVFP4 + container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 resources: - gpu_type: "gb200" - + gpu_type: gb200 prefill_nodes: 5 prefill_workers: 5 gpus_per_prefill: 4 - decode_workers: 1 decode_nodes: 8 gpus_per_decode: 32 - gpus_per_node: 4 - backend: type: trtllm - prefill_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' decode_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' trtllm_config: prefill: tensor_parallel_size: 4 @@ -69,7 +56,6 @@ backend: speculative_config: decoding_type: MTP num_nextn_predict_layers: 3 - decode: tensor_parallel_size: 32 moe_expert_parallel_size: 32 @@ -87,10 +73,10 @@ backend: cuda_graph_config: enable_padding: true batch_sizes: - - 1 - - 2 - - 4 - - 8 + - 1 + - 2 + - 4 + - 8 moe_config: backend: CUTEDSL use_low_precision_moe_combine: true @@ -103,30 +89,27 @@ backend: 
max_tokens_in_buffer: 16384 nvfp4_gemm_config: allowed_backends: - - cutlass - - cublaslt - - cutedsl - - cuda_core + - cutlass + - cublaslt + - cutedsl + - cuda_core speculative_config: decoding_type: MTP num_nextn_predict_layers: 3 - benchmark: - type: "sa-bench" + type: sa-bench isl: 8192 osl: 1024 concurrencies: "333" - req_rate: "inf" + req_rate: inf + num_prompts_mult: 16 + use_chat_template: true custom_tokenizer: "glm_moe_dsa" - use_chat_template: false - frontend: - type: "dynamo" + type: dynamo enable_multiple_frontends: false - health_check: max_attempts: 360 interval_seconds: 10 - dynamo: install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx7dep4_gen1dep16_batch32_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx7dep4_gen1dep16_batch32_eplb0_mtp2.yaml index d22fbcf1..ab957385 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx7dep4_gen1dep16_batch32_eplb0_mtp2.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx7dep4_gen1dep16_batch32_eplb0_mtp2.yaml @@ -1,48 +1,35 @@ -name: "glm5_nvfp4_ISL8K_OSL1K_ctx7dep4_gen1dep16_batch32_eplb0_mtp2" - -# ctx: 7 prefill workers, TP4/EP4 -# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=32 -# concurrency: 615 - +name: glm5_nvfp4_ISL8K_OSL1K_ctx7dep4_gen1dep16_batch32_eplb0_mtp2 model: - path: "nvidia/GLM5-NVFP4" - container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" - precision: "fp4" - + path: nvidia/GLM5-NVFP4 + container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 resources: - gpu_type: "gb200" - + gpu_type: gb200 prefill_nodes: 7 prefill_workers: 7 gpus_per_prefill: 4 - decode_workers: 1 decode_nodes: 4 gpus_per_decode: 16 - gpus_per_node: 4 - backend: type: trtllm - prefill_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" 
- TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' decode_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' trtllm_config: prefill: tensor_parallel_size: 4 @@ -69,7 +56,6 @@ backend: speculative_config: decoding_type: MTP num_nextn_predict_layers: 2 - decode: tensor_parallel_size: 16 moe_expert_parallel_size: 16 @@ -87,13 +73,13 @@ backend: cuda_graph_config: enable_padding: true batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 24 - - 32 + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 moe_config: backend: CUTEDSL use_low_precision_moe_combine: true @@ -106,30 +92,27 @@ backend: max_tokens_in_buffer: 16384 nvfp4_gemm_config: allowed_backends: - - cutlass - - cublaslt - - cutedsl - - cuda_core + - cutlass + - cublaslt + - cutedsl + - cuda_core speculative_config: decoding_type: MTP num_nextn_predict_layers: 2 - benchmark: - type: "sa-bench" + type: sa-bench isl: 8192 osl: 1024 concurrencies: "615" - req_rate: "inf" + req_rate: inf + num_prompts_mult: 16 + use_chat_template: true custom_tokenizer: "glm_moe_dsa" - use_chat_template: false - frontend: - type: "dynamo" + type: dynamo enable_multiple_frontends: false - health_check: max_attempts: 360 interval_seconds: 10 - dynamo: install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx7dep4_gen1dep8_batch128_eplb0_mtp1.yaml 
b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx7dep4_gen1dep8_batch128_eplb0_mtp1.yaml index 804e89b5..9182158a 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx7dep4_gen1dep8_batch128_eplb0_mtp1.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/MTP/ctx7dep4_gen1dep8_batch128_eplb0_mtp1.yaml @@ -1,48 +1,35 @@ -name: "glm5_nvfp4_ISL8K_OSL1K_ctx7dep4_gen1dep8_batch128_eplb0_mtp1" - -# ctx: 7 prefill workers, TP4/EP4 -# gen: 1 decode worker, TP8/EP8, enable_attention_dp=true, max_batch=128 -# concurrency: 1076 - +name: glm5_nvfp4_ISL8K_OSL1K_ctx7dep4_gen1dep8_batch128_eplb0_mtp1 model: - path: "nvidia/GLM5-NVFP4" - container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" - precision: "fp4" - + path: nvidia/GLM5-NVFP4 + container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 resources: - gpu_type: "gb200" - + gpu_type: gb200 prefill_nodes: 7 prefill_workers: 7 gpus_per_prefill: 4 - decode_workers: 1 decode_nodes: 2 gpus_per_decode: 8 - gpus_per_node: 4 - backend: type: trtllm - prefill_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' decode_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' trtllm_config: prefill: tensor_parallel_size: 4 @@ 
-69,7 +56,6 @@ backend: speculative_config: decoding_type: MTP num_nextn_predict_layers: 1 - decode: tensor_parallel_size: 8 moe_expert_parallel_size: 8 @@ -87,25 +73,25 @@ backend: cuda_graph_config: enable_padding: true batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 24 - - 32 - - 40 - - 48 - - 56 - - 64 - - 72 - - 80 - - 88 - - 96 - - 104 - - 112 - - 120 - - 128 + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 moe_config: backend: CUTEDSL use_low_precision_moe_combine: true @@ -118,30 +104,27 @@ backend: max_tokens_in_buffer: 16384 nvfp4_gemm_config: allowed_backends: - - cutlass - - cublaslt - - cutedsl - - cuda_core + - cutlass + - cublaslt + - cutedsl + - cuda_core speculative_config: decoding_type: MTP num_nextn_predict_layers: 1 - benchmark: - type: "sa-bench" + type: sa-bench isl: 8192 osl: 1024 - concurrencies: "1076" - req_rate: "inf" + concurrencies: "1127" + req_rate: inf + num_prompts_mult: 16 + use_chat_template: true custom_tokenizer: "glm_moe_dsa" - use_chat_template: false - frontend: - type: "dynamo" + type: dynamo enable_multiple_frontends: false - health_check: max_attempts: 360 interval_seconds: 10 - dynamo: install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx10dep4_gen1dep16_batch128_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx11dep4_gen1dep16_batch128_eplb0_mtp0.yaml similarity index 59% rename from recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx10dep4_gen1dep16_batch128_eplb0_mtp0.yaml rename to recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx11dep4_gen1dep16_batch128_eplb0_mtp0.yaml index 0fa8566d..ca299465 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx10dep4_gen1dep16_batch128_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx11dep4_gen1dep16_batch128_eplb0_mtp0.yaml @@ -1,48 
+1,35 @@ -name: "glm5_nvfp4_ISL8K_OSL1K_ctx10dep4_gen1dep16_batch128_eplb0_mtp0" - -# ctx: 10 prefill workers, TP4/EP4 -# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=128 -# concurrency: 2253 - +name: glm5_nvfp4_ISL8K_OSL1K_ctx11dep4_gen1dep16_batch128_eplb0_mtp0 model: - path: "nvidia/GLM5-NVFP4" - container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" - precision: "fp4" - + path: nvidia/GLM5-NVFP4 + container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 resources: - gpu_type: "gb200" - - prefill_nodes: 10 - prefill_workers: 10 + gpu_type: gb200 + prefill_nodes: 11 + prefill_workers: 11 gpus_per_prefill: 4 - decode_workers: 1 decode_nodes: 4 gpus_per_decode: 16 - gpus_per_node: 4 - backend: type: trtllm - prefill_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' decode_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' trtllm_config: prefill: tensor_parallel_size: 4 @@ -66,7 +53,6 @@ backend: cache_transceiver_config: backend: UCX max_tokens_in_buffer: 16384 - decode: tensor_parallel_size: 16 moe_expert_parallel_size: 16 @@ -84,25 +70,25 @@ backend: cuda_graph_config: enable_padding: true batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 24 - - 32 - - 40 - - 48 - - 56 - - 64 - - 72 - 
- 80 - - 88 - - 96 - - 104 - - 112 - - 120 - - 128 + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 moe_config: backend: CUTEDSL use_low_precision_moe_combine: true @@ -115,27 +101,24 @@ backend: max_tokens_in_buffer: 16384 nvfp4_gemm_config: allowed_backends: - - cutlass - - cublaslt - - cutedsl - - cuda_core - + - cutlass + - cublaslt + - cutedsl + - cuda_core benchmark: - type: "sa-bench" + type: sa-bench isl: 8192 osl: 1024 concurrencies: "2253" - req_rate: "inf" + req_rate: inf + num_prompts_mult: 16 + use_chat_template: true custom_tokenizer: "glm_moe_dsa" - use_chat_template: false - frontend: - type: "dynamo" + type: dynamo enable_multiple_frontends: false - health_check: max_attempts: 360 interval_seconds: 10 - dynamo: install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen2tep8_batch32_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen2tep8_batch32_eplb0_mtp0.yaml index 478f6203..e857d27e 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen2tep8_batch32_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen2tep8_batch32_eplb0_mtp0.yaml @@ -1,48 +1,35 @@ -name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen2tep8_batch32_eplb0_mtp0" - -# ctx: 1 prefill worker, TP4/EP4 -# gen: 2 decode workers, TP8/EP8, enable_attention_dp=false, max_batch=32 -# concurrency: 84 - +name: glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen2tep8_batch32_eplb0_mtp0 model: - path: "nvidia/GLM5-NVFP4" - container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" - precision: "fp4" - + path: nvidia/GLM5-NVFP4 + container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 resources: - gpu_type: "gb200" - + gpu_type: gb200 prefill_nodes: 1 prefill_workers: 1 gpus_per_prefill: 4 - decode_workers: 2 decode_nodes: 4 gpus_per_decode: 
8 - gpus_per_node: 4 - backend: type: trtllm - prefill_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' decode_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' trtllm_config: prefill: tensor_parallel_size: 4 @@ -66,9 +53,7 @@ backend: cache_transceiver_config: backend: UCX max_tokens_in_buffer: 16384 - decode: - allreduce_strategy: MNNVL tensor_parallel_size: 8 moe_expert_parallel_size: 8 pipeline_parallel_size: 1 @@ -85,13 +70,13 @@ backend: cuda_graph_config: enable_padding: true batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 24 - - 32 + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 moe_config: backend: TRTLLM use_low_precision_moe_combine: true @@ -104,27 +89,25 @@ backend: max_tokens_in_buffer: 16384 nvfp4_gemm_config: allowed_backends: - - cutlass - - cublaslt - - cutedsl - - cuda_core - + - cutlass + - cublaslt + - cutedsl + - cuda_core + allreduce_strategy: MNNVL benchmark: - type: "sa-bench" + type: sa-bench isl: 8192 osl: 1024 - concurrencies: "84" - req_rate: "inf" + concurrencies: "78" + req_rate: inf + num_prompts_mult: 16 + use_chat_template: true custom_tokenizer: "glm_moe_dsa" - use_chat_template: false - frontend: - type: "dynamo" + type: dynamo enable_multiple_frontends: false - health_check: max_attempts: 360 interval_seconds: 10 - 
dynamo: install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch16_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch16_eplb0_mtp0.yaml new file mode 100644 index 00000000..6281c402 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch16_eplb0_mtp0.yaml @@ -0,0 +1,111 @@ +name: glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen4tep8_batch16_eplb0_mtp0 +model: + path: nvidia/GLM5-NVFP4 + container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 +resources: + gpu_type: gb200 + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + gpus_per_node: 4 +backend: + type: trtllm + prefill_environment: + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + decode_environment: + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + 
custom_tokenizer: "glm_moe_dsa" + max_batch_size: 16 + max_num_tokens: 16 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + allreduce_strategy: MNNVL +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: "84" + req_rate: inf + num_prompts_mult: 16 + use_chat_template: true + custom_tokenizer: "glm_moe_dsa" +frontend: + type: dynamo + enable_multiple_frontends: false +health_check: + max_attempts: 360 + interval_seconds: 10 +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_eplb0_mtp0.yaml new file mode 100644 index 00000000..7d9d1002 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_eplb0_mtp0.yaml @@ -0,0 +1,107 @@ +name: glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen4tep8_batch1_eplb0_mtp0 +model: + path: nvidia/GLM5-NVFP4 + container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 +resources: + gpu_type: gb200 + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + gpus_per_node: 4 +backend: + type: trtllm + prefill_environment: + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + decode_environment: + ENROOT_ALLOW_DEV: 'yes' + 
MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + allreduce_strategy: MNNVL +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: "4" + req_rate: inf + num_prompts_mult: 16 + use_chat_template: true + custom_tokenizer: "glm_moe_dsa" +frontend: + type: dynamo + enable_multiple_frontends: false +health_check: + max_attempts: 360 + interval_seconds: 10 +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml 
b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch1_eplb0_mtp0.yaml similarity index 61% rename from recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml rename to recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch1_eplb0_mtp0.yaml index 90e62af3..f2527276 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch1_eplb0_mtp0.yaml @@ -1,48 +1,35 @@ -name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0" - -# ctx: 1 prefill worker, TP4/EP4 -# gen: 5 decode workers, TP4/EP4, enable_attention_dp=false, max_batch=8 -# Merged concurrencies: batch1(5), batch2(10), batch4(25), batch8(50) - +name: glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen5tep4_batch1_eplb0_mtp0 model: - path: "nvidia/GLM5-NVFP4" - container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" - precision: "fp4" - + path: nvidia/GLM5-NVFP4 + container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 resources: - gpu_type: "gb200" - + gpu_type: gb200 prefill_nodes: 1 prefill_workers: 1 gpus_per_prefill: 4 - decode_workers: 5 decode_nodes: 5 gpus_per_decode: 4 - gpus_per_node: 4 - backend: type: trtllm - prefill_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' decode_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - 
TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' trtllm_config: prefill: tensor_parallel_size: 4 @@ -66,7 +53,6 @@ backend: cache_transceiver_config: backend: UCX max_tokens_in_buffer: 16384 - decode: tensor_parallel_size: 4 moe_expert_parallel_size: 4 @@ -75,8 +61,8 @@ backend: enable_lm_head_tp_in_adp: false trust_remote_code: true custom_tokenizer: "glm_moe_dsa" - max_batch_size: 8 - max_num_tokens: 8 + max_batch_size: 1 + max_num_tokens: 1 max_seq_len: 9256 print_iter_log: true stream_interval: 100 @@ -84,10 +70,7 @@ backend: cuda_graph_config: enable_padding: true batch_sizes: - - 1 - - 2 - - 4 - - 8 + - 1 moe_config: backend: TRTLLM use_low_precision_moe_combine: true @@ -100,27 +83,24 @@ backend: max_tokens_in_buffer: 16384 nvfp4_gemm_config: allowed_backends: - - cutlass - - cublaslt - - cutedsl - - cuda_core - + - cutlass + - cublaslt + - cutedsl + - cuda_core benchmark: - type: "sa-bench" + type: sa-bench isl: 8192 osl: 1024 - concurrencies: "5x10x25x50" - req_rate: "inf" + concurrencies: "5" + req_rate: inf + num_prompts_mult: 16 + use_chat_template: true custom_tokenizer: "glm_moe_dsa" - use_chat_template: false - frontend: - type: "dynamo" + type: dynamo enable_multiple_frontends: false - health_check: max_attempts: 360 interval_seconds: 10 - dynamo: install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen3tep4_batch32_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch2_eplb0_mtp0.yaml similarity index 60% rename from recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen3tep4_batch32_eplb0_mtp0.yaml rename to recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch2_eplb0_mtp0.yaml index 
462401b6..b2217d07 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen3tep4_batch32_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch2_eplb0_mtp0.yaml @@ -1,48 +1,35 @@ -name: "glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen3tep4_batch32_eplb0_mtp0" - -# ctx: 1 prefill worker, TP4/EP4 -# gen: 3 decode workers, TP4/EP4, enable_attention_dp=false, max_batch=32 -# concurrency: 117 - +name: glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen5tep4_batch2_eplb0_mtp0 model: - path: "nvidia/GLM5-NVFP4" - container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" - precision: "fp4" - + path: nvidia/GLM5-NVFP4 + container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 resources: - gpu_type: "gb200" - + gpu_type: gb200 prefill_nodes: 1 prefill_workers: 1 gpus_per_prefill: 4 - - decode_workers: 3 - decode_nodes: 3 + decode_workers: 5 + decode_nodes: 5 gpus_per_decode: 4 - gpus_per_node: 4 - backend: type: trtllm - prefill_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' decode_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' trtllm_config: prefill: tensor_parallel_size: 4 @@ -66,7 +53,6 @@ backend: cache_transceiver_config: backend: UCX 
max_tokens_in_buffer: 16384 - decode: tensor_parallel_size: 4 moe_expert_parallel_size: 4 @@ -75,8 +61,8 @@ backend: enable_lm_head_tp_in_adp: false trust_remote_code: true custom_tokenizer: "glm_moe_dsa" - max_batch_size: 32 - max_num_tokens: 32 + max_batch_size: 2 + max_num_tokens: 2 max_seq_len: 9256 print_iter_log: true stream_interval: 100 @@ -84,13 +70,8 @@ backend: cuda_graph_config: enable_padding: true batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 24 - - 32 + - 1 + - 2 moe_config: backend: TRTLLM use_low_precision_moe_combine: true @@ -103,27 +84,24 @@ backend: max_tokens_in_buffer: 16384 nvfp4_gemm_config: allowed_backends: - - cutlass - - cublaslt - - cutedsl - - cuda_core - + - cutlass + - cublaslt + - cutedsl + - cuda_core benchmark: - type: "sa-bench" + type: sa-bench isl: 8192 osl: 1024 - concurrencies: "117" - req_rate: "inf" + concurrencies: "10" + req_rate: inf + num_prompts_mult: 16 + use_chat_template: true custom_tokenizer: "glm_moe_dsa" - use_chat_template: false - frontend: - type: "dynamo" + type: dynamo enable_multiple_frontends: false - health_check: max_attempts: 360 interval_seconds: 10 - dynamo: install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch4_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch4_eplb0_mtp0.yaml new file mode 100644 index 00000000..9ddc2efd --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch4_eplb0_mtp0.yaml @@ -0,0 +1,108 @@ +name: glm5_nvfp4_ISL8K_OSL1K_ctx1dep4_gen5tep4_batch4_eplb0_mtp0 +model: + path: nvidia/GLM5-NVFP4 + container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 +resources: + gpu_type: gb200 + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_workers: 5 + decode_nodes: 5 + gpus_per_decode: 4 + gpus_per_node: 4 +backend: + type: trtllm + prefill_environment: + ENROOT_ALLOW_DEV: 'yes' + 
MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + decode_environment: + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 4 + max_num_tokens: 4 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: "25" + req_rate: inf + num_prompts_mult: 16 + use_chat_template: true + custom_tokenizer: "glm_moe_dsa" +frontend: + type: dynamo + enable_multiple_frontends: false +health_check: + max_attempts: 360 + interval_seconds: 10 +dynamo: + 
install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen3tep8_batch64_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen3tep8_batch64_eplb0_mtp0.yaml new file mode 100644 index 00000000..f5abf5a0 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen3tep8_batch64_eplb0_mtp0.yaml @@ -0,0 +1,117 @@ +name: glm5_nvfp4_ISL8K_OSL1K_ctx2dep4_gen3tep8_batch64_eplb0_mtp0 +model: + path: nvidia/GLM5-NVFP4 + container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 +resources: + gpu_type: gb200 + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + decode_workers: 3 + decode_nodes: 6 + gpus_per_decode: 8 + gpus_per_node: 4 +backend: + type: trtllm + prefill_environment: + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + decode_environment: + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: 
"glm_moe_dsa" + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + allreduce_strategy: MNNVL +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: "231" + req_rate: inf + num_prompts_mult: 16 + use_chat_template: true + custom_tokenizer: "glm_moe_dsa" +frontend: + type: dynamo + enable_multiple_frontends: false +health_check: + max_attempts: 360 + interval_seconds: 10 +dynamo: + install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx8dep4_gen1dep32_batch32_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx4dep4_gen1dep16_batch32_eplb0_mtp0.yaml similarity index 57% rename from recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx8dep4_gen1dep32_batch32_eplb0_mtp0.yaml rename to recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx4dep4_gen1dep16_batch32_eplb0_mtp0.yaml index 7e34b6d9..0a62b740 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx8dep4_gen1dep32_batch32_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx4dep4_gen1dep16_batch32_eplb0_mtp0.yaml @@ -1,48 +1,35 @@ -name: "glm5_nvfp4_ISL8K_OSL1K_ctx8dep4_gen1dep32_batch32_eplb0_mtp0" - -# ctx: 8 prefill workers, TP4/EP4 -# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=32 -# concurrency: 1229 - +name: glm5_nvfp4_ISL8K_OSL1K_ctx4dep4_gen1dep16_batch32_eplb0_mtp0 model: - path: 
"nvidia/GLM5-NVFP4" - container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" - precision: "fp4" - + path: nvidia/GLM5-NVFP4 + container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 resources: - gpu_type: "gb200" - - prefill_nodes: 8 - prefill_workers: 8 + gpu_type: gb200 + prefill_nodes: 4 + prefill_workers: 4 gpus_per_prefill: 4 - decode_workers: 1 - decode_nodes: 8 - gpus_per_decode: 32 - + decode_nodes: 4 + gpus_per_decode: 16 gpus_per_node: 4 - backend: type: trtllm - prefill_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' decode_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' trtllm_config: prefill: tensor_parallel_size: 4 @@ -66,10 +53,9 @@ backend: cache_transceiver_config: backend: UCX max_tokens_in_buffer: 16384 - decode: - tensor_parallel_size: 32 - moe_expert_parallel_size: 32 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 pipeline_parallel_size: 1 enable_attention_dp: true enable_lm_head_tp_in_adp: false @@ -84,46 +70,43 @@ backend: cuda_graph_config: enable_padding: true batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 24 - - 32 + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 moe_config: backend: CUTEDSL use_low_precision_moe_combine: true kv_cache_config: dtype: fp8 
enable_block_reuse: false - free_gpu_memory_fraction: 0.75 + free_gpu_memory_fraction: 0.8 cache_transceiver_config: backend: UCX max_tokens_in_buffer: 16384 nvfp4_gemm_config: allowed_backends: - - cutlass - - cublaslt - - cutedsl - - cuda_core - + - cutlass + - cublaslt + - cutedsl + - cuda_core benchmark: - type: "sa-bench" + type: sa-bench isl: 8192 osl: 1024 - concurrencies: "1229" - req_rate: "inf" + concurrencies: "564" + req_rate: inf + num_prompts_mult: 16 + use_chat_template: true custom_tokenizer: "glm_moe_dsa" - use_chat_template: false - frontend: - type: "dynamo" + type: dynamo enable_multiple_frontends: false - health_check: max_attempts: 360 interval_seconds: 10 - dynamo: install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep32_batch16_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep32_batch16_eplb0_mtp0.yaml index 7a6ece31..7da85327 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep32_batch16_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep32_batch16_eplb0_mtp0.yaml @@ -1,48 +1,35 @@ -name: "glm5_nvfp4_ISL8K_OSL1K_ctx5dep4_gen1dep32_batch16_eplb0_mtp0" - -# ctx: 5 prefill workers, TP4/EP4 -# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=16 -# concurrency: 615 - +name: glm5_nvfp4_ISL8K_OSL1K_ctx5dep4_gen1dep32_batch16_eplb0_mtp0 model: - path: "nvidia/GLM5-NVFP4" - container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" - precision: "fp4" - + path: nvidia/GLM5-NVFP4 + container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 resources: - gpu_type: "gb200" - + gpu_type: gb200 prefill_nodes: 5 prefill_workers: 5 gpus_per_prefill: 4 - decode_workers: 1 decode_nodes: 8 gpus_per_decode: 32 - gpus_per_node: 4 - backend: type: trtllm - prefill_environment: - ENROOT_ALLOW_DEV: "yes" - 
MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' decode_environment: - ENROOT_ALLOW_DEV: "yes" - MIMALLOC_PURGE_DELAY: "0" - NCCL_GRAPH_MIXING_SUPPORT: "0" - TLLM_LOG_LEVEL: "INFO" - TRTLLM_ENABLE_PDL: "1" - TRTLLM_SERVER_DISABLE_GC: "1" - TRTLLM_WORKER_DISABLE_GC: "1" - + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' trtllm_config: prefill: tensor_parallel_size: 4 @@ -66,7 +53,6 @@ backend: cache_transceiver_config: backend: UCX max_tokens_in_buffer: 16384 - decode: tensor_parallel_size: 32 moe_expert_parallel_size: 32 @@ -84,11 +70,11 @@ backend: cuda_graph_config: enable_padding: true batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 + - 1 + - 2 + - 4 + - 8 + - 16 moe_config: backend: CUTEDSL use_low_precision_moe_combine: true @@ -101,27 +87,24 @@ backend: max_tokens_in_buffer: 16384 nvfp4_gemm_config: allowed_backends: - - cutlass - - cublaslt - - cutedsl - - cuda_core - + - cutlass + - cublaslt + - cutedsl + - cuda_core benchmark: - type: "sa-bench" + type: sa-bench isl: 8192 osl: 1024 concurrencies: "615" - req_rate: "inf" + req_rate: inf + num_prompts_mult: 16 + use_chat_template: true custom_tokenizer: "glm_moe_dsa" - use_chat_template: false - frontend: - type: "dynamo" + type: dynamo enable_multiple_frontends: false - health_check: max_attempts: 360 interval_seconds: 10 - dynamo: install: false diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch64_eplb0_mtp0.yaml 
b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch64_eplb0_mtp0.yaml new file mode 100644 index 00000000..e4a4b431 --- /dev/null +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch64_eplb0_mtp0.yaml @@ -0,0 +1,116 @@ +name: glm5_nvfp4_ISL8K_OSL1K_ctx7dep4_gen1dep16_batch64_eplb0_mtp0 +model: + path: nvidia/GLM5-NVFP4 + container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3 + precision: fp4 +resources: + gpu_type: gb200 + prefill_nodes: 7 + prefill_workers: 7 + gpus_per_prefill: 4 + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + gpus_per_node: 4 +backend: + type: trtllm + prefill_environment: + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + decode_environment: + ENROOT_ALLOW_DEV: 'yes' + MIMALLOC_PURGE_DELAY: '0' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: CUTEDSL + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + custom_tokenizer: "glm_moe_dsa" + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + 
num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: "1127" + req_rate: inf + num_prompts_mult: 16 + use_chat_template: true + custom_tokenizer: "glm_moe_dsa" +frontend: + type: dynamo + enable_multiple_frontends: false +health_check: + max_attempts: 360 + interval_seconds: 10 +dynamo: + install: false From c88c68ea6b819d52357d2fed3055c6b0178badfb Mon Sep 17 00:00:00 2001 From: Yeswanth koti Date: Tue, 28 Apr 2026 19:45:19 -0400 Subject: [PATCH 10/14] fix: align glm5 gb300 sa-bench rounds with submission baselines (#113) Set GLM5 GB300 trtllm_dynamo recipes to use chat template and num_prompts_mult=16 so throughput runs match TRTLLM multi-round methodology, while keeping warmup fixed at 2x. 
--- .../ISL1K_OSL1K/MTP/ctx1dep2_gen1dep32_batch8_eplb0_mtp3.yaml | 3 ++- .../MTP/ctx1dep2_gen4tep8_batch16_allconc_eplb0_mtp3.yaml | 3 ++- .../ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch32_eplb0_mtp2.yaml | 3 ++- .../ISL1K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml | 3 ++- .../MTP/ctx2dep2_gen1dep16_batch64_eplb0_mtp2.yaml | 3 ++- .../MTP/ctx2dep2_gen1dep32_batch16_eplb0_mtp3.yaml | 3 ++- .../MTP/ctx3dep2_gen1dep32_batch32_eplb0_mtp2.yaml | 3 ++- .../MTP/ctx4dep2_gen1dep16_batch256_eplb256_mtp1.yaml | 3 ++- .../MTP/ctx5dep2_gen2dep8_batch512_eplb0_mtp1.yaml | 3 ++- .../MTP/ctx6dep2_gen1dep32_batch128_eplb288_mtp1.yaml | 3 ++- .../STP/ctx1dep2_gen1dep32_batch16_eplb0_mtp0.yaml | 3 ++- .../STP/ctx1dep2_gen4tep8_batch64_allconc_eplb0_mtp0.yaml | 3 ++- .../STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml | 3 ++- .../STP/ctx2dep2_gen1dep32_batch32_eplb0_mtp0.yaml | 3 ++- .../STP/ctx3dep2_gen1dep32_batch64_eplb0_mtp0.yaml | 3 ++- .../STP/ctx4dep2_gen1dep16_batch512_eplb256_mtp0.yaml | 3 ++- .../STP/ctx4dep2_gen1dep32_batch128_eplb0_mtp0.yaml | 3 ++- .../STP/ctx6dep2_gen1dep32_batch256_eplb288_mtp0.yaml | 3 ++- .../MTP/ctx12dep2_gen1dep16_batch32_eplb0_mtp2.yaml | 3 ++- .../MTP/ctx13dep2_gen1dep8_batch128_eplb0_mtp1.yaml | 3 ++- .../MTP/ctx15dep2_gen1dep32_batch16_eplb0_mtp3.yaml | 3 ++- .../MTP/ctx18dep2_gen1dep16_batch64_eplb0_mtp1.yaml | 3 ++- .../ISL8K_OSL1K/MTP/ctx1dep2_gen1tep8_batch16_eplb0_mtp3.yaml | 3 ++- .../ISL8K_OSL1K/MTP/ctx1dep2_gen2tep8_batch8_eplb0_mtp3.yaml | 3 ++- .../MTP/ctx1dep2_gen4tep8_batch4_allconc_eplb0_mtp3.yaml | 3 ++- .../ISL8K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml | 3 ++- .../ISL8K_OSL1K/MTP/ctx5dep2_gen1dep32_batch4_eplb0_mtp3.yaml | 3 ++- .../ISL8K_OSL1K/MTP/ctx9dep2_gen1dep32_batch8_eplb0_mtp3.yaml | 3 ++- .../STP/ctx12dep2_gen1dep16_batch64_eplb0_mtp0.yaml | 3 ++- .../STP/ctx15dep2_gen1dep32_batch32_eplb0_mtp0.yaml | 3 ++- .../ISL8K_OSL1K/STP/ctx1dep2_gen2tep8_batch16_eplb0_mtp0.yaml | 3 ++- 
.../ISL8K_OSL1K/STP/ctx1dep2_gen4tep8_batch1_eplb0_mtp0.yaml | 3 ++- .../STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml | 3 ++- .../STP/ctx20dep2_gen1dep16_batch128_eplb0_mtp0.yaml | 3 ++- .../ISL8K_OSL1K/STP/ctx2dep2_gen3tep8_batch32_eplb0_mtp0.yaml | 3 ++- .../ISL8K_OSL1K/STP/ctx4dep2_gen3tep8_batch64_eplb0_mtp0.yaml | 3 ++- .../STP/ctx9dep2_gen1dep32_batch16_eplb0_mtp0.yaml | 3 ++- src/srtctl/benchmarks/scripts/sa-bench/bench.sh | 4 ++-- 38 files changed, 76 insertions(+), 39 deletions(-) diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen1dep32_batch8_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen1dep32_batch8_eplb0_mtp3.yaml index 80aacc6a..0b2b3771 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen1dep32_batch8_eplb0_mtp3.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen1dep32_batch8_eplb0_mtp3.yaml @@ -117,8 +117,9 @@ benchmark: osl: 1024 concurrencies: "333" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch16_allconc_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch16_allconc_eplb0_mtp3.yaml index 648ec949..3a2447d2 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch16_allconc_eplb0_mtp3.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch16_allconc_eplb0_mtp3.yaml @@ -119,8 +119,9 @@ benchmark: osl: 1024 concurrencies: "24x44x92" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git 
a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch32_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch32_eplb0_mtp2.yaml index 823624ac..671ba92a 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch32_eplb0_mtp2.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch32_eplb0_mtp2.yaml @@ -121,8 +121,9 @@ benchmark: osl: 1024 concurrencies: "180" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml index 64b61b9f..d916e313 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml @@ -116,8 +116,9 @@ benchmark: osl: 1024 concurrencies: "10" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep16_batch64_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep16_batch64_eplb0_mtp2.yaml index 66d211aa..821b1a1d 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep16_batch64_eplb0_mtp2.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep16_batch64_eplb0_mtp2.yaml @@ -124,8 +124,9 @@ benchmark: osl: 1024 concurrencies: "1229" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + 
use_chat_template: true frontend: type: "dynamo" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep32_batch16_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep32_batch16_eplb0_mtp3.yaml index fe754372..cd7d2abc 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep32_batch16_eplb0_mtp3.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep32_batch16_eplb0_mtp3.yaml @@ -118,8 +118,9 @@ benchmark: osl: 1024 concurrencies: "666" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx3dep2_gen1dep32_batch32_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx3dep2_gen1dep32_batch32_eplb0_mtp2.yaml index 70821f3e..7fff09e1 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx3dep2_gen1dep32_batch32_eplb0_mtp2.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx3dep2_gen1dep32_batch32_eplb0_mtp2.yaml @@ -120,8 +120,9 @@ benchmark: osl: 1024 concurrencies: "1229" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx4dep2_gen1dep16_batch256_eplb256_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx4dep2_gen1dep16_batch256_eplb256_mtp1.yaml index bf3183b7..36dd9e05 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx4dep2_gen1dep16_batch256_eplb256_mtp1.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx4dep2_gen1dep16_batch256_eplb256_mtp1.yaml @@ -151,8 +151,9 @@ benchmark: osl: 1024 concurrencies: "4301" req_rate: "inf" 
+ num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx5dep2_gen2dep8_batch512_eplb0_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx5dep2_gen2dep8_batch512_eplb0_mtp1.yaml index 1d9f4f10..80338977 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx5dep2_gen2dep8_batch512_eplb0_mtp1.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx5dep2_gen2dep8_batch512_eplb0_mtp1.yaml @@ -180,8 +180,9 @@ benchmark: osl: 1024 concurrencies: "8602" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx6dep2_gen1dep32_batch128_eplb288_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx6dep2_gen1dep32_batch128_eplb288_mtp1.yaml index 44b81b3c..42592680 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx6dep2_gen1dep32_batch128_eplb288_mtp1.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx6dep2_gen1dep32_batch128_eplb288_mtp1.yaml @@ -135,8 +135,9 @@ benchmark: osl: 1024 concurrencies: "4301" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen1dep32_batch16_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen1dep32_batch16_eplb0_mtp0.yaml index 0410623b..f090f3db 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen1dep32_batch16_eplb0_mtp0.yaml +++ 
b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen1dep32_batch16_eplb0_mtp0.yaml @@ -112,8 +112,9 @@ benchmark: osl: 1024 concurrencies: "615" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen4tep8_batch64_allconc_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen4tep8_batch64_allconc_eplb0_mtp0.yaml index d967e3b2..d9cc7807 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen4tep8_batch64_allconc_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen4tep8_batch64_allconc_eplb0_mtp0.yaml @@ -119,8 +119,9 @@ benchmark: osl: 1024 concurrencies: "84x180x336" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml index d9f9ea2f..c50c85e9 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml @@ -110,8 +110,9 @@ benchmark: osl: 1024 concurrencies: "5x10x25" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx2dep2_gen1dep32_batch32_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx2dep2_gen1dep32_batch32_eplb0_mtp0.yaml index 
26ddd7b1..9ec32b8a 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx2dep2_gen1dep32_batch32_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx2dep2_gen1dep32_batch32_eplb0_mtp0.yaml @@ -114,8 +114,9 @@ benchmark: osl: 1024 concurrencies: "1229" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx3dep2_gen1dep32_batch64_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx3dep2_gen1dep32_batch64_eplb0_mtp0.yaml index 081e96da..2887005e 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx3dep2_gen1dep32_batch64_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx3dep2_gen1dep32_batch64_eplb0_mtp0.yaml @@ -118,8 +118,9 @@ benchmark: osl: 1024 concurrencies: "2253" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep16_batch512_eplb256_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep16_batch512_eplb256_mtp0.yaml index dbca4fd5..ba0e1063 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep16_batch512_eplb256_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep16_batch512_eplb256_mtp0.yaml @@ -176,8 +176,9 @@ benchmark: osl: 1024 concurrencies: "8192" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep32_batch128_eplb0_mtp0.yaml 
b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep32_batch128_eplb0_mtp0.yaml index 1c8d2d78..81f0cac3 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep32_batch128_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep32_batch128_eplb0_mtp0.yaml @@ -126,8 +126,9 @@ benchmark: osl: 1024 concurrencies: "4301" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx6dep2_gen1dep32_batch256_eplb288_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx6dep2_gen1dep32_batch256_eplb288_mtp0.yaml index 0d6870ff..fbb91775 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx6dep2_gen1dep32_batch256_eplb288_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx6dep2_gen1dep32_batch256_eplb288_mtp0.yaml @@ -145,8 +145,9 @@ benchmark: osl: 1024 concurrencies: "8192" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx12dep2_gen1dep16_batch32_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx12dep2_gen1dep16_batch32_eplb0_mtp2.yaml index 8940ea72..ecb7bb5c 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx12dep2_gen1dep16_batch32_eplb0_mtp2.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx12dep2_gen1dep16_batch32_eplb0_mtp2.yaml @@ -120,8 +120,9 @@ benchmark: osl: 1024 concurrencies: "666" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git 
a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx13dep2_gen1dep8_batch128_eplb0_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx13dep2_gen1dep8_batch128_eplb0_mtp1.yaml index 29eba0b3..956b5de6 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx13dep2_gen1dep8_batch128_eplb0_mtp1.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx13dep2_gen1dep8_batch128_eplb0_mtp1.yaml @@ -132,8 +132,9 @@ benchmark: osl: 1024 concurrencies: "1076" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx15dep2_gen1dep32_batch16_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx15dep2_gen1dep32_batch16_eplb0_mtp3.yaml index f8fcdac9..21f11cd5 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx15dep2_gen1dep32_batch16_eplb0_mtp3.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx15dep2_gen1dep32_batch16_eplb0_mtp3.yaml @@ -118,8 +118,9 @@ benchmark: osl: 1024 concurrencies: "666" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx18dep2_gen1dep16_batch64_eplb0_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx18dep2_gen1dep16_batch64_eplb0_mtp1.yaml index 775fa68f..358e4b7b 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx18dep2_gen1dep16_batch64_eplb0_mtp1.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx18dep2_gen1dep16_batch64_eplb0_mtp1.yaml @@ -124,8 +124,9 @@ benchmark: osl: 1024 concurrencies: "1229" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - 
use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen1tep8_batch16_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen1tep8_batch16_eplb0_mtp3.yaml index c457cce0..e3e2d993 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen1tep8_batch16_eplb0_mtp3.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen1tep8_batch16_eplb0_mtp3.yaml @@ -119,8 +119,9 @@ benchmark: osl: 1024 concurrencies: "24" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen2tep8_batch8_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen2tep8_batch8_eplb0_mtp3.yaml index 517cf361..649566d8 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen2tep8_batch8_eplb0_mtp3.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen2tep8_batch8_eplb0_mtp3.yaml @@ -118,8 +118,9 @@ benchmark: osl: 1024 concurrencies: "22" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen4tep8_batch4_allconc_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen4tep8_batch4_allconc_eplb0_mtp3.yaml index 20599c3f..e0a55bf5 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen4tep8_batch4_allconc_eplb0_mtp3.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen4tep8_batch4_allconc_eplb0_mtp3.yaml @@ -117,8 +117,9 @@ benchmark: osl: 1024 
concurrencies: "4x24" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml index 0037f722..6deefd9f 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml @@ -116,8 +116,9 @@ benchmark: osl: 1024 concurrencies: "5" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx5dep2_gen1dep32_batch4_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx5dep2_gen1dep32_batch4_eplb0_mtp3.yaml index 6e233408..5bdbb4d9 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx5dep2_gen1dep32_batch4_eplb0_mtp3.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx5dep2_gen1dep32_batch4_eplb0_mtp3.yaml @@ -116,8 +116,9 @@ benchmark: osl: 1024 concurrencies: "180" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx9dep2_gen1dep32_batch8_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx9dep2_gen1dep32_batch8_eplb0_mtp3.yaml index bd1cb583..acef0e9e 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx9dep2_gen1dep32_batch8_eplb0_mtp3.yaml +++ 
b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx9dep2_gen1dep32_batch8_eplb0_mtp3.yaml @@ -117,8 +117,9 @@ benchmark: osl: 1024 concurrencies: "333" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx12dep2_gen1dep16_batch64_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx12dep2_gen1dep16_batch64_eplb0_mtp0.yaml index 611aebb6..5fcd82c6 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx12dep2_gen1dep16_batch64_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx12dep2_gen1dep16_batch64_eplb0_mtp0.yaml @@ -118,8 +118,9 @@ benchmark: osl: 1024 concurrencies: "1127" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx15dep2_gen1dep32_batch32_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx15dep2_gen1dep32_batch32_eplb0_mtp0.yaml index 831e703d..27122e16 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx15dep2_gen1dep32_batch32_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx15dep2_gen1dep32_batch32_eplb0_mtp0.yaml @@ -114,8 +114,9 @@ benchmark: osl: 1024 concurrencies: "1229" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen2tep8_batch16_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen2tep8_batch16_eplb0_mtp0.yaml index 8ff2f420..d8386092 100644 --- 
a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen2tep8_batch16_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen2tep8_batch16_eplb0_mtp0.yaml @@ -112,8 +112,9 @@ benchmark: osl: 1024 concurrencies: "42" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen4tep8_batch1_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen4tep8_batch1_eplb0_mtp0.yaml index cc8faa11..0c05b922 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen4tep8_batch1_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen4tep8_batch1_eplb0_mtp0.yaml @@ -111,8 +111,9 @@ benchmark: osl: 1024 concurrencies: "4" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml index 06d02024..a0ed195c 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml @@ -110,8 +110,9 @@ benchmark: osl: 1024 concurrencies: "5x10x25" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx20dep2_gen1dep16_batch128_eplb0_mtp0.yaml 
b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx20dep2_gen1dep16_batch128_eplb0_mtp0.yaml index ead937c9..55bf1ae4 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx20dep2_gen1dep16_batch128_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx20dep2_gen1dep16_batch128_eplb0_mtp0.yaml @@ -126,8 +126,9 @@ benchmark: osl: 1024 concurrencies: "2151" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx2dep2_gen3tep8_batch32_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx2dep2_gen3tep8_batch32_eplb0_mtp0.yaml index e06ea268..1836bb52 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx2dep2_gen3tep8_batch32_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx2dep2_gen3tep8_batch32_eplb0_mtp0.yaml @@ -115,8 +115,9 @@ benchmark: osl: 1024 concurrencies: "117" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx4dep2_gen3tep8_batch64_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx4dep2_gen3tep8_batch64_eplb0_mtp0.yaml index f4b3cc09..4a022256 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx4dep2_gen3tep8_batch64_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx4dep2_gen3tep8_batch64_eplb0_mtp0.yaml @@ -119,8 +119,9 @@ benchmark: osl: 1024 concurrencies: "231" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git 
a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx9dep2_gen1dep32_batch16_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx9dep2_gen1dep32_batch16_eplb0_mtp0.yaml index 75f56785..7ea9a2f6 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx9dep2_gen1dep32_batch16_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx9dep2_gen1dep32_batch16_eplb0_mtp0.yaml @@ -112,8 +112,9 @@ benchmark: osl: 1024 concurrencies: "615" req_rate: "inf" + num_prompts_mult: 16 custom_tokenizer: "glm_moe_dsa" - use_chat_template: false + use_chat_template: true frontend: type: "dynamo" diff --git a/src/srtctl/benchmarks/scripts/sa-bench/bench.sh b/src/srtctl/benchmarks/scripts/sa-bench/bench.sh index acddf754..154d7590 100644 --- a/src/srtctl/benchmarks/scripts/sa-bench/bench.sh +++ b/src/srtctl/benchmarks/scripts/sa-bench/bench.sh @@ -83,7 +83,7 @@ PORT=$(echo "$ENDPOINT" | sed 's|http://||' | cut -d: -f2 | cut -d/ -f1) WORK_DIR="$(dirname "$0")" -echo "SA-Bench Config: endpoint=${ENDPOINT}; isl=${ISL}; osl=${OSL}; concurrencies=${CONCURRENCIES}; req_rate=${REQ_RATE}; model=${MODEL_NAME}" +echo "SA-Bench Config: endpoint=${ENDPOINT}; isl=${ISL}; osl=${OSL}; concurrencies=${CONCURRENCIES}; req_rate=${REQ_RATE}; model=${MODEL_NAME}; num_prompts_mult=${NUM_PROMPTS_MULT}; num_warmup_mult=${NUM_WARMUP_MULT}" # Profiling shared helpers SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" @@ -138,7 +138,7 @@ for concurrency in "${CONCURRENCY_LIST[@]}"; do --trust-remote-code \ "${CUSTOM_TOKENIZER_ARGS[@]}" - num_prompts=$((concurrency * 10)) + num_prompts=$((concurrency * NUM_PROMPTS_MULT)) # Generate result filename based on mode if [ "$IS_DISAGGREGATED" = "true" ]; then From 95b0a33dafeb46886977648af8c9247938d88500 Mon Sep 17 00:00:00 2001 From: Richard Huo Date: Wed, 29 Apr 2026 12:34:03 -0700 Subject: [PATCH 11/14] fix: using a setup script to install pip in trtllm venv # 
(#117) --- configs/install-trtllm-pip.sh | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100755 configs/install-trtllm-pip.sh diff --git a/configs/install-trtllm-pip.sh b/configs/install-trtllm-pip.sh new file mode 100755 index 00000000..2cfa2df8 --- /dev/null +++ b/configs/install-trtllm-pip.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# WRN to install pip in the dynamo trtllm runtime image's venv + +uv pip install pip From 0f0aa605525f76fad70a7a229151e0cdd2c8acf6 Mon Sep 17 00:00:00 2001 From: Yeswanth koti Date: Wed, 29 Apr 2026 17:06:12 -0400 Subject: [PATCH 12/14] fix: add trtllm venv pip bootstrap to GB300 GLM5 recipes (#120) Add setup_script install-trtllm-pip.sh to all GB300 GLM5 trtllm_dynamo recipes so eval-only jobs can install lm-eval even when pip is missing in the runtime container venv. --- .../ISL1K_OSL1K/MTP/ctx1dep2_gen1dep32_batch8_eplb0_mtp3.yaml | 2 ++ .../MTP/ctx1dep2_gen4tep8_batch16_allconc_eplb0_mtp3.yaml | 2 ++ .../ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch32_eplb0_mtp2.yaml | 2 ++ .../ISL1K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml | 2 ++ .../MTP/ctx2dep2_gen1dep16_batch64_eplb0_mtp2.yaml | 2 ++ .../MTP/ctx2dep2_gen1dep32_batch16_eplb0_mtp3.yaml | 2 ++ .../MTP/ctx3dep2_gen1dep32_batch32_eplb0_mtp2.yaml | 2 ++ .../MTP/ctx4dep2_gen1dep16_batch256_eplb256_mtp1.yaml | 2 ++ .../MTP/ctx5dep2_gen2dep8_batch512_eplb0_mtp1.yaml | 2 ++ .../MTP/ctx6dep2_gen1dep32_batch128_eplb288_mtp1.yaml | 2 ++ .../STP/ctx1dep2_gen1dep32_batch16_eplb0_mtp0.yaml | 2 ++ .../STP/ctx1dep2_gen4tep8_batch64_allconc_eplb0_mtp0.yaml | 2 ++ .../STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml | 2 ++ .../STP/ctx2dep2_gen1dep32_batch32_eplb0_mtp0.yaml | 2 ++ .../STP/ctx3dep2_gen1dep32_batch64_eplb0_mtp0.yaml | 2 ++ .../STP/ctx4dep2_gen1dep16_batch512_eplb256_mtp0.yaml | 4 +++- .../STP/ctx4dep2_gen1dep32_batch128_eplb0_mtp0.yaml | 2 
++ .../STP/ctx6dep2_gen1dep32_batch256_eplb288_mtp0.yaml | 2 ++ .../MTP/ctx12dep2_gen1dep16_batch32_eplb0_mtp2.yaml | 2 ++ .../MTP/ctx13dep2_gen1dep8_batch128_eplb0_mtp1.yaml | 2 ++ .../MTP/ctx15dep2_gen1dep32_batch16_eplb0_mtp3.yaml | 2 ++ .../MTP/ctx18dep2_gen1dep16_batch64_eplb0_mtp1.yaml | 2 ++ .../ISL8K_OSL1K/MTP/ctx1dep2_gen1tep8_batch16_eplb0_mtp3.yaml | 4 +++- .../ISL8K_OSL1K/MTP/ctx1dep2_gen2tep8_batch8_eplb0_mtp3.yaml | 2 ++ .../MTP/ctx1dep2_gen4tep8_batch4_allconc_eplb0_mtp3.yaml | 4 +++- .../ISL8K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml | 2 ++ .../ISL8K_OSL1K/MTP/ctx5dep2_gen1dep32_batch4_eplb0_mtp3.yaml | 2 ++ .../ISL8K_OSL1K/MTP/ctx9dep2_gen1dep32_batch8_eplb0_mtp3.yaml | 2 ++ .../STP/ctx12dep2_gen1dep16_batch64_eplb0_mtp0.yaml | 2 ++ .../STP/ctx15dep2_gen1dep32_batch32_eplb0_mtp0.yaml | 2 ++ .../ISL8K_OSL1K/STP/ctx1dep2_gen2tep8_batch16_eplb0_mtp0.yaml | 4 +++- .../ISL8K_OSL1K/STP/ctx1dep2_gen4tep8_batch1_eplb0_mtp0.yaml | 2 ++ .../STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml | 2 ++ .../STP/ctx20dep2_gen1dep16_batch128_eplb0_mtp0.yaml | 4 +++- .../ISL8K_OSL1K/STP/ctx2dep2_gen3tep8_batch32_eplb0_mtp0.yaml | 2 ++ .../ISL8K_OSL1K/STP/ctx4dep2_gen3tep8_batch64_eplb0_mtp0.yaml | 2 ++ .../STP/ctx9dep2_gen1dep32_batch16_eplb0_mtp0.yaml | 2 ++ 37 files changed, 79 insertions(+), 5 deletions(-) diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen1dep32_batch8_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen1dep32_batch8_eplb0_mtp3.yaml index 0b2b3771..5483257c 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen1dep32_batch8_eplb0_mtp3.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen1dep32_batch8_eplb0_mtp3.yaml @@ -9,6 +9,8 @@ model: container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" +setup_script: "install-trtllm-pip.sh" + resources: gpu_type: "gb300" 
diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch16_allconc_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch16_allconc_eplb0_mtp3.yaml index 3a2447d2..68ce1ced 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch16_allconc_eplb0_mtp3.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch16_allconc_eplb0_mtp3.yaml @@ -9,6 +9,8 @@ model: container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" +setup_script: "install-trtllm-pip.sh" + resources: gpu_type: "gb300" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch32_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch32_eplb0_mtp2.yaml index 671ba92a..05d57bf8 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch32_eplb0_mtp2.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen4tep8_batch32_eplb0_mtp2.yaml @@ -9,6 +9,8 @@ model: container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" +setup_script: "install-trtllm-pip.sh" + resources: gpu_type: "gb300" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml index d916e313..076c7643 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml @@ -9,6 +9,8 @@ model: container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" +setup_script: "install-trtllm-pip.sh" + resources: gpu_type: "gb300" diff 
--git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep16_batch64_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep16_batch64_eplb0_mtp2.yaml index 821b1a1d..08dcf1a4 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep16_batch64_eplb0_mtp2.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep16_batch64_eplb0_mtp2.yaml @@ -9,6 +9,8 @@ model: container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" +setup_script: "install-trtllm-pip.sh" + resources: gpu_type: "gb300" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep32_batch16_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep32_batch16_eplb0_mtp3.yaml index cd7d2abc..930a79d2 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep32_batch16_eplb0_mtp3.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx2dep2_gen1dep32_batch16_eplb0_mtp3.yaml @@ -9,6 +9,8 @@ model: container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" +setup_script: "install-trtllm-pip.sh" + resources: gpu_type: "gb300" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx3dep2_gen1dep32_batch32_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx3dep2_gen1dep32_batch32_eplb0_mtp2.yaml index 7fff09e1..63417d84 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx3dep2_gen1dep32_batch32_eplb0_mtp2.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx3dep2_gen1dep32_batch32_eplb0_mtp2.yaml @@ -9,6 +9,8 @@ model: container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" +setup_script: "install-trtllm-pip.sh" + resources: gpu_type: "gb300" diff --git 
a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx4dep2_gen1dep16_batch256_eplb256_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx4dep2_gen1dep16_batch256_eplb256_mtp1.yaml index 36dd9e05..81863a5b 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx4dep2_gen1dep16_batch256_eplb256_mtp1.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx4dep2_gen1dep16_batch256_eplb256_mtp1.yaml @@ -9,6 +9,8 @@ model: container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" +setup_script: "install-trtllm-pip.sh" + resources: gpu_type: "gb300" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx5dep2_gen2dep8_batch512_eplb0_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx5dep2_gen2dep8_batch512_eplb0_mtp1.yaml index 80338977..3c6551d0 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx5dep2_gen2dep8_batch512_eplb0_mtp1.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx5dep2_gen2dep8_batch512_eplb0_mtp1.yaml @@ -9,6 +9,8 @@ model: container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" +setup_script: "install-trtllm-pip.sh" + resources: gpu_type: "gb300" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx6dep2_gen1dep32_batch128_eplb288_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx6dep2_gen1dep32_batch128_eplb288_mtp1.yaml index 42592680..7d613f26 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx6dep2_gen1dep32_batch128_eplb288_mtp1.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/MTP/ctx6dep2_gen1dep32_batch128_eplb288_mtp1.yaml @@ -9,6 +9,8 @@ model: container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" +setup_script: "install-trtllm-pip.sh" + resources: gpu_type: "gb300" diff 
--git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen1dep32_batch16_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen1dep32_batch16_eplb0_mtp0.yaml index f090f3db..539fd2c6 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen1dep32_batch16_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen1dep32_batch16_eplb0_mtp0.yaml @@ -9,6 +9,8 @@ model: container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" +setup_script: "install-trtllm-pip.sh" + resources: gpu_type: "gb300" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen4tep8_batch64_allconc_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen4tep8_batch64_allconc_eplb0_mtp0.yaml index d9cc7807..19e92ad0 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen4tep8_batch64_allconc_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen4tep8_batch64_allconc_eplb0_mtp0.yaml @@ -9,6 +9,8 @@ model: container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" +setup_script: "install-trtllm-pip.sh" + resources: gpu_type: "gb300" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml index c50c85e9..cf5fc790 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml @@ -9,6 +9,8 @@ model: container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" +setup_script: "install-trtllm-pip.sh" + 
resources: gpu_type: "gb300" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx2dep2_gen1dep32_batch32_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx2dep2_gen1dep32_batch32_eplb0_mtp0.yaml index 9ec32b8a..37d8cc94 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx2dep2_gen1dep32_batch32_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx2dep2_gen1dep32_batch32_eplb0_mtp0.yaml @@ -9,6 +9,8 @@ model: container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" +setup_script: "install-trtllm-pip.sh" + resources: gpu_type: "gb300" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx3dep2_gen1dep32_batch64_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx3dep2_gen1dep32_batch64_eplb0_mtp0.yaml index 2887005e..7477b620 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx3dep2_gen1dep32_batch64_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx3dep2_gen1dep32_batch64_eplb0_mtp0.yaml @@ -9,6 +9,8 @@ model: container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" +setup_script: "install-trtllm-pip.sh" + resources: gpu_type: "gb300" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep16_batch512_eplb256_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep16_batch512_eplb256_mtp0.yaml index ba0e1063..ec55cf3c 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep16_batch512_eplb256_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep16_batch512_eplb256_mtp0.yaml @@ -7,7 +7,9 @@ model: path: "nvidia/GLM5-NVFP4" container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" - + +setup_script: 
"install-trtllm-pip.sh" + resources: gpu_type: "gb300" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep32_batch128_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep32_batch128_eplb0_mtp0.yaml index 81f0cac3..31a8591a 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep32_batch128_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep32_batch128_eplb0_mtp0.yaml @@ -9,6 +9,8 @@ model: container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" +setup_script: "install-trtllm-pip.sh" + resources: gpu_type: "gb300" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx6dep2_gen1dep32_batch256_eplb288_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx6dep2_gen1dep32_batch256_eplb288_mtp0.yaml index fbb91775..c057ebb2 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx6dep2_gen1dep32_batch256_eplb288_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx6dep2_gen1dep32_batch256_eplb288_mtp0.yaml @@ -9,6 +9,8 @@ model: container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" +setup_script: "install-trtllm-pip.sh" + resources: gpu_type: "gb300" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx12dep2_gen1dep16_batch32_eplb0_mtp2.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx12dep2_gen1dep16_batch32_eplb0_mtp2.yaml index ecb7bb5c..1c95cd35 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx12dep2_gen1dep16_batch32_eplb0_mtp2.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx12dep2_gen1dep16_batch32_eplb0_mtp2.yaml @@ -9,6 +9,8 @@ model: container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" +setup_script: 
"install-trtllm-pip.sh" + resources: gpu_type: "gb300" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx13dep2_gen1dep8_batch128_eplb0_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx13dep2_gen1dep8_batch128_eplb0_mtp1.yaml index 956b5de6..bbb5197a 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx13dep2_gen1dep8_batch128_eplb0_mtp1.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx13dep2_gen1dep8_batch128_eplb0_mtp1.yaml @@ -9,6 +9,8 @@ model: container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" +setup_script: "install-trtllm-pip.sh" + resources: gpu_type: "gb300" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx15dep2_gen1dep32_batch16_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx15dep2_gen1dep32_batch16_eplb0_mtp3.yaml index 21f11cd5..9b013f4f 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx15dep2_gen1dep32_batch16_eplb0_mtp3.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx15dep2_gen1dep32_batch16_eplb0_mtp3.yaml @@ -9,6 +9,8 @@ model: container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" +setup_script: "install-trtllm-pip.sh" + resources: gpu_type: "gb300" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx18dep2_gen1dep16_batch64_eplb0_mtp1.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx18dep2_gen1dep16_batch64_eplb0_mtp1.yaml index 358e4b7b..20fca94c 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx18dep2_gen1dep16_batch64_eplb0_mtp1.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx18dep2_gen1dep16_batch64_eplb0_mtp1.yaml @@ -9,6 +9,8 @@ model: container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" +setup_script: 
"install-trtllm-pip.sh" + resources: gpu_type: "gb300" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen1tep8_batch16_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen1tep8_batch16_eplb0_mtp3.yaml index e3e2d993..43927017 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen1tep8_batch16_eplb0_mtp3.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen1tep8_batch16_eplb0_mtp3.yaml @@ -8,7 +8,9 @@ model: path: "nvidia/GLM5-NVFP4" container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" - + +setup_script: "install-trtllm-pip.sh" + resources: gpu_type: "gb300" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen2tep8_batch8_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen2tep8_batch8_eplb0_mtp3.yaml index 649566d8..d92652b8 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen2tep8_batch8_eplb0_mtp3.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen2tep8_batch8_eplb0_mtp3.yaml @@ -9,6 +9,8 @@ model: container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" +setup_script: "install-trtllm-pip.sh" + resources: gpu_type: "gb300" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen4tep8_batch4_allconc_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen4tep8_batch4_allconc_eplb0_mtp3.yaml index e0a55bf5..9b9539c0 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen4tep8_batch4_allconc_eplb0_mtp3.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen4tep8_batch4_allconc_eplb0_mtp3.yaml @@ -8,7 +8,9 @@ model: path: "nvidia/GLM5-NVFP4" container: 
"nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" - + +setup_script: "install-trtllm-pip.sh" + resources: gpu_type: "gb300" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml index 6deefd9f..2612cb70 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx1dep2_gen5tep4_batch1_eplb0_mtp3.yaml @@ -9,6 +9,8 @@ model: container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" +setup_script: "install-trtllm-pip.sh" + resources: gpu_type: "gb300" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx5dep2_gen1dep32_batch4_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx5dep2_gen1dep32_batch4_eplb0_mtp3.yaml index 5bdbb4d9..34d90b87 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx5dep2_gen1dep32_batch4_eplb0_mtp3.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx5dep2_gen1dep32_batch4_eplb0_mtp3.yaml @@ -9,6 +9,8 @@ model: container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" +setup_script: "install-trtllm-pip.sh" + resources: gpu_type: "gb300" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx9dep2_gen1dep32_batch8_eplb0_mtp3.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx9dep2_gen1dep32_batch8_eplb0_mtp3.yaml index acef0e9e..2a136d0a 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx9dep2_gen1dep32_batch8_eplb0_mtp3.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/MTP/ctx9dep2_gen1dep32_batch8_eplb0_mtp3.yaml @@ -9,6 +9,8 @@ model: container: 
"nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" +setup_script: "install-trtllm-pip.sh" + resources: gpu_type: "gb300" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx12dep2_gen1dep16_batch64_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx12dep2_gen1dep16_batch64_eplb0_mtp0.yaml index 5fcd82c6..bce80c02 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx12dep2_gen1dep16_batch64_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx12dep2_gen1dep16_batch64_eplb0_mtp0.yaml @@ -9,6 +9,8 @@ model: container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" +setup_script: "install-trtllm-pip.sh" + resources: gpu_type: "gb300" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx15dep2_gen1dep32_batch32_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx15dep2_gen1dep32_batch32_eplb0_mtp0.yaml index 27122e16..91ef3af7 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx15dep2_gen1dep32_batch32_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx15dep2_gen1dep32_batch32_eplb0_mtp0.yaml @@ -9,6 +9,8 @@ model: container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" +setup_script: "install-trtllm-pip.sh" + resources: gpu_type: "gb300" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen2tep8_batch16_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen2tep8_batch16_eplb0_mtp0.yaml index d8386092..48979338 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen2tep8_batch16_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen2tep8_batch16_eplb0_mtp0.yaml @@ -7,7 +7,9 @@ model: path: "nvidia/GLM5-NVFP4" 
container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" - + +setup_script: "install-trtllm-pip.sh" + resources: gpu_type: "gb300" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen4tep8_batch1_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen4tep8_batch1_eplb0_mtp0.yaml index 0c05b922..3328694f 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen4tep8_batch1_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen4tep8_batch1_eplb0_mtp0.yaml @@ -9,6 +9,8 @@ model: container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" +setup_script: "install-trtllm-pip.sh" + resources: gpu_type: "gb300" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml index a0ed195c..06dc30af 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_allconc_eplb0_mtp0.yaml @@ -9,6 +9,8 @@ model: container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" +setup_script: "install-trtllm-pip.sh" + resources: gpu_type: "gb300" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx20dep2_gen1dep16_batch128_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx20dep2_gen1dep16_batch128_eplb0_mtp0.yaml index 55bf1ae4..dc600b4f 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx20dep2_gen1dep16_batch128_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx20dep2_gen1dep16_batch128_eplb0_mtp0.yaml @@ -8,7 +8,9 @@ 
model: path: "nvidia/GLM5-NVFP4" container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" - + +setup_script: "install-trtllm-pip.sh" + resources: gpu_type: "gb300" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx2dep2_gen3tep8_batch32_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx2dep2_gen3tep8_batch32_eplb0_mtp0.yaml index 1836bb52..9b27d13b 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx2dep2_gen3tep8_batch32_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx2dep2_gen3tep8_batch32_eplb0_mtp0.yaml @@ -9,6 +9,8 @@ model: container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" +setup_script: "install-trtllm-pip.sh" + resources: gpu_type: "gb300" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx4dep2_gen3tep8_batch64_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx4dep2_gen3tep8_batch64_eplb0_mtp0.yaml index 4a022256..d9f0d7fa 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx4dep2_gen3tep8_batch64_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx4dep2_gen3tep8_batch64_eplb0_mtp0.yaml @@ -9,6 +9,8 @@ model: container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" +setup_script: "install-trtllm-pip.sh" + resources: gpu_type: "gb300" diff --git a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx9dep2_gen1dep32_batch16_eplb0_mtp0.yaml b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx9dep2_gen1dep32_batch16_eplb0_mtp0.yaml index 7ea9a2f6..53877eb2 100644 --- a/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx9dep2_gen1dep32_batch16_eplb0_mtp0.yaml +++ b/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx9dep2_gen1dep32_batch16_eplb0_mtp0.yaml @@ -9,6 +9,8 @@ 
model: container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.3" precision: "fp4" +setup_script: "install-trtllm-pip.sh" + resources: gpu_type: "gb300" From fdb1be7ade5c7dec77d716d02e8869177113593d Mon Sep 17 00:00:00 2001 From: Richard Huo Date: Wed, 29 Apr 2026 21:46:47 -0700 Subject: [PATCH 13/14] run setup script before post eval (#123) --- src/srtctl/cli/do_sweep.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/srtctl/cli/do_sweep.py b/src/srtctl/cli/do_sweep.py index 77b79ac5..96c90462 100644 --- a/src/srtctl/cli/do_sweep.py +++ b/src/srtctl/cli/do_sweep.py @@ -274,6 +274,14 @@ def _run_post_eval(self, stop_event: threading.Event) -> int: env_to_set["EVAL_CONC"] = str(max(conc_list)) logger.info("Eval concurrency (max of %s): %s", conc_list, env_to_set["EVAL_CONC"]) + bash_preamble = None + if self.config.setup_script: + script_path = f"/configs/{self.config.setup_script}" + bash_preamble = ( + f"echo 'Running setup script: {script_path}' && " + f"if [ -f '{script_path}' ]; then bash '{script_path}'; else echo 'WARNING: {script_path} not found'; fi" + ) + proc = start_srun_process( command=cmd, nodelist=[self.runtime.nodes.head], @@ -281,6 +289,7 @@ def _run_post_eval(self, stop_event: threading.Event) -> int: container_image=str(self.runtime.container_image), container_mounts=self.runtime.container_mounts, env_to_set=env_to_set, + bash_preamble=bash_preamble, ) while proc.poll() is None: From 9ecc31f1f22c0fd56aa6eee9e1933692d284912f Mon Sep 17 00:00:00 2001 From: Jason Li Date: Thu, 28 May 2026 17:05:44 -0400 Subject: [PATCH 14/14] Add spread worker placement and vLLM colocation (#181) * Add spread_workers option to ResourceConfig Allow placing each partial-node worker on its own node instead of packing multiple onto the same node. Useful when colocating workers on a single node causes resource contention (port collisions, etc.). Caller must reserve enough nodes (e.g. 
set decode_nodes=decode_workers when gpus_per_decode * try fix * allow multiple DEP2 workers per node * multi worker fix * Allow vLLM one-node prefill decode colocation * Avoid same-node worker port collisions * Fix spread workers tests and lint * Cover vLLM colocation guard --------- Co-authored-by: Claude Opus 4.7 (1M context) Co-authored-by: hjjq <50634613+hjjq@users.noreply.github.com> --- src/srtctl/backends/base.py | 1 + src/srtctl/backends/sglang.py | 2 + src/srtctl/backends/trtllm.py | 2 + src/srtctl/backends/vllm.py | 58 ++++++- src/srtctl/cli/do_sweep.py | 1 + src/srtctl/cli/submit.py | 2 +- src/srtctl/core/schema.py | 20 +++ src/srtctl/core/topology.py | 59 +++++-- tests/test_configs.py | 260 ++++++++++++++++++++++++++++++ tests/test_endpoint_allocation.py | 53 +++++- 10 files changed, 437 insertions(+), 21 deletions(-) diff --git a/src/srtctl/backends/base.py b/src/srtctl/backends/base.py index 62904ff1..f8b6e815 100644 --- a/src/srtctl/backends/base.py +++ b/src/srtctl/backends/base.py @@ -82,6 +82,7 @@ def allocate_endpoints( gpus_per_agg: int, gpus_per_node: int, available_nodes: Sequence[str], + spread_workers: bool = False, ) -> list["Endpoint"]: """Allocate logical endpoints based on resource requirements.""" ... 
diff --git a/src/srtctl/backends/sglang.py b/src/srtctl/backends/sglang.py index 1f4b818d..84935744 100644 --- a/src/srtctl/backends/sglang.py +++ b/src/srtctl/backends/sglang.py @@ -179,6 +179,7 @@ def allocate_endpoints( gpus_per_agg: int, gpus_per_node: int, available_nodes: Sequence[str], + spread_workers: bool = False, ) -> list["Endpoint"]: """Allocate endpoints to nodes.""" from srtctl.core.topology import allocate_endpoints @@ -192,6 +193,7 @@ def allocate_endpoints( gpus_per_agg=gpus_per_agg, gpus_per_node=gpus_per_node, available_nodes=available_nodes, + spread_workers=spread_workers, ) def endpoints_to_processes( diff --git a/src/srtctl/backends/trtllm.py b/src/srtctl/backends/trtllm.py index b706ec16..1e3af14e 100644 --- a/src/srtctl/backends/trtllm.py +++ b/src/srtctl/backends/trtllm.py @@ -127,6 +127,7 @@ def allocate_endpoints( gpus_per_agg: int, gpus_per_node: int, available_nodes: Sequence[str], + spread_workers: bool = False, ) -> list["Endpoint"]: """Allocate endpoints to nodes.""" from srtctl.core.topology import allocate_endpoints @@ -140,6 +141,7 @@ def allocate_endpoints( gpus_per_agg=gpus_per_agg, gpus_per_node=gpus_per_node, available_nodes=available_nodes, + spread_workers=spread_workers, ) def endpoints_to_processes( diff --git a/src/srtctl/backends/vllm.py b/src/srtctl/backends/vllm.py index 1acbd50c..ef3058ae 100644 --- a/src/srtctl/backends/vllm.py +++ b/src/srtctl/backends/vllm.py @@ -63,6 +63,7 @@ class VLLMProtocol: backend: type: vllm connector: nixl # translated to --kv-transfer-config JSON + allow_prefill_decode_colocation: true # pack P/D on one node when all workers fit prefill_environment: PYTHONUNBUFFERED: "1" vllm_config: @@ -91,6 +92,11 @@ class VLLMProtocol: # dynamo 1.0.0+: translated to --kv-transfer-config (--connector was removed). connector: str | None = "nixl" + # Allow prefill and decode workers to share one node when the combined GPU + # request fits within gpus_per_node. 
Defaults off to preserve existing P/D + # node separation. + allow_prefill_decode_colocation: bool = False + Schema: ClassVar[builtins.type[Schema]] = Schema # ========================================================================= @@ -154,6 +160,26 @@ def get_served_model_name(self, default: str) -> str: return name return default + def should_colocate_prefill_decode( + self, + *, + num_prefill: int, + num_decode: int, + num_agg: int, + gpus_per_prefill: int, + gpus_per_decode: int, + gpus_per_agg: int, + gpus_per_node: int, + ) -> bool: + """Whether all vLLM workers should be packed onto one node.""" + if not self.allow_prefill_decode_colocation: + return False + if num_prefill <= 0 or num_decode <= 0 or gpus_per_node <= 0: + return False + + total_worker_gpus = num_prefill * gpus_per_prefill + num_decode * gpus_per_decode + num_agg * gpus_per_agg + return total_worker_gpus <= gpus_per_node + def allocate_endpoints( self, num_prefill: int, @@ -164,6 +190,7 @@ def allocate_endpoints( gpus_per_agg: int, gpus_per_node: int, available_nodes: Sequence[str], + spread_workers: bool = False, ) -> list[Endpoint]: """Allocate endpoints to nodes.""" from srtctl.core.topology import allocate_endpoints @@ -177,6 +204,16 @@ def allocate_endpoints( gpus_per_agg=gpus_per_agg, gpus_per_node=gpus_per_node, available_nodes=available_nodes, + spread_workers=spread_workers, + allow_prefill_decode_colocation=self.should_colocate_prefill_decode( + num_prefill=num_prefill, + num_decode=num_decode, + num_agg=num_agg, + gpus_per_prefill=gpus_per_prefill, + gpus_per_decode=gpus_per_decode, + gpus_per_agg=gpus_per_agg, + gpus_per_node=gpus_per_node, + ), ) def _is_dp_mode(self, mode: WorkerMode) -> bool: @@ -249,6 +286,13 @@ def endpoints_to_processes( # DP+EP mode: one process per GPU # Each process gets a single GPU and a unique dp_rank dp_rank = 0 + # Allocate a unique DP RPC port for this endpoint's leader node + dp_rpc_port = port_allocator.next_dp_rpc_port(endpoint.leader_node) + # 
Allocate a single NIXL base port for this endpoint. + # vLLM internally computes: actual_port = base + data_parallel_rank + # so all DP ranks in the endpoint share the same base port. + dp_size = self._get_dp_size(endpoint.mode) or len(endpoint.gpu_indices) + nixl_base_port = port_allocator.next_nixl_port_block(dp_size) for _node_rank, node in enumerate(endpoint.nodes): for gpu_idx in sorted(endpoint.gpu_indices): is_leader = dp_rank == 0 @@ -259,7 +303,7 @@ def endpoints_to_processes( else None ) kv_events_port = port_allocator.next_kv_events_port() - nixl_port = port_allocator.next_nixl_port() + nixl_port = nixl_base_port processes.append( Process( @@ -273,6 +317,7 @@ def endpoints_to_processes( bootstrap_port=bootstrap_port, kv_events_port=kv_events_port, nixl_port=nixl_port, + dp_rpc_port=dp_rpc_port, ) ) current_sys_port += 1 @@ -356,7 +401,16 @@ def build_worker_command( # DP+EP mode: each GPU runs its own process # process.node_rank is the dp_rank (set in endpoints_to_processes) dp_rank = process.node_rank - dp_rpc_port = config.pop("data-parallel-rpc-port", None) or config.pop("data_parallel_rpc_port", 13345) + # Use the per-endpoint dp_rpc_port allocated by NodePortAllocator + # (avoids port collisions when multiple endpoints share a node) + dp_rpc_port = ( + process.dp_rpc_port + or config.pop("data-parallel-rpc-port", None) + or config.pop("data_parallel_rpc_port", 13345) + ) + # Pop from config so it doesn't get added again by _config_to_cli_args + config.pop("data-parallel-rpc-port", None) + config.pop("data_parallel_rpc_port", None) cmd.extend( [ diff --git a/src/srtctl/cli/do_sweep.py b/src/srtctl/cli/do_sweep.py index 96c90462..e0dd39c7 100644 --- a/src/srtctl/cli/do_sweep.py +++ b/src/srtctl/cli/do_sweep.py @@ -76,6 +76,7 @@ def endpoints(self) -> list[Endpoint]: gpus_per_agg=r.gpus_per_agg, gpus_per_node=r.gpus_per_node, available_nodes=self.runtime.nodes.worker, + spread_workers=r.spread_workers, ) @functools.cached_property diff --git 
a/src/srtctl/cli/submit.py b/src/srtctl/cli/submit.py index 21f26d9f..39325c1b 100644 --- a/src/srtctl/cli/submit.py +++ b/src/srtctl/cli/submit.py @@ -197,7 +197,7 @@ def generate_minimal_sbatch_script( env = Environment(loader=FileSystemLoader(str(template_dir))) template = env.get_template("job_script_minimal.j2") - total_nodes = config.resources.total_nodes + total_nodes = config.total_nodes # Add extra node for dedicated etcd/nats infrastructure if config.infra.etcd_nats_dedicated_node: total_nodes += 1 diff --git a/src/srtctl/core/schema.py b/src/srtctl/core/schema.py index c535be39..fd59e237 100644 --- a/src/srtctl/core/schema.py +++ b/src/srtctl/core/schema.py @@ -396,6 +396,11 @@ class ResourceConfig: agg_nodes: int | None = None agg_workers: int | None = None + # If True, place each partial-node worker on its own node instead of + # packing multiple onto the same node. Caller must reserve enough nodes + # (e.g. set decode_nodes=decode_workers when gpus_per_decode str: default = Path(self.model.path).name return self.backend.get_served_model_name(default) + @property + def total_nodes(self) -> int: + """Worker node count, adjusted for backend-specific packing.""" + if isinstance(self.backend, VLLMProtocol) and self.backend.should_colocate_prefill_decode( + num_prefill=self.resources.num_prefill, + num_decode=self.resources.num_decode, + num_agg=self.resources.num_agg, + gpus_per_prefill=self.resources.gpus_per_prefill, + gpus_per_decode=self.resources.gpus_per_decode, + gpus_per_agg=self.resources.gpus_per_agg, + gpus_per_node=self.resources.gpus_per_node, + ): + return 1 + return self.resources.total_nodes + @property def backend_type(self) -> str: """Get the backend type string.""" diff --git a/src/srtctl/core/topology.py b/src/srtctl/core/topology.py index f2a24e5d..1ec4bf24 100644 --- a/src/srtctl/core/topology.py +++ b/src/srtctl/core/topology.py @@ -35,8 +35,9 @@ class NodePortAllocator: assignments per node and hands out the next available port. 
Port ranges (non-overlapping): - - kv_events_port: 5550+ (global) - ZMQ port for kv-events publishing - - nixl_port: 6550+ (global) - NIXL side channel for KV transfers (vLLM) + - kv_events_port: 20000+ (global) - ZMQ port for kv-events publishing + - nixl_port: 21000+ (global) - NIXL side channel for KV transfers (vLLM) + - dp_rpc_port: 13345+ (per node) - DP coordination port (vLLM data-parallel) - http_port: 30000+ (per node) - HTTP serving port - bootstrap_port: 31000+ (per node) - P/D coordination port (prefill only) @@ -53,11 +54,13 @@ class NodePortAllocator: base_http_port: int = 30000 base_bootstrap_port: int = 31000 - base_kv_events_port: int = 5550 - base_nixl_port: int = 6550 # NIXL side channel ports (must not overlap with kv_events) + base_kv_events_port: int = 20000 + base_nixl_port: int = 21000 # NIXL side channel ports (must not overlap with kv_events) + base_dp_rpc_port: int = 13345 # DP coordination port for vLLM data-parallel _http_ports: dict[str, int] = field(default_factory=dict, repr=False) _bootstrap_ports: dict[str, int] = field(default_factory=dict, repr=False) + _dp_rpc_ports: dict[str, int] = field(default_factory=dict, repr=False) _next_kv_events_port: int = field(default=0, repr=False) # Global counter _next_nixl_port: int = field(default=0, repr=False) # Global counter for NIXL @@ -66,7 +69,7 @@ def next_http_port(self, node: str) -> int: if node not in self._http_ports: self._http_ports[node] = self.base_http_port port = self._http_ports[node] - self._http_ports[node] += 1000 + self._http_ports[node] += 1 return port def next_bootstrap_port(self, node: str) -> int: @@ -93,6 +96,32 @@ def next_nixl_port(self) -> int: self._next_nixl_port += 1 return port + def next_nixl_port_block(self, size: int) -> int: + """Reserve a block of consecutive NIXL ports, return the base port. 
+ + Used in DP mode where vLLM computes: + actual_port = VLLM_NIXL_SIDE_CHANNEL_PORT + data_parallel_rank + All DP ranks within an endpoint share the same base port, so we + must reserve `size` ports to avoid collisions with other endpoints. + """ + if self._next_nixl_port == 0: + self._next_nixl_port = self.base_nixl_port + port = self._next_nixl_port + self._next_nixl_port += size + return port + + def next_dp_rpc_port(self, node: str) -> int: + """Get next available DP RPC port for a node. + + When multiple DP endpoints share a node, each needs a unique + data-parallel-rpc-port to avoid bind collisions. + """ + if node not in self._dp_rpc_ports: + self._dp_rpc_ports[node] = self.base_dp_rpc_port + port = self._dp_rpc_ports[node] + self._dp_rpc_ports[node] += 1 + return port + @dataclass(frozen=True) class Endpoint: @@ -167,6 +196,7 @@ class Process: bootstrap_port: int | None = None kv_events_port: int | None = None nixl_port: int | None = None + dp_rpc_port: int | None = None @property def is_leader(self) -> bool: @@ -188,6 +218,8 @@ def allocate_endpoints( gpus_per_agg: int, gpus_per_node: int, available_nodes: Sequence[str], + spread_workers: bool = False, + allow_prefill_decode_colocation: bool = False, ) -> list[Endpoint]: """Allocate endpoints to nodes based on GPU requirements. @@ -202,6 +234,11 @@ def allocate_endpoints( gpus_per_agg: GPUs per agg worker gpus_per_node: GPUs available per node available_nodes: List of available node hostnames + spread_workers: If True, place each partial-node worker on its own + node instead of packing multiple onto the same node. Requires the + caller to reserve enough nodes (one per worker per mode). + allow_prefill_decode_colocation: If True, decode workers may use + remaining GPUs on a node already used by prefill workers. 
Returns: List of Endpoint objects with node assignments @@ -326,7 +363,7 @@ def allocate_workers_simple(mode: WorkerMode, count: int, gpus_per_worker: int) gpu_indices = frozenset(range(gpu_offset, gpu_offset + gpus_per_worker)) gpu_offset += gpus_per_worker - if gpu_offset >= gpus_per_node: + if gpu_offset >= gpus_per_node or spread_workers: node_idx += 1 gpu_offset = 0 @@ -346,13 +383,13 @@ def allocate_workers_simple(mode: WorkerMode, count: int, gpus_per_worker: int) if num_prefill > 0: endpoints.extend(allocate_workers_simple("prefill", num_prefill, gpus_per_prefill)) - # When there's a partial allocation on the current node (gpu_offset > 0) and - # there are more nodes available, advance to ensure prefill and decode don't - # share a node. This prevents the bug where a multi-node decode worker overlaps - # with a partial-node prefill worker. + # By default, when there's a partial allocation on the current node + # (gpu_offset > 0) and there are more nodes available, advance to ensure + # prefill and decode don't share a node. This prevents the bug where a + # multi-node decode worker overlaps with a partial-node prefill worker. # When there are no more nodes (decode_nodes=0 config), allow sharing. 
if num_decode > 0: - if gpu_offset > 0 and (node_idx + 1) < len(available_nodes): + if not allow_prefill_decode_colocation and gpu_offset > 0 and (node_idx + 1) < len(available_nodes): node_idx += 1 gpu_offset = 0 endpoints.extend(allocate_workers_simple("decode", num_decode, gpus_per_decode)) diff --git a/tests/test_configs.py b/tests/test_configs.py index 0b4138d5..ea3a87ce 100644 --- a/tests/test_configs.py +++ b/tests/test_configs.py @@ -891,6 +891,226 @@ def test_sbatch_normal_node_count_without_dedicated_infra(self): # Should request 2 nodes: just the workers assert "#SBATCH --nodes=2" in script + def test_vllm_colocation_reduces_sbatch_to_one_node_when_fit(self): + """Test vLLM P/D colocation requests one worker node when all workers fit.""" + from pathlib import Path + + from srtctl.backends import VLLMProtocol + from srtctl.cli.submit import generate_minimal_sbatch_script + from srtctl.core.schema import InfraConfig, ModelConfig, ResourceConfig, SrtConfig + + config = SrtConfig( + name="test", + model=ModelConfig(path="/model", container="/container.sqsh", precision="fp8"), + resources=ResourceConfig( + gpu_type="h100", + gpus_per_node=8, + prefill_nodes=1, + decode_nodes=1, + prefill_workers=1, + decode_workers=1, + _explicit_gpus_per_prefill=4, + _explicit_gpus_per_decode=4, + ), + backend=VLLMProtocol(allow_prefill_decode_colocation=True), + infra=InfraConfig(etcd_nats_dedicated_node=False), + ) + + assert config.resources.total_nodes == 2 + assert config.total_nodes == 1 + + script = generate_minimal_sbatch_script(config, Path("/tmp/test.yaml")) + + assert "#SBATCH --nodes=1" in script + + def test_vllm_colocation_keeps_normal_node_count_when_not_fit(self): + """Test vLLM P/D colocation does not reduce nodes when workers exceed one node.""" + from srtctl.backends import VLLMProtocol + from srtctl.core.schema import ModelConfig, ResourceConfig, SrtConfig + + config = SrtConfig( + name="test", + model=ModelConfig(path="/model", 
container="/container.sqsh", precision="fp8"), + resources=ResourceConfig( + gpu_type="h100", + gpus_per_node=8, + prefill_nodes=1, + decode_nodes=1, + prefill_workers=1, + decode_workers=1, + _explicit_gpus_per_prefill=6, + _explicit_gpus_per_decode=4, + ), + backend=VLLMProtocol(allow_prefill_decode_colocation=True), + ) + + assert config.total_nodes == 2 + + +class TestVLLMPrefillDecodeColocation: + """Tests for vLLM prefill/decode same-node packing.""" + + def test_disabled_by_default_keeps_prefill_and_decode_separate(self): + """Test vLLM preserves default P/D node separation.""" + from srtctl.backends import VLLMProtocol + + endpoints = VLLMProtocol().allocate_endpoints( + num_prefill=1, + num_decode=1, + num_agg=0, + gpus_per_prefill=4, + gpus_per_decode=4, + gpus_per_agg=0, + gpus_per_node=8, + available_nodes=("node0", "node1"), + ) + + assert endpoints[0].mode == "prefill" + assert endpoints[0].nodes == ("node0",) + assert endpoints[1].mode == "decode" + assert endpoints[1].nodes == ("node1",) + + def test_colocation_requires_prefill_decode_and_valid_node_size(self): + """Test vLLM colocation stays off for incomplete or invalid P/D topology.""" + from srtctl.backends import VLLMProtocol + + backend = VLLMProtocol(allow_prefill_decode_colocation=True) + + for num_prefill, num_decode, gpus_per_node in ((0, 1, 8), (1, 0, 8), (1, 1, 0)): + assert not backend.should_colocate_prefill_decode( + num_prefill=num_prefill, + num_decode=num_decode, + num_agg=0, + gpus_per_prefill=4, + gpus_per_decode=4, + gpus_per_agg=0, + gpus_per_node=gpus_per_node, + ) + + def test_enabled_packs_prefill_and_decode_when_one_node_fits(self): + """Test vLLM packs P/D workers together when requested and all fit.""" + from srtctl.backends import VLLMProtocol + + endpoints = VLLMProtocol(allow_prefill_decode_colocation=True).allocate_endpoints( + num_prefill=2, + num_decode=2, + num_agg=0, + gpus_per_prefill=2, + gpus_per_decode=2, + gpus_per_agg=0, + gpus_per_node=8, + 
available_nodes=("node0", "node1"), + ) + + prefill_eps = [ep for ep in endpoints if ep.mode == "prefill"] + decode_eps = [ep for ep in endpoints if ep.mode == "decode"] + + assert [ep.nodes for ep in prefill_eps] == [("node0",), ("node0",)] + assert [ep.gpu_indices for ep in prefill_eps] == [frozenset({0, 1}), frozenset({2, 3})] + assert [ep.nodes for ep in decode_eps] == [("node0",), ("node0",)] + assert [ep.gpu_indices for ep in decode_eps] == [frozenset({4, 5}), frozenset({6, 7})] + + def test_same_node_prefill_decode_ports_do_not_collide(self): + """Test same-node vLLM P/D workers get distinct listener ports.""" + from srtctl.backends import VLLMProtocol + + backend = VLLMProtocol(allow_prefill_decode_colocation=True) + endpoints = backend.allocate_endpoints( + num_prefill=1, + num_decode=1, + num_agg=0, + gpus_per_prefill=4, + gpus_per_decode=4, + gpus_per_agg=0, + gpus_per_node=8, + available_nodes=("node0", "node1"), + ) + + processes = backend.endpoints_to_processes(endpoints) + prefill = next(p for p in processes if p.endpoint_mode == "prefill") + decode = next(p for p in processes if p.endpoint_mode == "decode") + + assert prefill.node == decode.node == "node0" + assert prefill.http_port == 30000 + assert decode.http_port == 30001 + assert prefill.bootstrap_port == 31000 + + bound_ports = [ + port + for process in processes + for port in (process.http_port, process.bootstrap_port, process.kv_events_port, process.nixl_port) + if port + ] + assert len(bound_ports) == len(set(bound_ports)) + + def test_same_node_dp_prefill_decode_ports_do_not_collide(self): + """Test same-node DP P/D endpoints get distinct per-endpoint port ranges.""" + from srtctl.backends import VLLMProtocol, VLLMServerConfig + + backend = VLLMProtocol( + allow_prefill_decode_colocation=True, + vllm_config=VLLMServerConfig( + prefill={"data-parallel-size": 4, "enable-expert-parallel": True}, + decode={"data-parallel-size": 4, "enable-expert-parallel": True}, + ), + ) + endpoints = 
backend.allocate_endpoints( + num_prefill=1, + num_decode=1, + num_agg=0, + gpus_per_prefill=4, + gpus_per_decode=4, + gpus_per_agg=0, + gpus_per_node=8, + available_nodes=("node0", "node1"), + ) + + processes = backend.endpoints_to_processes(endpoints) + prefill = [p for p in processes if p.endpoint_mode == "prefill"] + decode = [p for p in processes if p.endpoint_mode == "decode"] + + assert len(prefill) == 4 + assert len(decode) == 4 + assert {p.node for p in prefill + decode} == {"node0"} + assert {p.dp_rpc_port for p in prefill} == {13345} + assert {p.dp_rpc_port for p in decode} == {13346} + assert {p.nixl_port for p in prefill} == {21000} + assert {p.nixl_port for p in decode} == {21004} + + leader_ports = [ + port + for process in prefill + decode + for port in (process.http_port, process.bootstrap_port) + if port + ] + assert sorted(leader_ports) == [30000, 30001, 31000] + + prefill_actual_nixl_ports = {next(iter(p.nixl_port for p in prefill)) + p.node_rank for p in prefill} + decode_actual_nixl_ports = {next(iter(p.nixl_port for p in decode)) + p.node_rank for p in decode} + assert prefill_actual_nixl_ports == {21000, 21001, 21002, 21003} + assert decode_actual_nixl_ports == {21004, 21005, 21006, 21007} + assert prefill_actual_nixl_ports.isdisjoint(decode_actual_nixl_ports) + + def test_enabled_does_not_pack_when_one_node_does_not_fit(self): + """Test vLLM falls back to separated P/D nodes when total GPUs do not fit.""" + from srtctl.backends import VLLMProtocol + + endpoints = VLLMProtocol(allow_prefill_decode_colocation=True).allocate_endpoints( + num_prefill=1, + num_decode=1, + num_agg=0, + gpus_per_prefill=6, + gpus_per_decode=4, + gpus_per_agg=0, + gpus_per_node=8, + available_nodes=("node0", "node1"), + ) + + assert endpoints[0].mode == "prefill" + assert endpoints[0].nodes == ("node0",) + assert endpoints[1].mode == "decode" + assert endpoints[1].nodes == ("node1",) + class TestVLLMDataParallelMode: """Tests for vLLM DP+EP (Data Parallel + Expert 
Parallel) mode.""" @@ -965,6 +1185,46 @@ def test_dp_mode_creates_per_gpu_processes(self): dp_ranks = [p.node_rank for p in processes] assert dp_ranks == list(range(16)) + def test_dp_mode_allocates_unique_ports_for_multiple_endpoints_per_node(self): + """Test DP endpoints sharing a node get non-colliding coordination ports.""" + from srtctl.backends import VLLMProtocol, VLLMServerConfig + from srtctl.core.topology import Endpoint + + backend = VLLMProtocol( + vllm_config=VLLMServerConfig( + decode={"data-parallel-size": 4, "enable-expert-parallel": True}, + ) + ) + + endpoints = [ + Endpoint( + mode="decode", + index=0, + nodes=("node0",), + gpu_indices=frozenset(range(4)), + gpus_per_node=8, + ), + Endpoint( + mode="decode", + index=1, + nodes=("node0",), + gpu_indices=frozenset(range(4, 8)), + gpus_per_node=8, + ), + ] + + processes = backend.endpoints_to_processes(endpoints) + + first_endpoint = [p for p in processes if p.endpoint_index == 0] + second_endpoint = [p for p in processes if p.endpoint_index == 1] + + assert {p.dp_rpc_port for p in first_endpoint} == {13345} + assert {p.dp_rpc_port for p in second_endpoint} == {13346} + assert {p.nixl_port for p in first_endpoint} == {21000} + assert {p.nixl_port for p in second_endpoint} == {21004} + assert [p.node_rank for p in first_endpoint] == list(range(4)) + assert [p.node_rank for p in second_endpoint] == list(range(4)) + def test_dp_mode_command_includes_dp_flags(self): """Test that DP mode command includes correct DP flags instead of TP flags.""" from pathlib import Path diff --git a/tests/test_endpoint_allocation.py b/tests/test_endpoint_allocation.py index 1625e6b7..6674ae87 100644 --- a/tests/test_endpoint_allocation.py +++ b/tests/test_endpoint_allocation.py @@ -137,6 +137,45 @@ def test_aggregated_mode(self): assert ep.mode == "agg" assert ep.total_gpus == 4 + def test_spread_workers_partial_node(self): + """spread_workers=True forces each partial-node worker onto its own node.""" + endpoints = 
allocate_endpoints( + num_prefill=1, + num_decode=2, + num_agg=0, + gpus_per_prefill=1, + gpus_per_decode=2, + gpus_per_agg=0, + gpus_per_node=4, + available_nodes=("node0", "node1", "node2"), + spread_workers=True, + ) + + decode_eps = [e for e in endpoints if e.mode == "decode"] + assert len(decode_eps) == 2 + # Without spread_workers both decode workers would land on node1. + assert decode_eps[0].nodes == ("node1",) + assert decode_eps[1].nodes == ("node2",) + assert decode_eps[0].gpu_indices == frozenset({0, 1}) + assert decode_eps[1].gpu_indices == frozenset({0, 1}) + + def test_spread_workers_default_packs(self): + """spread_workers=False (default) packs partial-node workers onto the same node.""" + endpoints = allocate_endpoints( + num_prefill=0, + num_decode=2, + num_agg=0, + gpus_per_prefill=0, + gpus_per_decode=2, + gpus_per_agg=0, + gpus_per_node=4, + available_nodes=("node0", "node1"), + ) + + decode_eps = [e for e in endpoints if e.mode == "decode"] + assert decode_eps[0].nodes == ("node0",) + assert decode_eps[1].nodes == ("node0",) + def test_prefill_decode_never_share_node_partial_allocation(self): """Test that prefill and decode workers are never colocated on the same node. 
@@ -317,9 +356,9 @@ def test_kv_events_port_allocation(self): assert all(port is not None for port in kv_ports), "All processes should have kv_events_port" assert len(kv_ports) == len(set(kv_ports)), "All kv_events_ports should be globally unique" - # Ports should be sequential starting from 5550 + # Ports should be sequential starting from 20000 # With 2 prefill + 2 decode workers, each on single node = 4 processes = 4 ports - assert sorted(kv_ports) == [5550, 5551, 5552, 5553] + assert sorted(kv_ports) == [20000, 20001, 20002, 20003] def test_kv_events_port_same_node_unique(self): """Test kv_events_port is unique even when workers share a node.""" @@ -341,11 +380,11 @@ def test_kv_events_port_same_node_unique(self): assert len(processes) == 2 assert processes[0].node == processes[1].node == "node0" assert processes[0].kv_events_port != processes[1].kv_events_port - assert processes[0].kv_events_port == 5550 - assert processes[1].kv_events_port == 5551 + assert processes[0].kv_events_port == 20000 + assert processes[1].kv_events_port == 20001 def test_nixl_port_allocation(self): - """Test NIXL ports are allocated globally unique starting at 6550.""" + """Test NIXL ports are allocated globally unique starting at 21000.""" from srtctl.core.topology import Endpoint endpoints = [ @@ -371,5 +410,5 @@ def test_nixl_port_allocation(self): nixl_ports = [p.nixl_port for p in processes] assert all(port is not None for port in nixl_ports), "All processes should have nixl_port" assert len(nixl_ports) == len(set(nixl_ports)) # All unique - assert min(nixl_ports) == 6550 # Starts at base - assert nixl_ports == [6550, 6551] # Sequential + assert min(nixl_ports) == 21000 # Starts at base + assert nixl_ports == [21000, 21001] # Sequential