From a09988a45731099ffb5436d753c806529cfa3f74 Mon Sep 17 00:00:00 2001 From: Helen Ngo Date: Thu, 19 Feb 2026 17:54:24 -0800 Subject: [PATCH 01/15] Change from linear to exponentially decay cudagraph sizes --- .../core/inference/batch_dimensions_utils.py | 48 +++++++++---------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/megatron/core/inference/batch_dimensions_utils.py b/megatron/core/inference/batch_dimensions_utils.py index 77354d59320..240844a44d1 100644 --- a/megatron/core/inference/batch_dimensions_utils.py +++ b/megatron/core/inference/batch_dimensions_utils.py @@ -210,7 +210,7 @@ class CUDAGraphBatchDimensionBuilder: """ # Constant for rounding token counts when generating CUDA graph batch dimensions - CUDA_GRAPH_ROUNDER = 8 + CUDA_GRAPH_ROUNDER = 2 @staticmethod def _calculate_cuda_graph_token_counts( @@ -219,8 +219,9 @@ def _calculate_cuda_graph_token_counts( """ Calculate CUDA graph token counts for a given configuration. - This method computes evenly-spaced token counts from step_size up to - cuda_graph_max_tokens, ensuring proper rounding and TP alignment. + This method computes exponentially-decreasing token counts (powers of 2) + from cuda_graph_max_tokens down to CUDA_GRAPH_ROUNDER, ensuring proper + rounding and TP alignment. Args: tp_size: Tensor parallel size (for alignment) @@ -232,38 +233,35 @@ def _calculate_cuda_graph_token_counts( Example: >>> _calculate_cuda_graph_token_counts - (tp_size=2, num_cuda_graphs=4, cuda_graph_max_tokens=1000) - [1000, 752, 504, 256] + (tp_size=1, num_cuda_graphs=8, cuda_graph_max_tokens=128) + [128, 64, 32, 16, 8, 4, 2] """ assert num_cuda_graphs >= 1, f"num_cuda_graphs must be >= 1, got {num_cuda_graphs}" assert ( cuda_graph_max_tokens > 0 ), f"cuda_graph_max_tokens must be > 0, got {cuda_graph_max_tokens}" - # Cuda graph step size. 
- cuda_graph_step_size = cuda_graph_max_tokens / num_cuda_graphs - cuda_graph_step_size = CUDAGraphBatchDimensionBuilder.CUDA_GRAPH_ROUNDER * int( - math.ceil(int(cuda_graph_step_size) / CUDAGraphBatchDimensionBuilder.CUDA_GRAPH_ROUNDER) - ) - # Make sure divisible by TP size - cuda_graph_step_size = math.ceil(cuda_graph_step_size / tp_size) * tp_size + rounder = CUDAGraphBatchDimensionBuilder.CUDA_GRAPH_ROUNDER - # round down cuda graph max tokens to be multiple of TP size + # Round down cuda graph max tokens to be multiple of TP size cuda_graph_max_tokens = (cuda_graph_max_tokens // tp_size) * tp_size - # Cuda graph token counts. if num_cuda_graphs == 1: - cuda_graph_token_counts = [cuda_graph_max_tokens] - else: - cuda_graph_token_counts = list( - range(cuda_graph_step_size, cuda_graph_max_tokens, cuda_graph_step_size) - ) - if ( - len(cuda_graph_token_counts) == 0 - or cuda_graph_token_counts[-1] != cuda_graph_max_tokens - ): - cuda_graph_token_counts.append(cuda_graph_max_tokens) - cuda_graph_token_counts.reverse() + return [cuda_graph_max_tokens] + + # Exponentially decreasing, stops after num_cuda_graphs entries + # or when below the minimum size. 
+ cuda_graph_token_counts = [] + val = cuda_graph_max_tokens + for _ in range(num_cuda_graphs): + # Round down to multiple of rounder, then up to multiple of TP size + rounded = max(rounder, (val // rounder) * rounder) + rounded = math.ceil(rounded / tp_size) * tp_size + if rounded not in cuda_graph_token_counts: + cuda_graph_token_counts.append(rounded) + val //= 2 + if val < rounder: + break return cuda_graph_token_counts From 20798afd428dd39e5c9ec3f3e2e13aacee8c5211 Mon Sep 17 00:00:00 2001 From: Helen Ngo Date: Thu, 19 Feb 2026 18:14:24 -0800 Subject: [PATCH 02/15] Maybe include a size-1 graph --- megatron/core/inference/batch_dimensions_utils.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/megatron/core/inference/batch_dimensions_utils.py b/megatron/core/inference/batch_dimensions_utils.py index 240844a44d1..1c2a56c1692 100644 --- a/megatron/core/inference/batch_dimensions_utils.py +++ b/megatron/core/inference/batch_dimensions_utils.py @@ -263,6 +263,11 @@ def _calculate_cuda_graph_token_counts( if val < rounder: break + # Include a (possibly extra) size-1 graph + min_token_count = math.ceil(1 / tp_size) * tp_size + if cuda_graph_token_counts[-1] != min_token_count: + cuda_graph_token_counts.append(min_token_count) + return cuda_graph_token_counts @staticmethod From 3c718e9ee5f9c2d5efc6604118b7107d8baafe96 Mon Sep 17 00:00:00 2001 From: Helen Ngo Date: Fri, 20 Feb 2026 10:11:16 -0800 Subject: [PATCH 03/15] Update test_cuda_graph_token_counts --- .../inference/engines/test_dynamic_engine.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/unit_tests/inference/engines/test_dynamic_engine.py b/tests/unit_tests/inference/engines/test_dynamic_engine.py index d71ccccd49a..f84c74f5a31 100644 --- a/tests/unit_tests/inference/engines/test_dynamic_engine.py +++ b/tests/unit_tests/inference/engines/test_dynamic_engine.py @@ -694,11 +694,11 @@ def test_cuda_graph_token_counts(self) -> None: for num_cuda_graphs, 
expected_cuda_graph_token_counts in [ (0, [80]), (1, [80]), - (2, [80, 40]), - (4, [80, 72, 48, 24]), - (8, [80, 64, 48, 32, 16]), - (16, [80, 72, 64, 56, 48, 40, 32, 24, 16, 8]), - (32, [80, 72, 64, 56, 48, 40, 32, 24, 16, 8]), + (2, [80, 40, 1]), + (4, [80, 40, 20, 10, 1]), + (8, [80, 40, 20, 10, 4, 2, 1]), + (16, [80, 40, 20, 10, 4, 2, 1]), + (32, [80, 40, 20, 10, 4, 2, 1]), ]: # Build cuda graphs (inside dynamic engine). From cafe6af237b55c59ea245eb7f25604f4a1a01092 Mon Sep 17 00:00:00 2001 From: Helen Ngo Date: Mon, 23 Feb 2026 08:13:31 -0800 Subject: [PATCH 04/15] address comments --- .../core/inference/batch_dimensions_utils.py | 16 ++++++-- .../inference/engines/test_dynamic_engine.py | 38 ++++++++++++++++++- 2 files changed, 48 insertions(+), 6 deletions(-) diff --git a/megatron/core/inference/batch_dimensions_utils.py b/megatron/core/inference/batch_dimensions_utils.py index 1c2a56c1692..fd00338225d 100644 --- a/megatron/core/inference/batch_dimensions_utils.py +++ b/megatron/core/inference/batch_dimensions_utils.py @@ -234,7 +234,7 @@ def _calculate_cuda_graph_token_counts( Example: >>> _calculate_cuda_graph_token_counts (tp_size=1, num_cuda_graphs=8, cuda_graph_max_tokens=128) - [128, 64, 32, 16, 8, 4, 2] + [128, 64, 32, 16, 8, 4, 2, 1] """ assert num_cuda_graphs >= 1, f"num_cuda_graphs must be >= 1, got {num_cuda_graphs}" assert ( @@ -251,6 +251,7 @@ def _calculate_cuda_graph_token_counts( # Exponentially decreasing, stops after num_cuda_graphs entries # or when below the minimum size. + # TODO(helenn/lmcafee): Extend upper range of distribution to be linearly-spaced. 
cuda_graph_token_counts = [] val = cuda_graph_max_tokens for _ in range(num_cuda_graphs): @@ -263,10 +264,17 @@ def _calculate_cuda_graph_token_counts( if val < rounder: break + # Ensure cuda_graph_max_tokens is always included + if cuda_graph_token_counts[0] != cuda_graph_max_tokens: + cuda_graph_token_counts.insert(0, cuda_graph_max_tokens) + # Include a (possibly extra) size-1 graph - min_token_count = math.ceil(1 / tp_size) * tp_size - if cuda_graph_token_counts[-1] != min_token_count: - cuda_graph_token_counts.append(min_token_count) + if cuda_graph_token_counts[-1] != tp_size: + cuda_graph_token_counts.append(tp_size) + + # Trim from the middle if we exceed num_cuda_graphs requested by the user + while len(cuda_graph_token_counts) > num_cuda_graphs: + cuda_graph_token_counts.pop(-2) return cuda_graph_token_counts diff --git a/tests/unit_tests/inference/engines/test_dynamic_engine.py b/tests/unit_tests/inference/engines/test_dynamic_engine.py index f84c74f5a31..524b7efa8df 100644 --- a/tests/unit_tests/inference/engines/test_dynamic_engine.py +++ b/tests/unit_tests/inference/engines/test_dynamic_engine.py @@ -15,6 +15,7 @@ from transformer_engine.pytorch.fp8 import check_fp8_support from megatron.core import parallel_state +from megatron.core.inference.batch_dimensions_utils import CUDAGraphBatchDimensionBuilder from megatron.core.inference.config import ( InferenceConfig, KVCacheManagementMode, @@ -694,8 +695,8 @@ def test_cuda_graph_token_counts(self) -> None: for num_cuda_graphs, expected_cuda_graph_token_counts in [ (0, [80]), (1, [80]), - (2, [80, 40, 1]), - (4, [80, 40, 20, 10, 1]), + (2, [80, 1]), + (4, [80, 40, 20, 1]), (8, [80, 40, 20, 10, 4, 2, 1]), (16, [80, 40, 20, 10, 4, 2, 1]), (32, [80, 40, 20, 10, 4, 2, 1]), @@ -716,6 +717,39 @@ def test_cuda_graph_token_counts(self) -> None: actual_cuda_graph_token_counts, ) + @pytest.mark.internal + @pytest.mark.parametrize( + "tp_size, num_cuda_graphs, cuda_graph_max_tokens, expected", + [ + # TP=1 + (1, 1, 
80, [80]), + (1, 2, 80, [80, 1]), + (1, 4, 80, [80, 40, 20, 1]), + (1, 8, 80, [80, 40, 20, 10, 4, 2, 1]), + (1, 16, 80, [80, 40, 20, 10, 4, 2, 1]), + # TP=2 + (2, 1, 80, [80]), + (2, 2, 80, [80, 2]), + (2, 4, 80, [80, 40, 20, 2]), + (2, 8, 80, [80, 40, 20, 10, 4, 2]), + (2, 16, 80, [80, 40, 20, 10, 4, 2]), + ], + ) + def test_calculate_cuda_graph_token_counts( + self, tp_size, num_cuda_graphs, cuda_graph_max_tokens, expected + ): + """Test _calculate_cuda_graph_token_counts for various TP sizes.""" + actual = CUDAGraphBatchDimensionBuilder._calculate_cuda_graph_token_counts( + tp_size=tp_size, + num_cuda_graphs=num_cuda_graphs, + cuda_graph_max_tokens=cuda_graph_max_tokens, + ) + assert actual == expected, ( + f"tp_size={tp_size}, num_cuda_graphs={num_cuda_graphs}, " + f"cuda_graph_max_tokens={cuda_graph_max_tokens}: " + f"expected {expected}, got {actual}" + ) + @pytest.mark.internal @pytest.mark.skipif( not is_fa_min_version("2.7.3"), reason="need latest flash attn for dynamic batching" From 96f527833565ce08ab7c191ac6b97d23ba0945fb Mon Sep 17 00:00:00 2001 From: Helen Ngo Date: Mon, 23 Feb 2026 12:24:08 -0800 Subject: [PATCH 05/15] keshav comments --- megatron/core/inference/batch_dimensions_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/megatron/core/inference/batch_dimensions_utils.py b/megatron/core/inference/batch_dimensions_utils.py index fd00338225d..1bc1475e6bf 100644 --- a/megatron/core/inference/batch_dimensions_utils.py +++ b/megatron/core/inference/batch_dimensions_utils.py @@ -273,9 +273,13 @@ def _calculate_cuda_graph_token_counts( cuda_graph_token_counts.append(tp_size) # Trim from the middle if we exceed num_cuda_graphs requested by the user + # Since num_cuda_graphs >= 1, this only runs when we have at least 2 elements. 
while len(cuda_graph_token_counts) > num_cuda_graphs: cuda_graph_token_counts.pop(-2) + assert len(cuda_graph_token_counts) == num_cuda_graphs + assert cuda_graph_max_tokens in cuda_graph_token_counts + return cuda_graph_token_counts @staticmethod From ad6753e544288ed002538b73be736f0240bf3085 Mon Sep 17 00:00:00 2001 From: Helen Ngo Date: Tue, 10 Mar 2026 08:14:10 -0700 Subject: [PATCH 06/15] address comments --- megatron/core/inference/batch_dimensions_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/core/inference/batch_dimensions_utils.py b/megatron/core/inference/batch_dimensions_utils.py index 1bc1475e6bf..020c7bbf62e 100644 --- a/megatron/core/inference/batch_dimensions_utils.py +++ b/megatron/core/inference/batch_dimensions_utils.py @@ -261,7 +261,7 @@ def _calculate_cuda_graph_token_counts( if rounded not in cuda_graph_token_counts: cuda_graph_token_counts.append(rounded) val //= 2 - if val < rounder: + if val < 1: break # Ensure cuda_graph_max_tokens is always included @@ -277,7 +277,7 @@ def _calculate_cuda_graph_token_counts( while len(cuda_graph_token_counts) > num_cuda_graphs: cuda_graph_token_counts.pop(-2) - assert len(cuda_graph_token_counts) == num_cuda_graphs + assert len(cuda_graph_token_counts) <= num_cuda_graphs assert cuda_graph_max_tokens in cuda_graph_token_counts return cuda_graph_token_counts From 6ea8abc34125f7bd47d744a4288afc29ae0203e4 Mon Sep 17 00:00:00 2001 From: Helen Ngo Date: Mon, 18 May 2026 08:04:56 -0700 Subject: [PATCH 07/15] update print in example script to differentiate reserved / allocated --- examples/inference/gpt/gpt_dynamic_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inference/gpt/gpt_dynamic_inference.py b/examples/inference/gpt/gpt_dynamic_inference.py index 02a257c1b46..9172d137eab 100644 --- a/examples/inference/gpt/gpt_dynamic_inference.py +++ b/examples/inference/gpt/gpt_dynamic_inference.py @@ -508,7 +508,7 @@ def escape_str(s): 
print( f"{setup_prefix} … " f"throughput: {throughput:.3f} tok/s … ", f"total time: {total_time:.3f}s … " - f"mem {peak_alloc_gb:.1f}/{peak_resvd_gb:.1f} GB … " + f"mem {peak_alloc_gb:.1f} allocated/{peak_resvd_gb:.1f} reserved GB … " f"steps: {engine.context.step_count:d} … " f"capture {capture_str}", ) From c06c3274c0c2d0e6e04c733652cbde0b2aad4f98 Mon Sep 17 00:00:00 2001 From: Helen Ngo Date: Mon, 18 May 2026 08:11:28 -0700 Subject: [PATCH 08/15] exponential decay of graph size --- .../core/inference/batch_dimensions_utils.py | 44 +++++++++++-------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/megatron/core/inference/batch_dimensions_utils.py b/megatron/core/inference/batch_dimensions_utils.py index 347995044c9..4d5e09b41bf 100644 --- a/megatron/core/inference/batch_dimensions_utils.py +++ b/megatron/core/inference/batch_dimensions_utils.py @@ -240,31 +240,22 @@ def _calculate_cuda_graph_token_counts( [128, 64, 32, 16, 8, 4, 2, 1] """ if num_cuda_graphs == -1: - # automatically determine the number of CUDA graphs to - # capture based on the `max_requests` value - cuda_graph_token_counts = ( - [1, 2, 4] + list(range(8, 256, 8)) + list(range(256, cuda_graph_max_tokens + 1, 16)) + # Each step in the exponential-decay loop below halves the cudagraph size, so we need + # ~log2(max_tokens) steps with an extra +2 to leave headroom for dedup/trim. 
+ auto_n = max(4, int(math.log2(max(2, cuda_graph_max_tokens))) + 2) + return CUDAGraphBatchDimensionBuilder._calculate_cuda_graph_token_counts( + tp_size=tp_size, + num_cuda_graphs=auto_n, + cuda_graph_max_tokens=cuda_graph_max_tokens, ) - # Align each entry to TP size - cuda_graph_token_counts = list( - dict.fromkeys( - round_up_to_nearest_multiple(s, tp_size) for s in cuda_graph_token_counts - ) - ) - # Clamp to max tokens - cuda_graph_token_counts = [ - s for s in cuda_graph_token_counts if s <= cuda_graph_max_tokens - ] - if not cuda_graph_token_counts or cuda_graph_token_counts[-1] != cuda_graph_max_tokens: - cuda_graph_token_counts.append(cuda_graph_max_tokens) - cuda_graph_token_counts.reverse() - return cuda_graph_token_counts assert num_cuda_graphs >= 1, f"num_cuda_graphs must be >= 1, got {num_cuda_graphs}" assert ( cuda_graph_max_tokens > 0 ), f"cuda_graph_max_tokens must be > 0, got {cuda_graph_max_tokens}" + rounder = CUDAGraphBatchDimensionBuilder.CUDA_GRAPH_ROUNDER + # Cuda graph step size. cuda_graph_step_size = cuda_graph_max_tokens / num_cuda_graphs cuda_graph_step_size = CUDAGraphBatchDimensionBuilder.CUDA_GRAPH_ROUNDER * int( @@ -423,6 +414,23 @@ def add_if_valid(token_count: int, prefill_req_count: int, decode_req_count: int ) ) + # Include the smallest decode-only graphs when auto-sizing (num_cuda_graphs == -1). + # Without this, TP alignment (size 1 -> tp_size) and the num_speculative_tokens floor + # division may drop the size 1 and size 2 graph sizes. 
+ if num_cuda_graphs == -1: + spec_unit = num_speculative_tokens + 1 + min_decode_tokens = math.lcm(spec_unit, tp_size) + for req_count_multiple in (1, 2): + floor_tokens = min_decode_tokens * req_count_multiple + if ( + floor_tokens <= cuda_graph_max_tokens_decode + and floor_tokens not in cuda_graph_decode_token_counts + ): + cuda_graph_decode_token_counts.append(floor_tokens) + cuda_graph_decode_token_counts = sorted( + set(cuda_graph_decode_token_counts), reverse=True + ) + cuda_graph_batch_dimensions_list = [] if num_cuda_graphs is None: cuda_graph_batch_dimensions_list = [] From b508d08a90e2d96a2678c323b5eedfcf3dc31278 Mon Sep 17 00:00:00 2001 From: Helen Ngo Date: Mon, 18 May 2026 10:10:30 -0700 Subject: [PATCH 09/15] better logging for checking pool reuse etc --- .../core/inference/batch_dimensions_utils.py | 57 +++++++++------ .../core/inference/engines/dynamic_engine.py | 73 +++++++++++++++++-- .../inference/engines/test_dynamic_engine.py | 59 ++++----------- 3 files changed, 116 insertions(+), 73 deletions(-) diff --git a/megatron/core/inference/batch_dimensions_utils.py b/megatron/core/inference/batch_dimensions_utils.py index 4d5e09b41bf..1ecdfa87d07 100644 --- a/megatron/core/inference/batch_dimensions_utils.py +++ b/megatron/core/inference/batch_dimensions_utils.py @@ -244,9 +244,7 @@ def _calculate_cuda_graph_token_counts( # ~log2(max_tokens) steps with an extra +2 to leave headroom for dedup/trim. 
auto_n = max(4, int(math.log2(max(2, cuda_graph_max_tokens))) + 2) return CUDAGraphBatchDimensionBuilder._calculate_cuda_graph_token_counts( - tp_size=tp_size, - num_cuda_graphs=auto_n, - cuda_graph_max_tokens=cuda_graph_max_tokens, + tp_size=tp_size, num_cuda_graphs=auto_n, cuda_graph_max_tokens=cuda_graph_max_tokens ) assert num_cuda_graphs >= 1, f"num_cuda_graphs must be >= 1, got {num_cuda_graphs}" @@ -274,7 +272,6 @@ def _calculate_cuda_graph_token_counts( # Exponentially decreasing, stops after num_cuda_graphs entries # or when below the minimum size. - # TODO(helenn/lmcafee): Extend upper range of distribution to be linearly-spaced. cuda_graph_token_counts = [] val = cuda_graph_max_tokens for _ in range(num_cuda_graphs): @@ -448,29 +445,45 @@ def add_if_valid(token_count: int, prefill_req_count: int, decode_req_count: int token_count=token_count, prefill_req_count=0, decode_req_count=decode_req_count ) else: - # Mixed prefill and decode mode + # Mixed prefill and decode mode. + # + # Generate mixed CGs across a geometric P-grid rather than a single fixed P + # value. A captured graph at P=k bakes in a token layout for k prefill slots; + # a real batch with P != k pays for unused slot padding, which can overshoot + # the captured token_count and make the graph unusable. Geometric spacing + # bounds the relative overhead per real batch (worst case ~2x P slack) while + # keeping the total CG count log-bounded. + # + # `cuda_graph_mixed_prefill_request_count` is now used only as an on/off + # toggle for mixed CGs (>0 enables, <=0 routes to decode-only above). The + # P value it used to specify is superseded by the grid. 
+ p_values = [] + p = 1 + while p < max_requests: + p_values.append(p) + p *= 2 + if not p_values or p_values[-1] != max_requests: + p_values.append(max_requests) + # Create prefill and mixed dimensions with full token counts for size in cuda_graph_prefill_token_counts: assert size % tp_size == 0 - prefill_req_count = min(cuda_graph_mixed_prefill_request_count, max_requests) - decode_req_count = max( - 0, - min( - (size - prefill_req_count) // (num_speculative_tokens + 1), - max_requests - prefill_req_count, - ), - ) - add_if_valid( - token_count=size, - prefill_req_count=prefill_req_count, - decode_req_count=decode_req_count, - ) + for prefill_req_count in p_values: + decode_req_count = max( + 0, + min( + (size - prefill_req_count) // (num_speculative_tokens + 1), + max_requests - prefill_req_count, + ), + ) + add_if_valid( + token_count=size, + prefill_req_count=prefill_req_count, + decode_req_count=decode_req_count, + ) # We need to ensure the prefill requests are shorter than the max sequence length, # considering the one decode token is used for prefill request construction - prefill_only_minimal_num = max( - cuda_graph_mixed_prefill_request_count, - math.ceil(size / max(1, max_sequence_length - 1)), - ) + prefill_only_minimal_num = max(1, math.ceil(size / max(1, max_sequence_length - 1))) if prefill_only_minimal_num < max_requests: add_if_valid( token_count=size, diff --git a/megatron/core/inference/engines/dynamic_engine.py b/megatron/core/inference/engines/dynamic_engine.py index 39aa21d02e7..992207fca2d 100644 --- a/megatron/core/inference/engines/dynamic_engine.py +++ b/megatron/core/inference/engines/dynamic_engine.py @@ -44,7 +44,7 @@ ) from megatron.core.inference.utils import Counter, InferenceMode, await_process_call from megatron.core.process_groups_config import ProcessGroupCollection -from megatron.core.transformer.cuda_graphs import delete_cuda_graphs +from megatron.core.transformer.cuda_graphs import CudaGraphManager, delete_cuda_graphs from 
megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.moe.router_replay import RouterReplay, RouterReplayAction from megatron.core.utils import ( @@ -133,6 +133,8 @@ class EngineSuspendedError(Exception): def format_mem_bytes(mem_bytes): """Convert a byte count to a human-readable string in tb, gb, mb, kb, or bytes.""" + if mem_bytes < 0: + return "-" + format_mem_bytes(-mem_bytes) for power, suffix in [(4, "tb"), (3, "gb"), (2, "mb"), (1, "kb"), (0, "bytes")]: suffix_bytes = 1024**power if mem_bytes >= suffix_bytes: @@ -140,6 +142,33 @@ def format_mem_bytes(mem_bytes): return "%d bytes" % mem_bytes +def _cuda_graph_mempool_bytes(): + """Return (reserved, allocated) bytes belonging to the global CUDA graph mempool. + + PyTorch's `torch.cuda.memory_stats()` reports process-wide totals that mix in + every other allocation (KV cache, NCCL workspaces, layer scratch). To isolate + growth caused by graph capture, we walk `torch.cuda.memory_snapshot()` and + filter segments by their `segment_pool_id` against the graph pool handle. + Returns (0, 0) if the pool hasn't been created yet. + """ + pool_id = CudaGraphManager.global_mempool + if pool_id is None: + return 0, 0 + reserved = 0 + allocated = 0 + for seg in torch.cuda.memory_snapshot(): + seg_pool_id = ( + seg.get("segment_pool_id") + or seg.get("private_pool_id") + or seg.get("pool_id") + or seg.get("pool") + ) + if seg_pool_id == pool_id: + reserved += seg.get("total_size", 0) + allocated += seg.get("allocated_size", 0) + return reserved, allocated + + @dataclass(kw_only=True) class RequestEntry: """Entry in the engine's `self.requests` dict.""" @@ -350,7 +379,15 @@ def create_cuda_graphs(self, reset_context: bool = True): controller = self.controller time_start = time.time() + torch.cuda.reset_peak_memory_stats() mem_stats_start = torch.cuda.memory_stats() + # Snapshot of process-wide stats for the "total memory used by capture" + # summary printed at the end of the loop. 
+ start_proc_reserved = mem_stats_start["reserved_bytes.all.current"] + start_proc_alloc = mem_stats_start["allocated_bytes.all.current"] + # Pool-scoped baselines for the per-iteration deltas. These isolate + # actual CUDA-graph-mempool growth from unrelated scratch churn. + prev_pool_reserved, prev_pool_alloc = _cuda_graph_mempool_bytes() logging.info("> dynamic_engine.py: building cuda graphs for ") for graph in context.cuda_graph_batch_dimensions_list: @@ -432,27 +469,49 @@ def create_cuda_graphs(self, reset_context: bool = True): context.reset() + # Per-iteration memory accounting, scoped to the CUDA-graph mempool. + # This isolates pool growth from process-wide scratch churn (KV cache, + # NCCL workspaces, etc.) that pollutes `torch.cuda.memory_stats()`. + pool_reserved, pool_alloc = _cuda_graph_mempool_bytes() + logging.info( + " [graph %d/%d] %s | pool reserved=%s (Δiter=%s) " + "pool allocated=%s (Δiter=%s)", + tbar_idx + 1, + len(context.cuda_graph_batch_dimensions_list), + cuda_graph_batch_dimension, + format_mem_bytes(pool_reserved), + format_mem_bytes(pool_reserved - prev_pool_reserved), + format_mem_bytes(pool_alloc), + format_mem_bytes(pool_alloc - prev_pool_alloc), + ) + prev_pool_reserved, prev_pool_alloc = pool_reserved, pool_alloc + torch.cuda.reset_peak_memory_stats() + if mtp_warmup_enabled and mtp_seen_batch_sizes: logging.info("> MTP CUDA graph warmup: %d batch size(s)", len(mtp_seen_batch_sizes)) # Memory usage. 
time_end = time.time() mem_stats_end = torch.cuda.memory_stats() + final_pool_reserved, final_pool_alloc = _cuda_graph_mempool_bytes() capture_stats = { "time": time_end - time_start, "allocated_bytes": ( - mem_stats_end["allocated_bytes.all.current"] - - mem_stats_start["allocated_bytes.all.current"] + mem_stats_end["allocated_bytes.all.current"] - start_proc_alloc ), "reserved_bytes": ( - mem_stats_end["reserved_bytes.all.current"] - - mem_stats_start["reserved_bytes.all.current"] + mem_stats_end["reserved_bytes.all.current"] - start_proc_reserved ), + "pool_reserved_bytes": final_pool_reserved, + "pool_allocated_bytes": final_pool_alloc, } logging.info( - "> built cuda graph(s) in %.2f sec, with total memory usage: " - "allocated %s, reserved %s.", + "> built cuda graph(s) in %.2f sec. " + "Mempool: reserved %s, allocated %s. " + "Process-wide delta: allocated %s, reserved %s.", capture_stats["time"], + format_mem_bytes(capture_stats["pool_reserved_bytes"]), + format_mem_bytes(capture_stats["pool_allocated_bytes"]), format_mem_bytes(capture_stats["allocated_bytes"]), format_mem_bytes(capture_stats["reserved_bytes"]), ) diff --git a/tests/unit_tests/inference/engines/test_dynamic_engine.py b/tests/unit_tests/inference/engines/test_dynamic_engine.py index e549b72e78f..161c0eed509 100644 --- a/tests/unit_tests/inference/engines/test_dynamic_engine.py +++ b/tests/unit_tests/inference/engines/test_dynamic_engine.py @@ -17,7 +17,6 @@ from transformer_engine.pytorch.fp8 import check_fp8_support from megatron.core import parallel_state -from megatron.core.inference.batch_dimensions_utils import CUDAGraphBatchDimensionBuilder from megatron.core.inference.config import ( InferenceConfig, KVCacheManagementMode, @@ -794,23 +793,28 @@ def test_fixed_output_lengths(self, model_provider: str) -> None: def test_cuda_graph_token_counts(self, use_non_decode: bool) -> None: """Test initialization of `cuda_graph_token_counts` in dynamic context.""" + # Exponential-decay graph 
distribution (halve from max down to tp_size). + # decode-only path: cuda_graph_max_tokens = max_requests * (spec+1) = 80. + # non-decode path: cuda_graph_max_tokens = self.max_tokens (DEFAULT 16384); + # most large prefill sizes are filtered by is_valid because + # token_count > prefill_req_count * (max_sequence_length - 1). decode_only_cases = [ (0, [80]), (1, [80]), - (2, [80, 40]), - (4, [80, 72, 48, 24]), - (8, [80, 64, 48, 32, 16]), - (16, [80, 72, 64, 56, 48, 40, 32, 24, 16, 8]), - (32, [80, 72, 64, 56, 48, 40, 32, 24, 16, 8]), + (2, [80, 1]), + (4, [80, 40, 20, 1]), + (8, [80, 40, 20, 10, 4, 2, 1]), + (16, [80, 40, 20, 10, 4, 2, 1]), + (32, [80, 40, 20, 10, 4, 2, 1]), ] non_decode_cases = [ (0, [80]), (1, [80]), - (2, [80, 40]), - (4, [80, 72, 48, 24]), - (8, [80, 64, 48, 32, 16]), - (16, [1024, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8]), - (32, [1024, 512, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8]), + (2, [80, 1]), + (4, [80, 40, 20, 1]), + (8, [1024, 512, 256, 80, 40, 20, 10, 4, 2, 1]), + (16, [1024, 512, 256, 128, 80, 64, 40, 32, 20, 16, 10, 8, 4, 2, 1]), + (32, [1024, 512, 256, 128, 80, 64, 40, 32, 20, 16, 10, 8, 4, 2, 1]), ] cases = non_decode_cases if use_non_decode else decode_only_cases @@ -837,39 +841,6 @@ def test_cuda_graph_token_counts(self, use_non_decode: bool) -> None: ) ) - @pytest.mark.internal - @pytest.mark.parametrize( - "tp_size, num_cuda_graphs, cuda_graph_max_tokens, expected", - [ - # TP=1 - (1, 1, 80, [80]), - (1, 2, 80, [80, 1]), - (1, 4, 80, [80, 40, 20, 1]), - (1, 8, 80, [80, 40, 20, 10, 4, 2, 1]), - (1, 16, 80, [80, 40, 20, 10, 4, 2, 1]), - # TP=2 - (2, 1, 80, [80]), - (2, 2, 80, [80, 2]), - (2, 4, 80, [80, 40, 20, 2]), - (2, 8, 80, [80, 40, 20, 10, 4, 2]), - (2, 16, 80, [80, 40, 20, 10, 4, 2]), - ], - ) - def test_calculate_cuda_graph_token_counts( - self, tp_size, num_cuda_graphs, cuda_graph_max_tokens, expected - ): - """Test _calculate_cuda_graph_token_counts for various TP sizes.""" - actual = 
CUDAGraphBatchDimensionBuilder._calculate_cuda_graph_token_counts( - tp_size=tp_size, - num_cuda_graphs=num_cuda_graphs, - cuda_graph_max_tokens=cuda_graph_max_tokens, - ) - assert actual == expected, ( - f"tp_size={tp_size}, num_cuda_graphs={num_cuda_graphs}, " - f"cuda_graph_max_tokens={cuda_graph_max_tokens}: " - f"expected {expected}, got {actual}" - ) - @pytest.mark.internal @pytest.mark.skipif( not is_fa_min_version("2.7.3"), reason="need latest flash attn for dynamic batching" From 48cda16c17adcb5cce5f8d29e985de0905a5348e Mon Sep 17 00:00:00 2001 From: Helen Ngo Date: Mon, 18 May 2026 10:34:31 -0700 Subject: [PATCH 10/15] fix import --- megatron/core/inference/engines/dynamic_engine.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/megatron/core/inference/engines/dynamic_engine.py b/megatron/core/inference/engines/dynamic_engine.py index 36064523c07..f520cc660d3 100644 --- a/megatron/core/inference/engines/dynamic_engine.py +++ b/megatron/core/inference/engines/dynamic_engine.py @@ -44,7 +44,7 @@ ) from megatron.core.inference.utils import Counter, InferenceMode, await_process_call from megatron.core.process_groups_config import ProcessGroupCollection -from megatron.core.transformer.cuda_graphs import delete_cuda_graphs +from megatron.core.transformer.cuda_graphs import CudaGraphManager, delete_cuda_graphs from megatron.core.transformer.enums import InferenceCudaGraphScope from megatron.core.transformer.moe.router_replay import RouterReplay, RouterReplayAction from megatron.core.utils import ( @@ -469,8 +469,7 @@ def create_cuda_graphs(self, reset_context: bool = True): # NCCL workspaces, etc.) that pollutes `torch.cuda.memory_stats()`. 
pool_reserved, pool_alloc = _cuda_graph_mempool_bytes() logging.info( - " [graph %d/%d] %s | pool reserved=%s (Δiter=%s) " - "pool allocated=%s (Δiter=%s)", + " [graph %d/%d] %s | pool reserved=%s (Δiter=%s) " "pool allocated=%s (Δiter=%s)", tbar_idx + 1, len(context.cuda_graph_batch_dimensions_list), cuda_graph_batch_dimension, @@ -491,12 +490,8 @@ def create_cuda_graphs(self, reset_context: bool = True): final_pool_reserved, final_pool_alloc = _cuda_graph_mempool_bytes() capture_stats = { "time": time_end - time_start, - "allocated_bytes": ( - mem_stats_end["allocated_bytes.all.current"] - start_proc_alloc - ), - "reserved_bytes": ( - mem_stats_end["reserved_bytes.all.current"] - start_proc_reserved - ), + "allocated_bytes": (mem_stats_end["allocated_bytes.all.current"] - start_proc_alloc), + "reserved_bytes": (mem_stats_end["reserved_bytes.all.current"] - start_proc_reserved), "pool_reserved_bytes": final_pool_reserved, "pool_allocated_bytes": final_pool_alloc, } From 9488712074c86b9f4ba394f5155626b6a19cc698 Mon Sep 17 00:00:00 2001 From: Helen Ngo Date: Mon, 18 May 2026 10:54:24 -0700 Subject: [PATCH 11/15] minor cleanup --- megatron/core/inference/engines/dynamic_engine.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/megatron/core/inference/engines/dynamic_engine.py b/megatron/core/inference/engines/dynamic_engine.py index f520cc660d3..8d955c0b001 100644 --- a/megatron/core/inference/engines/dynamic_engine.py +++ b/megatron/core/inference/engines/dynamic_engine.py @@ -376,12 +376,12 @@ def create_cuda_graphs(self, reset_context: bool = True): time_start = time.time() torch.cuda.reset_peak_memory_stats() mem_stats_start = torch.cuda.memory_stats() - # Snapshot of process-wide stats for the "total memory used by capture" - # summary printed at the end of the loop. + + # Snapshot of process-wide stats for the "total memory used by capture" summary. 
start_proc_reserved = mem_stats_start["reserved_bytes.all.current"] start_proc_alloc = mem_stats_start["allocated_bytes.all.current"] - # Pool-scoped baselines for the per-iteration deltas. These isolate - # actual CUDA-graph-mempool growth from unrelated scratch churn. + + # Pool-scoped baselines for the per-iteration deltas. prev_pool_reserved, prev_pool_alloc = _cuda_graph_mempool_bytes() logging.info("> dynamic_engine.py: building cuda graphs for ") From 661179aa518b72f688100a082aac0a8a7013f274 Mon Sep 17 00:00:00 2001 From: Helen Ngo Date: Mon, 18 May 2026 12:35:34 -0700 Subject: [PATCH 12/15] fix failing test --- .../inference/contexts/test_dynamic_context.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/unit_tests/inference/contexts/test_dynamic_context.py b/tests/unit_tests/inference/contexts/test_dynamic_context.py index 6a4838f2c46..978460f2ca9 100644 --- a/tests/unit_tests/inference/contexts/test_dynamic_context.py +++ b/tests/unit_tests/inference/contexts/test_dynamic_context.py @@ -1602,10 +1602,16 @@ def test_add_dummy_requests_for_expert_parallel_step_matches_slow_path( num_speculative_tokens=num_speculative_tokens, ) - smallest = min(ctx.cuda_graph_batch_dimensions_list) + # The fast path is decode-only by construction, so pick the smallest decode-only batch_dim. 
+ # With the geometric grid for mixed cudagraphs, the global min may now be a P=1 mixed shape + # (when num_speculative_tokens > 0 makes decode-only token_count > 1). + smallest = min( + batchdim + for batchdim in ctx.cuda_graph_batch_dimensions_list + if batchdim.prefill_req_count == 0 + ) N = smallest.decode_req_count T = smallest.token_count # N * (num_speculative_tokens + 1) - assert smallest.prefill_req_count == 0, "smallest graph must be decode-only" # --- slow path (reference) --- ctx.add_dummy_requests_for_cudagraph_capture(smallest) From b3472940d7050f25b739630dea6ab5d6f53e5a2d Mon Sep 17 00:00:00 2001 From: Helen Ngo Date: Mon, 18 May 2026 13:02:56 -0700 Subject: [PATCH 13/15] fix test --- .../core/inference/batch_dimensions_utils.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/megatron/core/inference/batch_dimensions_utils.py b/megatron/core/inference/batch_dimensions_utils.py index 1ecdfa87d07..e3e8264b3b0 100644 --- a/megatron/core/inference/batch_dimensions_utils.py +++ b/megatron/core/inference/batch_dimensions_utils.py @@ -476,6 +476,11 @@ def add_if_valid(token_count: int, prefill_req_count: int, decode_req_count: int max_requests - prefill_req_count, ), ) + # Skip pure-prefill shapes where each prefill request has <= 1 token. The model + # has no prompt to attend over so the graph isn't useful and it triggers a + # vectorized_gather_kernel OOB at capture time when token_count == 1.
+ if decode_req_count == 0 and size <= prefill_req_count: + continue add_if_valid( token_count=size, prefill_req_count=prefill_req_count, @@ -485,11 +490,14 @@ def add_if_valid(token_count: int, prefill_req_count: int, decode_req_count: int # considering the one decode token is used for prefill request construction prefill_only_minimal_num = max(1, math.ceil(size / max(1, max_sequence_length - 1))) if prefill_only_minimal_num < max_requests: - add_if_valid( - token_count=size, - prefill_req_count=max(prefill_only_minimal_num, min(max_requests, size)), - decode_req_count=0, - ) + prefill_req_count = max(prefill_only_minimal_num, min(max_requests, size)) + # Do not add invalid cases (see above for note on prefill shapes). + if size > prefill_req_count: + add_if_valid( + token_count=size, + prefill_req_count=prefill_req_count, + decode_req_count=0, + ) # Create decode-only dimensions with optimized token counts for size in cuda_graph_decode_token_counts: From 33e65c404256b75252b2b26bc9be289ede649d5c Mon Sep 17 00:00:00 2001 From: Helen Ngo Date: Mon, 18 May 2026 13:27:31 -0700 Subject: [PATCH 14/15] fix tests and recover small graphs for perf --- .../core/inference/batch_dimensions_utils.py | 21 ++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/megatron/core/inference/batch_dimensions_utils.py b/megatron/core/inference/batch_dimensions_utils.py index e3e8264b3b0..e54b20e2418 100644 --- a/megatron/core/inference/batch_dimensions_utils.py +++ b/megatron/core/inference/batch_dimensions_utils.py @@ -476,10 +476,11 @@ def add_if_valid(token_count: int, prefill_req_count: int, decode_req_count: int max_requests - prefill_req_count, ), ) - # Skip pure-prefill shapes where each prefill request has <= 1 token. The model - # has no prompt to attend over so the graph isn't useful and it triggers a - # vectorized_gather_kernel OOB at capture time when token_count == 1. 
- if decode_req_count == 0 and size <= prefill_req_count: + # Skip token_count == 1 with prefill_req == 1: the gather kernel asserts + # on index >= 1 against a 1-element tensor at capture time. Larger + # `(size, size, 0)` shapes (each prefill = 1 token, total batch >= 2) are + # fine because the gather has multiple indices to read. + if size < 2: continue add_if_valid( token_count=size, @@ -489,15 +490,11 @@ def add_if_valid(token_count: int, prefill_req_count: int, decode_req_count: int # We need to ensure the prefill requests are shorter than the max sequence length, # considering the one decode token is used for prefill request construction prefill_only_minimal_num = max(1, math.ceil(size / max(1, max_sequence_length - 1))) - if prefill_only_minimal_num < max_requests: + if prefill_only_minimal_num < max_requests and size >= 2: prefill_req_count = max(prefill_only_minimal_num, min(max_requests, size)) - # Do not add invalid cases (see above for note on prefill shapes). - if size > prefill_req_count: - add_if_valid( - token_count=size, - prefill_req_count=prefill_req_count, - decode_req_count=0, - ) + add_if_valid( + token_count=size, prefill_req_count=prefill_req_count, decode_req_count=0 + ) # Create decode-only dimensions with optimized token counts for size in cuda_graph_decode_token_counts: From 8561960aedf3080031ca6b876d015c54ec1599d4 Mon Sep 17 00:00:00 2001 From: Helen Ngo Date: Tue, 19 May 2026 16:31:01 -0700 Subject: [PATCH 15/15] keshav fixes --- .../core/inference/batch_dimensions_utils.py | 178 +++++++++++++----- megatron/core/inference/config.py | 28 ++- .../inference/contexts/dynamic_context.py | 1 + .../core/inference/engines/dynamic_engine.py | 4 +- megatron/inference/utils.py | 4 + megatron/training/arguments.py | 8 + 6 files changed, 175 insertions(+), 48 deletions(-) diff --git a/megatron/core/inference/batch_dimensions_utils.py b/megatron/core/inference/batch_dimensions_utils.py index e54b20e2418..1229d333d0a 100644 --- 
a/megatron/core/inference/batch_dimensions_utils.py +++ b/megatron/core/inference/batch_dimensions_utils.py @@ -217,34 +217,59 @@ class CUDAGraphBatchDimensionBuilder: @staticmethod def _calculate_cuda_graph_token_counts( - tp_size: int, num_cuda_graphs: int, cuda_graph_max_tokens: int + tp_size: int, + num_cuda_graphs: int, + cuda_graph_max_tokens: int, + sizing_distribution: "CudaGraphSizingDistribution" = None, ) -> List[int]: """ Calculate CUDA graph token counts for a given configuration. - This method computes exponentially-decreasing token counts (powers of 2) - from cuda_graph_max_tokens down to CUDA_GRAPH_ROUNDER, ensuring proper - rounding and TP alignment. + Dispatches on `sizing_distribution`: + - EXPONENTIAL (default): halves from cuda_graph_max_tokens down to tp_size, log-spaced, + creates log2(max_tokens) graphs. + - LINEAR: small graphs [1, 2, 4] + range(8, 256, 8) + range(256, max+1, 16); + explicit-N path uses an even stride of ~max/N, rounder- and TP-aligned. Args: tp_size: Tensor parallel size (for alignment) - num_cuda_graphs: Number of CUDA graphs to generate (must be >= 1) + num_cuda_graphs: Number of CUDA graphs to generate (must be >= 1, or -1 to auto-size) cuda_graph_max_tokens: Maximum token count for CUDA graphs (must be > 0) + sizing_distribution: Distribution of cudagraph sizes. Defaults to EXPONENTIAL.
Returns: List of token counts in descending order - Example: - >>> _calculate_cuda_graph_token_counts - (tp_size=1, num_cuda_graphs=8, cuda_graph_max_tokens=128) + Example (EXPONENTIAL): + >>> _calculate_cuda_graph_token_counts(tp_size=1, num_cuda_graphs=8, + cuda_graph_max_tokens=128) [128, 64, 32, 16, 8, 4, 2, 1] """ + from megatron.core.inference.config import CudaGraphSizingDistribution + + if sizing_distribution is None: + sizing_distribution = CudaGraphSizingDistribution.EXPONENTIAL + + if sizing_distribution == CudaGraphSizingDistribution.LINEAR: + return CUDAGraphBatchDimensionBuilder._calculate_token_counts_linear( + tp_size, num_cuda_graphs, cuda_graph_max_tokens + ) + + # Default path: exponential decay. if num_cuda_graphs == -1: - # Each step in the exponential-decay loop below halves the cudagraph size, so we need - # ~log2(max_tokens) steps with an extra +2 to leave headroom for dedup/trim. - auto_n = max(4, int(math.log2(max(2, cuda_graph_max_tokens))) + 2) + # Pick a graph count: we halve from cuda_graph_max_tokens down to 1, so + # log2(max_tokens) halvings are needed. Add a small margin for the two forced endpoints + # (cuda_graph_max_tokens and tp_size) that are unioned into the set after the loop. + # Floor at MIN_GRAPHS so the trim logic always has at least 2 entries to work with. 
+ HEADROOM = 2 + MIN_GRAPHS = 4 + num_halvings = int(math.log2(max(2, cuda_graph_max_tokens))) + auto_n = max(MIN_GRAPHS, num_halvings + HEADROOM) return CUDAGraphBatchDimensionBuilder._calculate_cuda_graph_token_counts( - tp_size=tp_size, num_cuda_graphs=auto_n, cuda_graph_max_tokens=cuda_graph_max_tokens + tp_size=tp_size, + num_cuda_graphs=auto_n, + cuda_graph_max_tokens=cuda_graph_max_tokens, + sizing_distribution=sizing_distribution, ) assert num_cuda_graphs >= 1, f"num_cuda_graphs must be >= 1, got {num_cuda_graphs}" @@ -270,30 +295,28 @@ def _calculate_cuda_graph_token_counts( if num_cuda_graphs == 1: return [cuda_graph_max_tokens] - # Exponentially decreasing, stops after num_cuda_graphs entries - # or when below the minimum size. - cuda_graph_token_counts = [] + # Exponentially decreasing token counts: halve from max_tokens until below the rounder floor + # or num_cuda_graphs. Dedupe (the rounding/TP-alignment can collide for small values), + # then sort descending. + sizes = set() val = cuda_graph_max_tokens for _ in range(num_cuda_graphs): # Round down to multiple of rounder, then up to multiple of TP size rounded = max(rounder, (val // rounder) * rounder) rounded = math.ceil(rounded / tp_size) * tp_size - if rounded not in cuda_graph_token_counts: - cuda_graph_token_counts.append(rounded) + sizes.add(rounded) val //= 2 if val < 1: break - # Ensure cuda_graph_max_tokens is always included - if cuda_graph_token_counts[0] != cuda_graph_max_tokens: - cuda_graph_token_counts.insert(0, cuda_graph_max_tokens) + # Always include the endpoints: cuda_graph_max_tokens (largest) and tp_size (smallest). 
+ sizes.add(cuda_graph_max_tokens) + sizes.add(tp_size) - # Include a (possibly extra) size-1 graph - if cuda_graph_token_counts[-1] != tp_size: - cuda_graph_token_counts.append(tp_size) + cuda_graph_token_counts = sorted(sizes, reverse=True) - # Trim from the middle if we exceed num_cuda_graphs requested by the user - # Since num_cuda_graphs >= 1, this only runs when we have at least 2 elements. + # Trim from the middle if we exceed num_cuda_graphs requested by the user. + # Since num_cuda_graphs >= 1, this only runs when we have at least 2 elements. while len(cuda_graph_token_counts) > num_cuda_graphs: cuda_graph_token_counts.pop(-2) @@ -302,6 +325,51 @@ def _calculate_cuda_graph_token_counts( return cuda_graph_token_counts + @staticmethod + def _calculate_token_counts_linear( + tp_size: int, num_cuda_graphs: int, cuda_graph_max_tokens: int + ) -> List[int]: + """Linear-stride token count distribution. + + For num_cuda_graphs == -1, returns [1, 2, 4] + range(8, 256, 8) + range(256, max+1, 16) + TP-aligned and deduped. + For positive N, returns evenly-spaced sizes with step ~ max_tokens / N. + """ + rounder = CUDAGraphBatchDimensionBuilder.CUDA_GRAPH_ROUNDER + + if num_cuda_graphs == -1: + sizes = ( + [1, 2, 4] + list(range(8, 256, 8)) + list(range(256, cuda_graph_max_tokens + 1, 16)) + ) + # TP-align and dedupe in order; preserve original ordering for parity. + sizes = list(dict.fromkeys(round_up_to_nearest_multiple(s, tp_size) for s in sizes)) + sizes = [s for s in sizes if s <= cuda_graph_max_tokens] + if not sizes or sizes[-1] != cuda_graph_max_tokens: + sizes.append(cuda_graph_max_tokens) + sizes.reverse() + return sizes + + assert num_cuda_graphs >= 1, f"num_cuda_graphs must be >= 1, got {num_cuda_graphs}" + assert ( + cuda_graph_max_tokens > 0 + ), f"cuda_graph_max_tokens must be > 0, got {cuda_graph_max_tokens}" + + # Even stride: step = round_up_to(max / N, rounder), TP-aligned. 
+ step = cuda_graph_max_tokens / num_cuda_graphs + step = rounder * int(math.ceil(int(step) / rounder)) + step = round_up_to_nearest_multiple(step, tp_size) + step = max(step, tp_size) + cuda_graph_max_tokens = (cuda_graph_max_tokens // tp_size) * tp_size + + if num_cuda_graphs == 1: + return [cuda_graph_max_tokens] + + sizes = list(range(step, cuda_graph_max_tokens, step)) + if not sizes or sizes[-1] != cuda_graph_max_tokens: + sizes.append(cuda_graph_max_tokens) + sizes.reverse() + return sizes + @staticmethod def generate_cuda_graph_batch_dimensions_list( tp_size: int, @@ -313,6 +381,7 @@ def generate_cuda_graph_batch_dimensions_list( max_sequence_length: int, use_cuda_graphs_for_non_decode_steps: bool, num_speculative_tokens: int = 0, + sizing_distribution: "CudaGraphSizingDistribution" = None, ) -> Tuple[List[InferenceBatchDimensions], Optional[List[int]]]: """ Generate CUDA graph batch dimensions. @@ -370,6 +439,12 @@ def add_if_valid(token_count: int, prefill_req_count: int, decode_req_count: int cuda_graph_decode_token_counts = None if num_cuda_graphs is not None: + # Lazy import to avoid a circular dependency with config.py. + from megatron.core.inference.config import CudaGraphSizingDistribution + + if sizing_distribution is None: + sizing_distribution = CudaGraphSizingDistribution.EXPONENTIAL + # Ensure valid num_cuda_graphs. if ( cuda_graph_max_tokens is None @@ -396,6 +471,7 @@ def add_if_valid(token_count: int, prefill_req_count: int, decode_req_count: int tp_size=tp_size, num_cuda_graphs=num_cuda_graphs, cuda_graph_max_tokens=cuda_graph_max_tokens, + sizing_distribution=sizing_distribution, ) ) @@ -408,12 +484,19 @@ def add_if_valid(token_count: int, prefill_req_count: int, decode_req_count: int tp_size=tp_size, num_cuda_graphs=num_cuda_graphs, cuda_graph_max_tokens=cuda_graph_max_tokens_decode, + sizing_distribution=sizing_distribution, ) ) # Include the smallest decode-only graphs when auto-sizing (num_cuda_graphs == -1). 
- # Without this, TP alignment (size 1 -> tp_size) and the num_speculative_tokens floor - # division may drop the size 1 and size 2 graph sizes. + # Without this, TP alignment and the num_speculative_tokens floor division can drop + # the smallest 1- and 2-request shapes from the captured set. + # + # The minimum valid decode token_count is lcm(spec_unit, tp_size): + # - Ensure divisible by tp_size (required so TP / sequence-parallel never produces a + # single-token graph when tp_size > 1). + # - Ensure a multiple of (spec+1) so it accommodates an integer number of decode + # requests when speculative decoding is enabled. if num_cuda_graphs == -1: spec_unit = num_speculative_tokens + 1 min_decode_tokens = math.lcm(spec_unit, tp_size) @@ -447,23 +530,28 @@ def add_if_valid(token_count: int, prefill_req_count: int, decode_req_count: int else: # Mixed prefill and decode mode. # - # Generate mixed CGs across a geometric P-grid rather than a single fixed P - # value. A captured graph at P=k bakes in a token layout for k prefill slots; - # a real batch with P != k pays for unused slot padding, which can overshoot - # the captured token_count and make the graph unusable. Geometric spacing - # bounds the relative overhead per real batch (worst case ~2x P slack) while - # keeping the total CG count log-bounded. + # Under EXPONENTIAL distribution (default): generate mixed CGs across a + # geometric P-grid {1, 2, 4, ..., max_requests}. This bounds the relative + # overhead per real batch (~2x P slack worst case) and is the structural fix + # that makes mixed CGs usable for real batches with P != fixed_P. # - # `cuda_graph_mixed_prefill_request_count` is now used only as an on/off - # toggle for mixed CGs (>0 enables, <=0 routes to decode-only above). The - # P value it used to specify is superseded by the grid. 
- p_values = [] - p = 1 - while p < max_requests: - p_values.append(p) - p *= 2 - if not p_values or p_values[-1] != max_requests: - p_values.append(max_requests) + # Under LINEAR distribution: use the legacy fixed P value + # (cuda_graph_mixed_prefill_request_count) — same single-P behavior main has + # today, for apples-to-apples benchmarking against vLLM-style configurations. + if sizing_distribution == CudaGraphSizingDistribution.LINEAR: + p_values = [min(cuda_graph_mixed_prefill_request_count, max_requests)] + # In legacy mode, the prefill-only floor uses the fixed P value to match + # main's behavior exactly. + prefill_only_floor = cuda_graph_mixed_prefill_request_count + else: + p_values = [] + p = 1 + while p < max_requests: + p_values.append(p) + p *= 2 + if not p_values or p_values[-1] != max_requests: + p_values.append(max_requests) + prefill_only_floor = 1 # Create prefill and mixed dimensions with full token counts for size in cuda_graph_prefill_token_counts: @@ -489,7 +577,9 @@ def add_if_valid(token_count: int, prefill_req_count: int, decode_req_count: int ) # We need to ensure the prefill requests are shorter than the max sequence length, # considering the one decode token is used for prefill request construction - prefill_only_minimal_num = max(1, math.ceil(size / max(1, max_sequence_length - 1))) + prefill_only_minimal_num = max( + prefill_only_floor, math.ceil(size / max(1, max_sequence_length - 1)) + ) if prefill_only_minimal_num < max_requests and size >= 2: prefill_req_count = max(prefill_only_minimal_num, min(max_requests, size)) add_if_valid( diff --git a/megatron/core/inference/config.py b/megatron/core/inference/config.py index e8769f3d6e7..ea4b08e5183 100644 --- a/megatron/core/inference/config.py +++ b/megatron/core/inference/config.py @@ -117,6 +117,22 @@ class KVCacheManagementMode(str, Enum): """Deallocate large tensors and recompute them from scratch during allocation.""" +class CudaGraphSizingDistribution(str, Enum): + """How CUDA 
graph token-count sizes are spaced when generating the captured graphs. + + EXPONENTIAL (default) — token counts halve from `cuda_graph_max_tokens` down to `tp_size`, + giving a log-spaced distribution. Bounded relative padding (~2x worst case) at every scale and + `log2(max_tokens)` total graphs. + + LINEAR — Include size-1 and size-2 graphs where applicable, linear spacing up until 256, and + sparser linear spacing past 256. e.g. `[1, 2, 4] + range(8, 256, 8) + range(256, max+1, 16)`. + Higher graph density at the top end. + """ + + EXPONENTIAL = "exponential" + LINEAR = "linear" + + @dataclass class InferenceConfig: """ @@ -197,10 +213,20 @@ class InferenceConfig: """ cuda_graph_mixed_prefill_count: Optional[int] = 16 - """ + """ The number of mixed prefill graphs to capture if mixed prefill/decode graphs are enabled. """ + cuda_graph_sizing_distribution: CudaGraphSizingDistribution = ( + CudaGraphSizingDistribution.EXPONENTIAL + ) + """ + How CUDA graph token counts are spaced. EXPONENTIAL (default) halves from + `cuda_graph_max_tokens` down to `tp_size` (log-spaced, ~log2(max_tokens) graphs). + LINEAR uses a range of linear strides (includes small graphs + mid-range linearity + + a bigger step size at the top end). + """ + use_cuda_graphs_for_non_decode_steps: bool = True """ Whether to use CUDA graphs for non-decode steps. 
diff --git a/megatron/core/inference/contexts/dynamic_context.py b/megatron/core/inference/contexts/dynamic_context.py index 6c2dcb47340..8fe02158b1b 100644 --- a/megatron/core/inference/contexts/dynamic_context.py +++ b/megatron/core/inference/contexts/dynamic_context.py @@ -644,6 +644,7 @@ def __init__(self, model_config: TransformerConfig, inference_config: InferenceC max_sequence_length=self.max_sequence_length, use_cuda_graphs_for_non_decode_steps=self.use_cuda_graphs_for_non_decode_steps, num_speculative_tokens=self.num_speculative_tokens, + sizing_distribution=inference_config.cuda_graph_sizing_distribution, ) ) diff --git a/megatron/core/inference/engines/dynamic_engine.py b/megatron/core/inference/engines/dynamic_engine.py index 8d955c0b001..8a43eb0f7ae 100644 --- a/megatron/core/inference/engines/dynamic_engine.py +++ b/megatron/core/inference/engines/dynamic_engine.py @@ -142,7 +142,7 @@ def format_mem_bytes(mem_bytes): return "%d bytes" % mem_bytes -def _cuda_graph_mempool_bytes(): +def _cuda_graph_mempool_bytes() -> Tuple[int, int]: """Return (reserved, allocated) bytes belonging to the global CUDA graph mempool. PyTorch's `torch.cuda.memory_stats()` reports process-wide totals that mix in @@ -374,7 +374,6 @@ def create_cuda_graphs(self, reset_context: bool = True): controller = self.controller time_start = time.time() - torch.cuda.reset_peak_memory_stats() mem_stats_start = torch.cuda.memory_stats() # Snapshot of process-wide stats for the "total memory used by capture" summary. 
@@ -479,7 +478,6 @@ def create_cuda_graphs(self, reset_context: bool = True): format_mem_bytes(pool_alloc - prev_pool_alloc), ) prev_pool_reserved, prev_pool_alloc = pool_reserved, pool_alloc - torch.cuda.reset_peak_memory_stats() if mtp_warmup_enabled and mtp_seen_batch_sizes: logging.info("> MTP CUDA graph warmup: %d batch size(s)", len(mtp_seen_batch_sizes)) diff --git a/megatron/inference/utils.py b/megatron/inference/utils.py index 3f06eb8a301..60b6d9bb0c0 100644 --- a/megatron/inference/utils.py +++ b/megatron/inference/utils.py @@ -9,6 +9,7 @@ from gpt_builders import gpt_builder from hybrid_builders import hybrid_builder from megatron.core.inference.config import ( + CudaGraphSizingDistribution, InferenceConfig, KVCacheManagementMode, MambaInferenceStateConfig, @@ -356,6 +357,9 @@ def get_inference_config_from_model_and_args(model: MegatronModule, args): unified_memory_level=args.inference_dynamic_batching_unified_memory_level, kv_cache_management_mode=KVCacheManagementMode(args.rl_kv_cache_management_mode), cuda_graph_mixed_prefill_count=args.inference_dynamic_batching_cuda_graph_mixed_prefill_count, # pylint: disable=line-too-long + cuda_graph_sizing_distribution=CudaGraphSizingDistribution( + args.inference_dynamic_batching_cuda_graph_sizing_distribution + ), use_cuda_graphs_for_non_decode_steps=not args.decode_only_cuda_graphs, cuda_graph_all_prefills=args.inference_cuda_graph_all_prefills, static_kv_memory_pointers=args.rl_persist_cuda_graphs, diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index f3c2ded6907..948f8091814 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -1963,6 +1963,14 @@ def _add_inference_args(parser): group.add_argument('--inference-dynamic-batching-cuda-graph-mixed-prefill-count', type=int, default=16, help='Number of mixed prefill requests to capture in a cuda graph.') + group.add_argument('--inference-dynamic-batching-cuda-graph-sizing-distribution', + type=str, 
default='exponential', + choices=['exponential', 'linear'], + dest='inference_dynamic_batching_cuda_graph_sizing_distribution', + help='Spacing of CUDA graph token counts. "exponential" (default) ' + 'halves from cuda_graph_max_tokens down to tp_size, giving a ' + 'log-spaced distribution with bounded relative padding. ' + '"linear" uses varying linear strides across the range.') group.add_argument('--inference-dynamic-batching-sampling-backend', type=str, default='torch', choices=['torch', 'flashinfer'],