From a09988a45731099ffb5436d753c806529cfa3f74 Mon Sep 17 00:00:00 2001
From: Helen Ngo <helenn@nvidia.com>
Date: Thu, 19 Feb 2026 17:54:24 -0800
Subject: [PATCH 01/15] Change cudagraph sizes from linear spacing to
 exponential decay

---
 .../core/inference/batch_dimensions_utils.py  | 48 +++++++++----------
 1 file changed, 23 insertions(+), 25 deletions(-)

diff --git a/megatron/core/inference/batch_dimensions_utils.py b/megatron/core/inference/batch_dimensions_utils.py
index 77354d59320..240844a44d1 100644
--- a/megatron/core/inference/batch_dimensions_utils.py
+++ b/megatron/core/inference/batch_dimensions_utils.py
@@ -210,7 +210,7 @@ class CUDAGraphBatchDimensionBuilder:
     """
 
     # Constant for rounding token counts when generating CUDA graph batch dimensions
-    CUDA_GRAPH_ROUNDER = 8
+    CUDA_GRAPH_ROUNDER = 2
 
     @staticmethod
     def _calculate_cuda_graph_token_counts(
@@ -219,8 +219,9 @@ def _calculate_cuda_graph_token_counts(
         """
         Calculate CUDA graph token counts for a given configuration.
 
-        This method computes evenly-spaced token counts from step_size up to
-        cuda_graph_max_tokens, ensuring proper rounding and TP alignment.
+        This method computes exponentially-decreasing token counts (powers of 2)
+        from cuda_graph_max_tokens down to CUDA_GRAPH_ROUNDER, ensuring proper
+        rounding and TP alignment.
 
         Args:
             tp_size: Tensor parallel size (for alignment)
@@ -232,38 +233,35 @@ def _calculate_cuda_graph_token_counts(
 
         Example:
             >>> _calculate_cuda_graph_token_counts
-            (tp_size=2, num_cuda_graphs=4, cuda_graph_max_tokens=1000)
-            [1000, 752, 504, 256]
+            (tp_size=1, num_cuda_graphs=8, cuda_graph_max_tokens=128)
+            [128, 64, 32, 16, 8, 4, 2]
         """
         assert num_cuda_graphs >= 1, f"num_cuda_graphs must be >= 1, got {num_cuda_graphs}"
         assert (
             cuda_graph_max_tokens > 0
         ), f"cuda_graph_max_tokens must be > 0, got {cuda_graph_max_tokens}"
 
-        # Cuda graph step size.
-        cuda_graph_step_size = cuda_graph_max_tokens / num_cuda_graphs
-        cuda_graph_step_size = CUDAGraphBatchDimensionBuilder.CUDA_GRAPH_ROUNDER * int(
-            math.ceil(int(cuda_graph_step_size) / CUDAGraphBatchDimensionBuilder.CUDA_GRAPH_ROUNDER)
-        )
-        # Make sure divisible by TP size
-        cuda_graph_step_size = math.ceil(cuda_graph_step_size / tp_size) * tp_size
+        rounder = CUDAGraphBatchDimensionBuilder.CUDA_GRAPH_ROUNDER
 
-        # round down cuda graph max tokens to be multiple of TP size
+        # Round down cuda graph max tokens to be multiple of TP size
         cuda_graph_max_tokens = (cuda_graph_max_tokens // tp_size) * tp_size
 
-        # Cuda graph token counts.
         if num_cuda_graphs == 1:
-            cuda_graph_token_counts = [cuda_graph_max_tokens]
-        else:
-            cuda_graph_token_counts = list(
-                range(cuda_graph_step_size, cuda_graph_max_tokens, cuda_graph_step_size)
-            )
-            if (
-                len(cuda_graph_token_counts) == 0
-                or cuda_graph_token_counts[-1] != cuda_graph_max_tokens
-            ):
-                cuda_graph_token_counts.append(cuda_graph_max_tokens)
-            cuda_graph_token_counts.reverse()
+            return [cuda_graph_max_tokens]
+
+        # Exponentially decreasing, stops after num_cuda_graphs entries
+        # or when below the minimum size.
+        cuda_graph_token_counts = []
+        val = cuda_graph_max_tokens
+        for _ in range(num_cuda_graphs):
+            # Round down to multiple of rounder, then up to multiple of TP size
+            rounded = max(rounder, (val // rounder) * rounder)
+            rounded = math.ceil(rounded / tp_size) * tp_size
+            if rounded not in cuda_graph_token_counts:
+                cuda_graph_token_counts.append(rounded)
+            val //= 2
+            if val < rounder:
+                break
 
         return cuda_graph_token_counts
 

From 20798afd428dd39e5c9ec3f3e2e13aacee8c5211 Mon Sep 17 00:00:00 2001
From: Helen Ngo <helenn@nvidia.com>
Date: Thu, 19 Feb 2026 18:14:24 -0800
Subject: [PATCH 02/15] Maybe include a size-1 graph

---
 megatron/core/inference/batch_dimensions_utils.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/megatron/core/inference/batch_dimensions_utils.py b/megatron/core/inference/batch_dimensions_utils.py
index 240844a44d1..1c2a56c1692 100644
--- a/megatron/core/inference/batch_dimensions_utils.py
+++ b/megatron/core/inference/batch_dimensions_utils.py
@@ -263,6 +263,11 @@ def _calculate_cuda_graph_token_counts(
             if val < rounder:
                 break
 
+        # Include a (possibly extra) size-1 graph
+        min_token_count = math.ceil(1 / tp_size) * tp_size
+        if cuda_graph_token_counts[-1] != min_token_count:
+            cuda_graph_token_counts.append(min_token_count)
+
         return cuda_graph_token_counts
 
     @staticmethod

From 3c718e9ee5f9c2d5efc6604118b7107d8baafe96 Mon Sep 17 00:00:00 2001
From: Helen Ngo <helenn@nvidia.com>
Date: Fri, 20 Feb 2026 10:11:16 -0800
Subject: [PATCH 03/15] Update test_cuda_graph_token_counts

---
 .../inference/engines/test_dynamic_engine.py           | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/unit_tests/inference/engines/test_dynamic_engine.py b/tests/unit_tests/inference/engines/test_dynamic_engine.py
index d71ccccd49a..f84c74f5a31 100644
--- a/tests/unit_tests/inference/engines/test_dynamic_engine.py
+++ b/tests/unit_tests/inference/engines/test_dynamic_engine.py
@@ -694,11 +694,11 @@ def test_cuda_graph_token_counts(self) -> None:
         for num_cuda_graphs, expected_cuda_graph_token_counts in [
             (0, [80]),
             (1, [80]),
-            (2, [80, 40]),
-            (4, [80, 72, 48, 24]),
-            (8, [80, 64, 48, 32, 16]),
-            (16, [80, 72, 64, 56, 48, 40, 32, 24, 16, 8]),
-            (32, [80, 72, 64, 56, 48, 40, 32, 24, 16, 8]),
+            (2, [80, 40, 1]),
+            (4, [80, 40, 20, 10, 1]),
+            (8, [80, 40, 20, 10, 4, 2, 1]),
+            (16, [80, 40, 20, 10, 4, 2, 1]),
+            (32, [80, 40, 20, 10, 4, 2, 1]),
         ]:
 
             # Build cuda graphs (inside dynamic engine).

From cafe6af237b55c59ea245eb7f25604f4a1a01092 Mon Sep 17 00:00:00 2001
From: Helen Ngo <helenn@nvidia.com>
Date: Mon, 23 Feb 2026 08:13:31 -0800
Subject: [PATCH 04/15] address comments

---
 .../core/inference/batch_dimensions_utils.py  | 16 ++++++--
 .../inference/engines/test_dynamic_engine.py  | 38 ++++++++++++++++++-
 2 files changed, 48 insertions(+), 6 deletions(-)

diff --git a/megatron/core/inference/batch_dimensions_utils.py b/megatron/core/inference/batch_dimensions_utils.py
index 1c2a56c1692..fd00338225d 100644
--- a/megatron/core/inference/batch_dimensions_utils.py
+++ b/megatron/core/inference/batch_dimensions_utils.py
@@ -234,7 +234,7 @@ def _calculate_cuda_graph_token_counts(
         Example:
             >>> _calculate_cuda_graph_token_counts
             (tp_size=1, num_cuda_graphs=8, cuda_graph_max_tokens=128)
-            [128, 64, 32, 16, 8, 4, 2]
+            [128, 64, 32, 16, 8, 4, 2, 1]
         """
         assert num_cuda_graphs >= 1, f"num_cuda_graphs must be >= 1, got {num_cuda_graphs}"
         assert (
@@ -251,6 +251,7 @@ def _calculate_cuda_graph_token_counts(
 
         # Exponentially decreasing, stops after num_cuda_graphs entries
         # or when below the minimum size.
+        # TODO(helenn/lmcafee): Extend upper range of distribution to be linearly-spaced.
         cuda_graph_token_counts = []
         val = cuda_graph_max_tokens
         for _ in range(num_cuda_graphs):
@@ -263,10 +264,17 @@ def _calculate_cuda_graph_token_counts(
             if val < rounder:
                 break
 
+        # Ensure cuda_graph_max_tokens is always included
+        if cuda_graph_token_counts[0] != cuda_graph_max_tokens:
+            cuda_graph_token_counts.insert(0, cuda_graph_max_tokens)
+
         # Include a (possibly extra) size-1 graph
-        min_token_count = math.ceil(1 / tp_size) * tp_size
-        if cuda_graph_token_counts[-1] != min_token_count:
-            cuda_graph_token_counts.append(min_token_count)
+        if cuda_graph_token_counts[-1] != tp_size:
+            cuda_graph_token_counts.append(tp_size)
+
+        # Trim from the middle if we exceed num_cuda_graphs requested by the user
+        while len(cuda_graph_token_counts) > num_cuda_graphs:
+            cuda_graph_token_counts.pop(-2)
 
         return cuda_graph_token_counts
 
diff --git a/tests/unit_tests/inference/engines/test_dynamic_engine.py b/tests/unit_tests/inference/engines/test_dynamic_engine.py
index f84c74f5a31..524b7efa8df 100644
--- a/tests/unit_tests/inference/engines/test_dynamic_engine.py
+++ b/tests/unit_tests/inference/engines/test_dynamic_engine.py
@@ -15,6 +15,7 @@
 from transformer_engine.pytorch.fp8 import check_fp8_support
 
 from megatron.core import parallel_state
+from megatron.core.inference.batch_dimensions_utils import CUDAGraphBatchDimensionBuilder
 from megatron.core.inference.config import (
     InferenceConfig,
     KVCacheManagementMode,
@@ -694,8 +695,8 @@ def test_cuda_graph_token_counts(self) -> None:
         for num_cuda_graphs, expected_cuda_graph_token_counts in [
             (0, [80]),
             (1, [80]),
-            (2, [80, 40, 1]),
-            (4, [80, 40, 20, 10, 1]),
+            (2, [80, 1]),
+            (4, [80, 40, 20, 1]),
             (8, [80, 40, 20, 10, 4, 2, 1]),
             (16, [80, 40, 20, 10, 4, 2, 1]),
             (32, [80, 40, 20, 10, 4, 2, 1]),
@@ -716,6 +717,39 @@ def test_cuda_graph_token_counts(self) -> None:
                 actual_cuda_graph_token_counts,
             )
 
+    @pytest.mark.internal
+    @pytest.mark.parametrize(
+        "tp_size, num_cuda_graphs, cuda_graph_max_tokens, expected",
+        [
+            # TP=1
+            (1, 1, 80, [80]),
+            (1, 2, 80, [80, 1]),
+            (1, 4, 80, [80, 40, 20, 1]),
+            (1, 8, 80, [80, 40, 20, 10, 4, 2, 1]),
+            (1, 16, 80, [80, 40, 20, 10, 4, 2, 1]),
+            # TP=2
+            (2, 1, 80, [80]),
+            (2, 2, 80, [80, 2]),
+            (2, 4, 80, [80, 40, 20, 2]),
+            (2, 8, 80, [80, 40, 20, 10, 4, 2]),
+            (2, 16, 80, [80, 40, 20, 10, 4, 2]),
+        ],
+    )
+    def test_calculate_cuda_graph_token_counts(
+        self, tp_size, num_cuda_graphs, cuda_graph_max_tokens, expected
+    ):
+        """Test _calculate_cuda_graph_token_counts for various TP sizes."""
+        actual = CUDAGraphBatchDimensionBuilder._calculate_cuda_graph_token_counts(
+            tp_size=tp_size,
+            num_cuda_graphs=num_cuda_graphs,
+            cuda_graph_max_tokens=cuda_graph_max_tokens,
+        )
+        assert actual == expected, (
+            f"tp_size={tp_size}, num_cuda_graphs={num_cuda_graphs}, "
+            f"cuda_graph_max_tokens={cuda_graph_max_tokens}: "
+            f"expected {expected}, got {actual}"
+        )
+
     @pytest.mark.internal
     @pytest.mark.skipif(
         not is_fa_min_version("2.7.3"), reason="need latest flash attn for dynamic batching"

From 96f527833565ce08ab7c191ac6b97d23ba0945fb Mon Sep 17 00:00:00 2001
From: Helen Ngo <helenn@nvidia.com>
Date: Mon, 23 Feb 2026 12:24:08 -0800
Subject: [PATCH 05/15] Address Keshav's review comments

---
 megatron/core/inference/batch_dimensions_utils.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/megatron/core/inference/batch_dimensions_utils.py b/megatron/core/inference/batch_dimensions_utils.py
index fd00338225d..1bc1475e6bf 100644
--- a/megatron/core/inference/batch_dimensions_utils.py
+++ b/megatron/core/inference/batch_dimensions_utils.py
@@ -273,9 +273,13 @@ def _calculate_cuda_graph_token_counts(
             cuda_graph_token_counts.append(tp_size)
 
         # Trim from the middle if we exceed num_cuda_graphs requested by the user
+        #  Since num_cuda_graphs >= 1, this only runs when we have at least 2 elements.
         while len(cuda_graph_token_counts) > num_cuda_graphs:
             cuda_graph_token_counts.pop(-2)
 
+        assert len(cuda_graph_token_counts) == num_cuda_graphs
+        assert cuda_graph_max_tokens in cuda_graph_token_counts
+
         return cuda_graph_token_counts
 
     @staticmethod

From ad6753e544288ed002538b73be736f0240bf3085 Mon Sep 17 00:00:00 2001
From: Helen Ngo <helenn@nvidia.com>
Date: Tue, 10 Mar 2026 08:14:10 -0700
Subject: [PATCH 06/15] address comments

---
 megatron/core/inference/batch_dimensions_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/megatron/core/inference/batch_dimensions_utils.py b/megatron/core/inference/batch_dimensions_utils.py
index 1bc1475e6bf..020c7bbf62e 100644
--- a/megatron/core/inference/batch_dimensions_utils.py
+++ b/megatron/core/inference/batch_dimensions_utils.py
@@ -261,7 +261,7 @@ def _calculate_cuda_graph_token_counts(
             if rounded not in cuda_graph_token_counts:
                 cuda_graph_token_counts.append(rounded)
             val //= 2
-            if val < rounder:
+            if val < 1:
                 break
 
         # Ensure cuda_graph_max_tokens is always included
@@ -277,7 +277,7 @@ def _calculate_cuda_graph_token_counts(
         while len(cuda_graph_token_counts) > num_cuda_graphs:
             cuda_graph_token_counts.pop(-2)
 
-        assert len(cuda_graph_token_counts) == num_cuda_graphs
+        assert len(cuda_graph_token_counts) <= num_cuda_graphs
         assert cuda_graph_max_tokens in cuda_graph_token_counts
 
         return cuda_graph_token_counts

From 6ea8abc34125f7bd47d744a4288afc29ae0203e4 Mon Sep 17 00:00:00 2001
From: Helen Ngo <helenn@nvidia.com>
Date: Mon, 18 May 2026 08:04:56 -0700
Subject: [PATCH 07/15] update print in example script to differentiate
 reserved / allocated

---
 examples/inference/gpt/gpt_dynamic_inference.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/inference/gpt/gpt_dynamic_inference.py b/examples/inference/gpt/gpt_dynamic_inference.py
index 02a257c1b46..9172d137eab 100644
--- a/examples/inference/gpt/gpt_dynamic_inference.py
+++ b/examples/inference/gpt/gpt_dynamic_inference.py
@@ -508,7 +508,7 @@ def escape_str(s):
         print(
             f"{setup_prefix} … " f"throughput: {throughput:.3f} tok/s … ",
             f"total time: {total_time:.3f}s … "
-            f"mem {peak_alloc_gb:.1f}/{peak_resvd_gb:.1f} GB … "
+            f"mem {peak_alloc_gb:.1f} allocated/{peak_resvd_gb:.1f} reserved GB … "
             f"steps: {engine.context.step_count:d} … "
             f"capture {capture_str}",
         )

From c06c3274c0c2d0e6e04c733652cbde0b2aad4f98 Mon Sep 17 00:00:00 2001
From: Helen Ngo <helenn@nvidia.com>
Date: Mon, 18 May 2026 08:11:28 -0700
Subject: [PATCH 08/15] exponential decay of graph size

---
 .../core/inference/batch_dimensions_utils.py  | 44 +++++++++++--------
 1 file changed, 26 insertions(+), 18 deletions(-)

diff --git a/megatron/core/inference/batch_dimensions_utils.py b/megatron/core/inference/batch_dimensions_utils.py
index 347995044c9..4d5e09b41bf 100644
--- a/megatron/core/inference/batch_dimensions_utils.py
+++ b/megatron/core/inference/batch_dimensions_utils.py
@@ -240,31 +240,22 @@ def _calculate_cuda_graph_token_counts(
             [128, 64, 32, 16, 8, 4, 2, 1]
         """
         if num_cuda_graphs == -1:
-            # automatically determine the number of CUDA graphs to
-            # capture based on the `max_requests` value
-            cuda_graph_token_counts = (
-                [1, 2, 4] + list(range(8, 256, 8)) + list(range(256, cuda_graph_max_tokens + 1, 16))
+            # Each step in the exponential-decay loop below halves the cudagraph size, so we need
+            # ~log2(max_tokens) steps with an extra +2 to leave headroom for dedup/trim.
+            auto_n = max(4, int(math.log2(max(2, cuda_graph_max_tokens))) + 2)
+            return CUDAGraphBatchDimensionBuilder._calculate_cuda_graph_token_counts(
+                tp_size=tp_size,
+                num_cuda_graphs=auto_n,
+                cuda_graph_max_tokens=cuda_graph_max_tokens,
             )
-            # Align each entry to TP size
-            cuda_graph_token_counts = list(
-                dict.fromkeys(
-                    round_up_to_nearest_multiple(s, tp_size) for s in cuda_graph_token_counts
-                )
-            )
-            # Clamp to max tokens
-            cuda_graph_token_counts = [
-                s for s in cuda_graph_token_counts if s <= cuda_graph_max_tokens
-            ]
-            if not cuda_graph_token_counts or cuda_graph_token_counts[-1] != cuda_graph_max_tokens:
-                cuda_graph_token_counts.append(cuda_graph_max_tokens)
-            cuda_graph_token_counts.reverse()
-            return cuda_graph_token_counts
 
         assert num_cuda_graphs >= 1, f"num_cuda_graphs must be >= 1, got {num_cuda_graphs}"
         assert (
             cuda_graph_max_tokens > 0
         ), f"cuda_graph_max_tokens must be > 0, got {cuda_graph_max_tokens}"
 
+        rounder = CUDAGraphBatchDimensionBuilder.CUDA_GRAPH_ROUNDER
+
         # Cuda graph step size.
         cuda_graph_step_size = cuda_graph_max_tokens / num_cuda_graphs
         cuda_graph_step_size = CUDAGraphBatchDimensionBuilder.CUDA_GRAPH_ROUNDER * int(
@@ -423,6 +414,23 @@ def add_if_valid(token_count: int, prefill_req_count: int, decode_req_count: int
                 )
             )
 
+            # Include the smallest decode-only graphs when auto-sizing (num_cuda_graphs == -1).
+            # Without this, TP alignment (size 1 -> tp_size) and the num_speculative_tokens floor
+            # division may drop the size 1 and size 2 graph sizes.
+            if num_cuda_graphs == -1:
+                spec_unit = num_speculative_tokens + 1
+                min_decode_tokens = math.lcm(spec_unit, tp_size)
+                for req_count_multiple in (1, 2):
+                    floor_tokens = min_decode_tokens * req_count_multiple
+                    if (
+                        floor_tokens <= cuda_graph_max_tokens_decode
+                        and floor_tokens not in cuda_graph_decode_token_counts
+                    ):
+                        cuda_graph_decode_token_counts.append(floor_tokens)
+                cuda_graph_decode_token_counts = sorted(
+                    set(cuda_graph_decode_token_counts), reverse=True
+                )
+
         cuda_graph_batch_dimensions_list = []
         if num_cuda_graphs is None:
             cuda_graph_batch_dimensions_list = []

From b508d08a90e2d96a2678c323b5eedfcf3dc31278 Mon Sep 17 00:00:00 2001
From: Helen Ngo <helenn@nvidia.com>
Date: Mon, 18 May 2026 10:10:30 -0700
Subject: [PATCH 09/15] better logging for checking pool reuse etc

---
 .../core/inference/batch_dimensions_utils.py  | 57 +++++++++------
 .../core/inference/engines/dynamic_engine.py  | 73 +++++++++++++++++--
 .../inference/engines/test_dynamic_engine.py  | 59 ++++-----------
 3 files changed, 116 insertions(+), 73 deletions(-)

diff --git a/megatron/core/inference/batch_dimensions_utils.py b/megatron/core/inference/batch_dimensions_utils.py
index 4d5e09b41bf..1ecdfa87d07 100644
--- a/megatron/core/inference/batch_dimensions_utils.py
+++ b/megatron/core/inference/batch_dimensions_utils.py
@@ -244,9 +244,7 @@ def _calculate_cuda_graph_token_counts(
             # ~log2(max_tokens) steps with an extra +2 to leave headroom for dedup/trim.
             auto_n = max(4, int(math.log2(max(2, cuda_graph_max_tokens))) + 2)
             return CUDAGraphBatchDimensionBuilder._calculate_cuda_graph_token_counts(
-                tp_size=tp_size,
-                num_cuda_graphs=auto_n,
-                cuda_graph_max_tokens=cuda_graph_max_tokens,
+                tp_size=tp_size, num_cuda_graphs=auto_n, cuda_graph_max_tokens=cuda_graph_max_tokens
             )
 
         assert num_cuda_graphs >= 1, f"num_cuda_graphs must be >= 1, got {num_cuda_graphs}"
@@ -274,7 +272,6 @@ def _calculate_cuda_graph_token_counts(
 
         # Exponentially decreasing, stops after num_cuda_graphs entries
         # or when below the minimum size.
-        # TODO(helenn/lmcafee): Extend upper range of distribution to be linearly-spaced.
         cuda_graph_token_counts = []
         val = cuda_graph_max_tokens
         for _ in range(num_cuda_graphs):
@@ -448,29 +445,45 @@ def add_if_valid(token_count: int, prefill_req_count: int, decode_req_count: int
                     token_count=token_count, prefill_req_count=0, decode_req_count=decode_req_count
                 )
         else:
-            # Mixed prefill and decode mode
+            # Mixed prefill and decode mode.
+            #
+            # Generate mixed CGs across a geometric P-grid rather than a single fixed P
+            # value. A captured graph at P=k bakes in a token layout for k prefill slots;
+            # a real batch with P != k pays for unused slot padding, which can overshoot
+            # the captured token_count and make the graph unusable. Geometric spacing
+            # bounds the relative overhead per real batch (worst case ~2x P slack) while
+            # keeping the total CG count log-bounded.
+            #
+            # `cuda_graph_mixed_prefill_request_count` is now used only as an on/off
+            # toggle for mixed CGs (>0 enables, <=0 routes to decode-only above). The
+            # P value it used to specify is superseded by the grid.
+            p_values = []
+            p = 1
+            while p < max_requests:
+                p_values.append(p)
+                p *= 2
+            if not p_values or p_values[-1] != max_requests:
+                p_values.append(max_requests)
+
             # Create prefill and mixed dimensions with full token counts
             for size in cuda_graph_prefill_token_counts:
                 assert size % tp_size == 0
-                prefill_req_count = min(cuda_graph_mixed_prefill_request_count, max_requests)
-                decode_req_count = max(
-                    0,
-                    min(
-                        (size - prefill_req_count) // (num_speculative_tokens + 1),
-                        max_requests - prefill_req_count,
-                    ),
-                )
-                add_if_valid(
-                    token_count=size,
-                    prefill_req_count=prefill_req_count,
-                    decode_req_count=decode_req_count,
-                )
+                for prefill_req_count in p_values:
+                    decode_req_count = max(
+                        0,
+                        min(
+                            (size - prefill_req_count) // (num_speculative_tokens + 1),
+                            max_requests - prefill_req_count,
+                        ),
+                    )
+                    add_if_valid(
+                        token_count=size,
+                        prefill_req_count=prefill_req_count,
+                        decode_req_count=decode_req_count,
+                    )
                 # We need to ensure the prefill requests are shorter than the max sequence length,
                 # considering the one decode token is used for prefill request construction
-                prefill_only_minimal_num = max(
-                    cuda_graph_mixed_prefill_request_count,
-                    math.ceil(size / max(1, max_sequence_length - 1)),
-                )
+                prefill_only_minimal_num = max(1, math.ceil(size / max(1, max_sequence_length - 1)))
                 if prefill_only_minimal_num < max_requests:
                     add_if_valid(
                         token_count=size,
diff --git a/megatron/core/inference/engines/dynamic_engine.py b/megatron/core/inference/engines/dynamic_engine.py
index 39aa21d02e7..992207fca2d 100644
--- a/megatron/core/inference/engines/dynamic_engine.py
+++ b/megatron/core/inference/engines/dynamic_engine.py
@@ -44,7 +44,7 @@
 )
 from megatron.core.inference.utils import Counter, InferenceMode, await_process_call
 from megatron.core.process_groups_config import ProcessGroupCollection
-from megatron.core.transformer.cuda_graphs import delete_cuda_graphs
+from megatron.core.transformer.cuda_graphs import CudaGraphManager, delete_cuda_graphs
 from megatron.core.transformer.enums import CudaGraphScope
 from megatron.core.transformer.moe.router_replay import RouterReplay, RouterReplayAction
 from megatron.core.utils import (
@@ -133,6 +133,8 @@ class EngineSuspendedError(Exception):
 
 def format_mem_bytes(mem_bytes):
     """Convert a byte count to a human-readable string in tb, gb, mb, kb, or bytes."""
+    if mem_bytes < 0:
+        return "-" + format_mem_bytes(-mem_bytes)
     for power, suffix in [(4, "tb"), (3, "gb"), (2, "mb"), (1, "kb"), (0, "bytes")]:
         suffix_bytes = 1024**power
         if mem_bytes >= suffix_bytes:
@@ -140,6 +142,33 @@ def format_mem_bytes(mem_bytes):
     return "%d bytes" % mem_bytes
 
 
+def _cuda_graph_mempool_bytes():
+    """Return (reserved, allocated) bytes belonging to the global CUDA graph mempool.
+
+    PyTorch's `torch.cuda.memory_stats()` reports process-wide totals that mix in
+    every other allocation (KV cache, NCCL workspaces, layer scratch). To isolate
+    growth caused by graph capture, we walk `torch.cuda.memory_snapshot()` and
+    filter segments by their `segment_pool_id` against the graph pool handle.
+    Returns (0, 0) if the pool hasn't been created yet.
+    """
+    pool_id = CudaGraphManager.global_mempool
+    if pool_id is None:
+        return 0, 0
+    reserved = 0
+    allocated = 0
+    for seg in torch.cuda.memory_snapshot():
+        seg_pool_id = (
+            seg.get("segment_pool_id")
+            or seg.get("private_pool_id")
+            or seg.get("pool_id")
+            or seg.get("pool")
+        )
+        if seg_pool_id == pool_id:
+            reserved += seg.get("total_size", 0)
+            allocated += seg.get("allocated_size", 0)
+    return reserved, allocated
+
+
 @dataclass(kw_only=True)
 class RequestEntry:
     """Entry in the engine's `self.requests` dict."""
@@ -350,7 +379,15 @@ def create_cuda_graphs(self, reset_context: bool = True):
         controller = self.controller
 
         time_start = time.time()
+        torch.cuda.reset_peak_memory_stats()
         mem_stats_start = torch.cuda.memory_stats()
+        # Snapshot of process-wide stats for the "total memory used by capture"
+        # summary printed at the end of the loop.
+        start_proc_reserved = mem_stats_start["reserved_bytes.all.current"]
+        start_proc_alloc = mem_stats_start["allocated_bytes.all.current"]
+        # Pool-scoped baselines for the per-iteration deltas. These isolate
+        # actual CUDA-graph-mempool growth from unrelated scratch churn.
+        prev_pool_reserved, prev_pool_alloc = _cuda_graph_mempool_bytes()
 
         logging.info("> dynamic_engine.py: building cuda graphs for ")
         for graph in context.cuda_graph_batch_dimensions_list:
@@ -432,27 +469,49 @@ def create_cuda_graphs(self, reset_context: bool = True):
 
                 context.reset()
 
+            # Per-iteration memory accounting, scoped to the CUDA-graph mempool.
+            # This isolates pool growth from process-wide scratch churn (KV cache,
+            # NCCL workspaces, etc.) that pollutes `torch.cuda.memory_stats()`.
+            pool_reserved, pool_alloc = _cuda_graph_mempool_bytes()
+            logging.info(
+                "  [graph %d/%d] %s | pool reserved=%s (Δiter=%s) "
+                "pool allocated=%s (Δiter=%s)",
+                tbar_idx + 1,
+                len(context.cuda_graph_batch_dimensions_list),
+                cuda_graph_batch_dimension,
+                format_mem_bytes(pool_reserved),
+                format_mem_bytes(pool_reserved - prev_pool_reserved),
+                format_mem_bytes(pool_alloc),
+                format_mem_bytes(pool_alloc - prev_pool_alloc),
+            )
+            prev_pool_reserved, prev_pool_alloc = pool_reserved, pool_alloc
+            torch.cuda.reset_peak_memory_stats()
+
         if mtp_warmup_enabled and mtp_seen_batch_sizes:
             logging.info("> MTP CUDA graph warmup: %d batch size(s)", len(mtp_seen_batch_sizes))
 
         # Memory usage.
         time_end = time.time()
         mem_stats_end = torch.cuda.memory_stats()
+        final_pool_reserved, final_pool_alloc = _cuda_graph_mempool_bytes()
         capture_stats = {
             "time": time_end - time_start,
             "allocated_bytes": (
-                mem_stats_end["allocated_bytes.all.current"]
-                - mem_stats_start["allocated_bytes.all.current"]
+                mem_stats_end["allocated_bytes.all.current"] - start_proc_alloc
             ),
             "reserved_bytes": (
-                mem_stats_end["reserved_bytes.all.current"]
-                - mem_stats_start["reserved_bytes.all.current"]
+                mem_stats_end["reserved_bytes.all.current"] - start_proc_reserved
             ),
+            "pool_reserved_bytes": final_pool_reserved,
+            "pool_allocated_bytes": final_pool_alloc,
         }
         logging.info(
-            "> built cuda graph(s) in %.2f sec, with total memory usage: "
-            "allocated %s, reserved %s.",
+            "> built cuda graph(s) in %.2f sec. "
+            "Mempool: reserved %s, allocated %s. "
+            "Process-wide delta: allocated %s, reserved %s.",
             capture_stats["time"],
+            format_mem_bytes(capture_stats["pool_reserved_bytes"]),
+            format_mem_bytes(capture_stats["pool_allocated_bytes"]),
             format_mem_bytes(capture_stats["allocated_bytes"]),
             format_mem_bytes(capture_stats["reserved_bytes"]),
         )
diff --git a/tests/unit_tests/inference/engines/test_dynamic_engine.py b/tests/unit_tests/inference/engines/test_dynamic_engine.py
index e549b72e78f..161c0eed509 100644
--- a/tests/unit_tests/inference/engines/test_dynamic_engine.py
+++ b/tests/unit_tests/inference/engines/test_dynamic_engine.py
@@ -17,7 +17,6 @@
 from transformer_engine.pytorch.fp8 import check_fp8_support
 
 from megatron.core import parallel_state
-from megatron.core.inference.batch_dimensions_utils import CUDAGraphBatchDimensionBuilder
 from megatron.core.inference.config import (
     InferenceConfig,
     KVCacheManagementMode,
@@ -794,23 +793,28 @@ def test_fixed_output_lengths(self, model_provider: str) -> None:
     def test_cuda_graph_token_counts(self, use_non_decode: bool) -> None:
         """Test initialization of `cuda_graph_token_counts` in dynamic context."""
 
+        # Exponential-decay graph distribution (halve from max down to tp_size).
+        # decode-only path: cuda_graph_max_tokens = max_requests * (spec+1) = 80.
+        # non-decode path: cuda_graph_max_tokens = self.max_tokens (DEFAULT 16384);
+        # most large prefill sizes are filtered by is_valid because
+        # token_count > prefill_req_count * (max_sequence_length - 1).
         decode_only_cases = [
             (0, [80]),
             (1, [80]),
-            (2, [80, 40]),
-            (4, [80, 72, 48, 24]),
-            (8, [80, 64, 48, 32, 16]),
-            (16, [80, 72, 64, 56, 48, 40, 32, 24, 16, 8]),
-            (32, [80, 72, 64, 56, 48, 40, 32, 24, 16, 8]),
+            (2, [80, 1]),
+            (4, [80, 40, 20, 1]),
+            (8, [80, 40, 20, 10, 4, 2, 1]),
+            (16, [80, 40, 20, 10, 4, 2, 1]),
+            (32, [80, 40, 20, 10, 4, 2, 1]),
         ]
         non_decode_cases = [
             (0, [80]),
             (1, [80]),
-            (2, [80, 40]),
-            (4, [80, 72, 48, 24]),
-            (8, [80, 64, 48, 32, 16]),
-            (16, [1024, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8]),
-            (32, [1024, 512, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8]),
+            (2, [80, 1]),
+            (4, [80, 40, 20, 1]),
+            (8, [1024, 512, 256, 80, 40, 20, 10, 4, 2, 1]),
+            (16, [1024, 512, 256, 128, 80, 64, 40, 32, 20, 16, 10, 8, 4, 2, 1]),
+            (32, [1024, 512, 256, 128, 80, 64, 40, 32, 20, 16, 10, 8, 4, 2, 1]),
         ]
         cases = non_decode_cases if use_non_decode else decode_only_cases
 
@@ -837,39 +841,6 @@ def test_cuda_graph_token_counts(self, use_non_decode: bool) -> None:
                 )
             )
 
-    @pytest.mark.internal
-    @pytest.mark.parametrize(
-        "tp_size, num_cuda_graphs, cuda_graph_max_tokens, expected",
-        [
-            # TP=1
-            (1, 1, 80, [80]),
-            (1, 2, 80, [80, 1]),
-            (1, 4, 80, [80, 40, 20, 1]),
-            (1, 8, 80, [80, 40, 20, 10, 4, 2, 1]),
-            (1, 16, 80, [80, 40, 20, 10, 4, 2, 1]),
-            # TP=2
-            (2, 1, 80, [80]),
-            (2, 2, 80, [80, 2]),
-            (2, 4, 80, [80, 40, 20, 2]),
-            (2, 8, 80, [80, 40, 20, 10, 4, 2]),
-            (2, 16, 80, [80, 40, 20, 10, 4, 2]),
-        ],
-    )
-    def test_calculate_cuda_graph_token_counts(
-        self, tp_size, num_cuda_graphs, cuda_graph_max_tokens, expected
-    ):
-        """Test _calculate_cuda_graph_token_counts for various TP sizes."""
-        actual = CUDAGraphBatchDimensionBuilder._calculate_cuda_graph_token_counts(
-            tp_size=tp_size,
-            num_cuda_graphs=num_cuda_graphs,
-            cuda_graph_max_tokens=cuda_graph_max_tokens,
-        )
-        assert actual == expected, (
-            f"tp_size={tp_size}, num_cuda_graphs={num_cuda_graphs}, "
-            f"cuda_graph_max_tokens={cuda_graph_max_tokens}: "
-            f"expected {expected}, got {actual}"
-        )
-
     @pytest.mark.internal
     @pytest.mark.skipif(
         not is_fa_min_version("2.7.3"), reason="need latest flash attn for dynamic batching"

From 48cda16c17adcb5cce5f8d29e985de0905a5348e Mon Sep 17 00:00:00 2001
From: Helen Ngo <helenn@nvidia.com>
Date: Mon, 18 May 2026 10:34:31 -0700
Subject: [PATCH 10/15] fix import

---
 megatron/core/inference/engines/dynamic_engine.py | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/megatron/core/inference/engines/dynamic_engine.py b/megatron/core/inference/engines/dynamic_engine.py
index 36064523c07..f520cc660d3 100644
--- a/megatron/core/inference/engines/dynamic_engine.py
+++ b/megatron/core/inference/engines/dynamic_engine.py
@@ -44,7 +44,7 @@
 )
 from megatron.core.inference.utils import Counter, InferenceMode, await_process_call
 from megatron.core.process_groups_config import ProcessGroupCollection
-from megatron.core.transformer.cuda_graphs import delete_cuda_graphs
+from megatron.core.transformer.cuda_graphs import CudaGraphManager, delete_cuda_graphs
 from megatron.core.transformer.enums import InferenceCudaGraphScope
 from megatron.core.transformer.moe.router_replay import RouterReplay, RouterReplayAction
 from megatron.core.utils import (
@@ -469,8 +469,7 @@ def create_cuda_graphs(self, reset_context: bool = True):
             # NCCL workspaces, etc.) that pollutes `torch.cuda.memory_stats()`.
             pool_reserved, pool_alloc = _cuda_graph_mempool_bytes()
             logging.info(
-                "  [graph %d/%d] %s | pool reserved=%s (Δiter=%s) "
-                "pool allocated=%s (Δiter=%s)",
+                "  [graph %d/%d] %s | pool reserved=%s (Δiter=%s) " "pool allocated=%s (Δiter=%s)",
                 tbar_idx + 1,
                 len(context.cuda_graph_batch_dimensions_list),
                 cuda_graph_batch_dimension,
@@ -491,12 +490,8 @@ def create_cuda_graphs(self, reset_context: bool = True):
         final_pool_reserved, final_pool_alloc = _cuda_graph_mempool_bytes()
         capture_stats = {
             "time": time_end - time_start,
-            "allocated_bytes": (
-                mem_stats_end["allocated_bytes.all.current"] - start_proc_alloc
-            ),
-            "reserved_bytes": (
-                mem_stats_end["reserved_bytes.all.current"] - start_proc_reserved
-            ),
+            "allocated_bytes": (mem_stats_end["allocated_bytes.all.current"] - start_proc_alloc),
+            "reserved_bytes": (mem_stats_end["reserved_bytes.all.current"] - start_proc_reserved),
             "pool_reserved_bytes": final_pool_reserved,
             "pool_allocated_bytes": final_pool_alloc,
         }

From 9488712074c86b9f4ba394f5155626b6a19cc698 Mon Sep 17 00:00:00 2001
From: Helen Ngo <helenn@nvidia.com>
Date: Mon, 18 May 2026 10:54:24 -0700
Subject: [PATCH 11/15] minor cleanup

---
 megatron/core/inference/engines/dynamic_engine.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/megatron/core/inference/engines/dynamic_engine.py b/megatron/core/inference/engines/dynamic_engine.py
index f520cc660d3..8d955c0b001 100644
--- a/megatron/core/inference/engines/dynamic_engine.py
+++ b/megatron/core/inference/engines/dynamic_engine.py
@@ -376,12 +376,12 @@ def create_cuda_graphs(self, reset_context: bool = True):
         time_start = time.time()
         torch.cuda.reset_peak_memory_stats()
         mem_stats_start = torch.cuda.memory_stats()
-        # Snapshot of process-wide stats for the "total memory used by capture"
-        # summary printed at the end of the loop.
+
+        # Snapshot of process-wide stats for the "total memory used by capture" summary.
         start_proc_reserved = mem_stats_start["reserved_bytes.all.current"]
         start_proc_alloc = mem_stats_start["allocated_bytes.all.current"]
-        # Pool-scoped baselines for the per-iteration deltas. These isolate
-        # actual CUDA-graph-mempool growth from unrelated scratch churn.
+
+        # Pool-scoped baselines for the per-iteration deltas.
         prev_pool_reserved, prev_pool_alloc = _cuda_graph_mempool_bytes()
 
         logging.info("> dynamic_engine.py: building cuda graphs for ")

From 661179aa518b72f688100a082aac0a8a7013f274 Mon Sep 17 00:00:00 2001
From: Helen Ngo <helenn@nvidia.com>
Date: Mon, 18 May 2026 12:35:34 -0700
Subject: [PATCH 12/15] fix failing test

---
 .../inference/contexts/test_dynamic_context.py         | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tests/unit_tests/inference/contexts/test_dynamic_context.py b/tests/unit_tests/inference/contexts/test_dynamic_context.py
index 6a4838f2c46..978460f2ca9 100644
--- a/tests/unit_tests/inference/contexts/test_dynamic_context.py
+++ b/tests/unit_tests/inference/contexts/test_dynamic_context.py
@@ -1602,10 +1602,16 @@ def test_add_dummy_requests_for_expert_parallel_step_matches_slow_path(
             num_speculative_tokens=num_speculative_tokens,
         )
 
-        smallest = min(ctx.cuda_graph_batch_dimensions_list)
+        # The fast path is decode-only by construction, so pick the smallest decode-only batch_dim.
+        # With the geometric grid for mixed cudagraphs, the global min may now be a P=1 mixed shape
+        # (when num_speculative_tokens > 0 makes decode-only token_count > 1).
+        smallest = min(
+            batchdim
+            for batchdim in ctx.cuda_graph_batch_dimensions_list
+            if batchdim.prefill_req_count == 0
+        )
         N = smallest.decode_req_count
         T = smallest.token_count  # N * (num_speculative_tokens + 1)
-        assert smallest.prefill_req_count == 0, "smallest graph must be decode-only"
 
         # --- slow path (reference) ---
         ctx.add_dummy_requests_for_cudagraph_capture(smallest)

From b3472940d7050f25b739630dea6ab5d6f53e5a2d Mon Sep 17 00:00:00 2001
From: Helen Ngo <helenn@nvidia.com>
Date: Mon, 18 May 2026 13:02:56 -0700
Subject: [PATCH 13/15] fix test

---
 .../core/inference/batch_dimensions_utils.py   | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/megatron/core/inference/batch_dimensions_utils.py b/megatron/core/inference/batch_dimensions_utils.py
index 1ecdfa87d07..e3e8264b3b0 100644
--- a/megatron/core/inference/batch_dimensions_utils.py
+++ b/megatron/core/inference/batch_dimensions_utils.py
@@ -476,6 +476,11 @@ def add_if_valid(token_count: int, prefill_req_count: int, decode_req_count: int
                             max_requests - prefill_req_count,
                         ),
                     )
+                    # Skip pure-prefill shapes where each prefill request has <= 1 token. The model
+                    # has no prompt to attend over so the graph isn't useful and it triggers a
+                    # vectorized_gather_kernel OOB at capture time when token_count == 1.
+                    if decode_req_count == 0 and size <= prefill_req_count:
+                        continue
                     add_if_valid(
                         token_count=size,
                         prefill_req_count=prefill_req_count,
@@ -485,11 +490,14 @@ def add_if_valid(token_count: int, prefill_req_count: int, decode_req_count: int
                 # considering the one decode token is used for prefill request construction
                 prefill_only_minimal_num = max(1, math.ceil(size / max(1, max_sequence_length - 1)))
                 if prefill_only_minimal_num < max_requests:
-                    add_if_valid(
-                        token_count=size,
-                        prefill_req_count=max(prefill_only_minimal_num, min(max_requests, size)),
-                        decode_req_count=0,
-                    )
+                    prefill_req_count = max(prefill_only_minimal_num, min(max_requests, size))
+                    # Do not add invalid cases (see above for note on prefill shapes).
+                    if size > prefill_req_count:
+                        add_if_valid(
+                            token_count=size,
+                            prefill_req_count=prefill_req_count,
+                            decode_req_count=0,
+                        )
 
             # Create decode-only dimensions with optimized token counts
             for size in cuda_graph_decode_token_counts:

From 33e65c404256b75252b2b26bc9be289ede649d5c Mon Sep 17 00:00:00 2001
From: Helen Ngo <helenn@nvidia.com>
Date: Mon, 18 May 2026 13:27:31 -0700
Subject: [PATCH 14/15] fix tests and recover small graphs for perf

---
 .../core/inference/batch_dimensions_utils.py  | 21 ++++++++-----------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/megatron/core/inference/batch_dimensions_utils.py b/megatron/core/inference/batch_dimensions_utils.py
index e3e8264b3b0..e54b20e2418 100644
--- a/megatron/core/inference/batch_dimensions_utils.py
+++ b/megatron/core/inference/batch_dimensions_utils.py
@@ -476,10 +476,11 @@ def add_if_valid(token_count: int, prefill_req_count: int, decode_req_count: int
                             max_requests - prefill_req_count,
                         ),
                     )
-                    # Skip pure-prefill shapes where each prefill request has <= 1 token. The model
-                    # has no prompt to attend over so the graph isn't useful and it triggers a
-                    # vectorized_gather_kernel OOB at capture time when token_count == 1.
-                    if decode_req_count == 0 and size <= prefill_req_count:
+                    # Skip token_count == 1 with prefill_req == 1: the gather kernel asserts
+                    # on index >= 1 against a 1-element tensor at capture time. Larger
+                    # `(size, size, 0)` shapes (each prefill = 1 token, total batch >= 2) are
+                    # fine because the gather has multiple indices to read.
+                    if size < 2:
                         continue
                     add_if_valid(
                         token_count=size,
@@ -489,15 +490,11 @@ def add_if_valid(token_count: int, prefill_req_count: int, decode_req_count: int
                 # We need to ensure the prefill requests are shorter than the max sequence length,
                 # considering the one decode token is used for prefill request construction
                 prefill_only_minimal_num = max(1, math.ceil(size / max(1, max_sequence_length - 1)))
-                if prefill_only_minimal_num < max_requests:
+                if prefill_only_minimal_num < max_requests and size >= 2:
                     prefill_req_count = max(prefill_only_minimal_num, min(max_requests, size))
-                    # Do not add invalid cases (see above for note on prefill shapes).
-                    if size > prefill_req_count:
-                        add_if_valid(
-                            token_count=size,
-                            prefill_req_count=prefill_req_count,
-                            decode_req_count=0,
-                        )
+                    add_if_valid(
+                        token_count=size, prefill_req_count=prefill_req_count, decode_req_count=0
+                    )
 
             # Create decode-only dimensions with optimized token counts
             for size in cuda_graph_decode_token_counts:

From 8561960aedf3080031ca6b876d015c54ec1599d4 Mon Sep 17 00:00:00 2001
From: Helen Ngo <helenn@nvidia.com>
Date: Tue, 19 May 2026 16:31:01 -0700
Subject: [PATCH 15/15] keshav fixes

---
 .../core/inference/batch_dimensions_utils.py  | 178 +++++++++++++-----
 megatron/core/inference/config.py             |  28 ++-
 .../inference/contexts/dynamic_context.py     |   1 +
 .../core/inference/engines/dynamic_engine.py  |   4 +-
 megatron/inference/utils.py                   |   4 +
 megatron/training/arguments.py                |   8 +
 6 files changed, 175 insertions(+), 48 deletions(-)

diff --git a/megatron/core/inference/batch_dimensions_utils.py b/megatron/core/inference/batch_dimensions_utils.py
index e54b20e2418..1229d333d0a 100644
--- a/megatron/core/inference/batch_dimensions_utils.py
+++ b/megatron/core/inference/batch_dimensions_utils.py
@@ -217,34 +217,59 @@ class CUDAGraphBatchDimensionBuilder:
 
     @staticmethod
     def _calculate_cuda_graph_token_counts(
-        tp_size: int, num_cuda_graphs: int, cuda_graph_max_tokens: int
+        tp_size: int,
+        num_cuda_graphs: int,
+        cuda_graph_max_tokens: int,
+        sizing_distribution: "CudaGraphSizingDistribution" = None,
     ) -> List[int]:
         """
         Calculate CUDA graph token counts for a given configuration.
 
-        This method computes exponentially-decreasing token counts (powers of 2)
-        from cuda_graph_max_tokens down to CUDA_GRAPH_ROUNDER, ensuring proper
-        rounding and TP alignment.
+        Dispatches on `sizing_distribution`:
+          - EXPONENTIAL (default): halves from cuda_graph_max_tokens down to tp_size, log-spaced,
+            creates log2(max_tokens) graphs.
+          - LINEAR: small graphs [1, 2, 4] + range(8, 256, 8) + range(256, max+1, 16);
+            explicit-N path uses an even stride of ~max/N (rounder- and TP-aligned) up to max.
 
         Args:
             tp_size: Tensor parallel size (for alignment)
-            num_cuda_graphs: Number of CUDA graphs to generate (must be >= 1)
+            num_cuda_graphs: Number of CUDA graphs to generate (must be >= 1, or -1 to auto-size)
             cuda_graph_max_tokens: Maximum token count for CUDA graphs (must be > 0)
+            sizing_distribution: Distribution of cudagraph sizes. Defaults to EXPONENTIAL.
 
         Returns:
             List of token counts in descending order
 
-        Example:
-            >>> _calculate_cuda_graph_token_counts
-            (tp_size=1, num_cuda_graphs=8, cuda_graph_max_tokens=128)
+        Example (EXPONENTIAL):
+            >>> _calculate_cuda_graph_token_counts(tp_size=1, num_cuda_graphs=8,
+            cuda_graph_max_tokens=128)
             [128, 64, 32, 16, 8, 4, 2, 1]
         """
+        from megatron.core.inference.config import CudaGraphSizingDistribution
+
+        if sizing_distribution is None:
+            sizing_distribution = CudaGraphSizingDistribution.EXPONENTIAL
+
+        if sizing_distribution == CudaGraphSizingDistribution.LINEAR:
+            return CUDAGraphBatchDimensionBuilder._calculate_token_counts_linear(
+                tp_size, num_cuda_graphs, cuda_graph_max_tokens
+            )
+
+        # Default path: exponential decay.
         if num_cuda_graphs == -1:
-            # Each step in the exponential-decay loop below halves the cudagraph size, so we need
-            # ~log2(max_tokens) steps with an extra +2 to leave headroom for dedup/trim.
-            auto_n = max(4, int(math.log2(max(2, cuda_graph_max_tokens))) + 2)
+            # Pick a graph count: we halve from cuda_graph_max_tokens down to 1, so
+            # log2(max_tokens) halvings are needed. Add a small margin for the two forced endpoints
+            # (cuda_graph_max_tokens and tp_size) that are unioned into the set after the loop.
+            # Floor at MIN_GRAPHS so the trim logic always has at least 2 entries to work with.
+            HEADROOM = 2
+            MIN_GRAPHS = 4
+            num_halvings = int(math.log2(max(2, cuda_graph_max_tokens)))
+            auto_n = max(MIN_GRAPHS, num_halvings + HEADROOM)
             return CUDAGraphBatchDimensionBuilder._calculate_cuda_graph_token_counts(
-                tp_size=tp_size, num_cuda_graphs=auto_n, cuda_graph_max_tokens=cuda_graph_max_tokens
+                tp_size=tp_size,
+                num_cuda_graphs=auto_n,
+                cuda_graph_max_tokens=cuda_graph_max_tokens,
+                sizing_distribution=sizing_distribution,
             )
 
         assert num_cuda_graphs >= 1, f"num_cuda_graphs must be >= 1, got {num_cuda_graphs}"
@@ -270,30 +295,28 @@ def _calculate_cuda_graph_token_counts(
         if num_cuda_graphs == 1:
             return [cuda_graph_max_tokens]
 
-        # Exponentially decreasing, stops after num_cuda_graphs entries
-        # or when below the minimum size.
-        cuda_graph_token_counts = []
+        # Exponentially decreasing token counts: halve from max_tokens until the value drops
+        # below 1 or num_cuda_graphs iterations have run. Dedupe (the rounding/TP-alignment can
+        # collide for small values), then sort descending.
+        sizes = set()
         val = cuda_graph_max_tokens
         for _ in range(num_cuda_graphs):
             # Round down to multiple of rounder, then up to multiple of TP size
             rounded = max(rounder, (val // rounder) * rounder)
             rounded = math.ceil(rounded / tp_size) * tp_size
-            if rounded not in cuda_graph_token_counts:
-                cuda_graph_token_counts.append(rounded)
+            sizes.add(rounded)
             val //= 2
             if val < 1:
                 break
 
-        # Ensure cuda_graph_max_tokens is always included
-        if cuda_graph_token_counts[0] != cuda_graph_max_tokens:
-            cuda_graph_token_counts.insert(0, cuda_graph_max_tokens)
+        # Always include the endpoints: cuda_graph_max_tokens (largest) and tp_size (smallest).
+        sizes.add(cuda_graph_max_tokens)
+        sizes.add(tp_size)
 
-        # Include a (possibly extra) size-1 graph
-        if cuda_graph_token_counts[-1] != tp_size:
-            cuda_graph_token_counts.append(tp_size)
+        cuda_graph_token_counts = sorted(sizes, reverse=True)
 
-        # Trim from the middle if we exceed num_cuda_graphs requested by the user
-        #  Since num_cuda_graphs >= 1, this only runs when we have at least 2 elements.
+        # Trim from the middle if we exceed num_cuda_graphs requested by the user.
+        # Since num_cuda_graphs >= 1, this only runs when we have at least 2 elements.
         while len(cuda_graph_token_counts) > num_cuda_graphs:
             cuda_graph_token_counts.pop(-2)
 
@@ -302,6 +325,51 @@ def _calculate_cuda_graph_token_counts(
 
         return cuda_graph_token_counts
 
+    @staticmethod
+    def _calculate_token_counts_linear(
+        tp_size: int, num_cuda_graphs: int, cuda_graph_max_tokens: int
+    ) -> List[int]:
+        """Linear-stride token count distribution.
+
+        For num_cuda_graphs == -1, returns [1, 2, 4] + range(8, 256, 8) + range(256, max+1, 16),
+        TP-aligned and deduped, in descending order with max_tokens always included.
+        For positive N, returns evenly-spaced sizes with step ~ max_tokens / N.
+        """
+        rounder = CUDAGraphBatchDimensionBuilder.CUDA_GRAPH_ROUNDER
+
+        if num_cuda_graphs == -1:
+            sizes = (
+                [1, 2, 4] + list(range(8, 256, 8)) + list(range(256, cuda_graph_max_tokens + 1, 16))
+            )
+            # TP-align and dedupe in order; preserve original ordering for parity.
+            sizes = list(dict.fromkeys(round_up_to_nearest_multiple(s, tp_size) for s in sizes))
+            sizes = [s for s in sizes if s <= cuda_graph_max_tokens]
+            if not sizes or sizes[-1] != cuda_graph_max_tokens:
+                sizes.append(cuda_graph_max_tokens)
+            sizes.reverse()
+            return sizes
+
+        assert num_cuda_graphs >= 1, f"num_cuda_graphs must be >= 1, got {num_cuda_graphs}"
+        assert (
+            cuda_graph_max_tokens > 0
+        ), f"cuda_graph_max_tokens must be > 0, got {cuda_graph_max_tokens}"
+
+        # Even stride: step = round_up_to(max / N, rounder), TP-aligned.
+        step = cuda_graph_max_tokens / num_cuda_graphs
+        step = rounder * int(math.ceil(int(step) / rounder))
+        step = round_up_to_nearest_multiple(step, tp_size)
+        step = max(step, tp_size)
+        cuda_graph_max_tokens = (cuda_graph_max_tokens // tp_size) * tp_size
+
+        if num_cuda_graphs == 1:
+            return [cuda_graph_max_tokens]
+
+        sizes = list(range(step, cuda_graph_max_tokens, step))
+        if not sizes or sizes[-1] != cuda_graph_max_tokens:
+            sizes.append(cuda_graph_max_tokens)
+        sizes.reverse()
+        return sizes
+
     @staticmethod
     def generate_cuda_graph_batch_dimensions_list(
         tp_size: int,
@@ -313,6 +381,7 @@ def generate_cuda_graph_batch_dimensions_list(
         max_sequence_length: int,
         use_cuda_graphs_for_non_decode_steps: bool,
         num_speculative_tokens: int = 0,
+        sizing_distribution: "CudaGraphSizingDistribution" = None,
     ) -> Tuple[List[InferenceBatchDimensions], Optional[List[int]]]:
         """
         Generate CUDA graph batch dimensions.
@@ -370,6 +439,12 @@ def add_if_valid(token_count: int, prefill_req_count: int, decode_req_count: int
         cuda_graph_decode_token_counts = None
         if num_cuda_graphs is not None:
 
+            # Lazy import to avoid a circular dependency with config.py.
+            from megatron.core.inference.config import CudaGraphSizingDistribution
+
+            if sizing_distribution is None:
+                sizing_distribution = CudaGraphSizingDistribution.EXPONENTIAL
+
             # Ensure valid num_cuda_graphs.
             if (
                 cuda_graph_max_tokens is None
@@ -396,6 +471,7 @@ def add_if_valid(token_count: int, prefill_req_count: int, decode_req_count: int
                     tp_size=tp_size,
                     num_cuda_graphs=num_cuda_graphs,
                     cuda_graph_max_tokens=cuda_graph_max_tokens,
+                    sizing_distribution=sizing_distribution,
                 )
             )
 
@@ -408,12 +484,19 @@ def add_if_valid(token_count: int, prefill_req_count: int, decode_req_count: int
                     tp_size=tp_size,
                     num_cuda_graphs=num_cuda_graphs,
                     cuda_graph_max_tokens=cuda_graph_max_tokens_decode,
+                    sizing_distribution=sizing_distribution,
                 )
             )
 
             # Include the smallest decode-only graphs when auto-sizing (num_cuda_graphs == -1).
-            # Without this, TP alignment (size 1 -> tp_size) and the num_speculative_tokens floor
-            # division may drop the size 1 and size 2 graph sizes.
+            # Without this, TP alignment and the num_speculative_tokens floor division can drop
+            # the smallest 1- and 2-request shapes from the captured set.
+            #
+            # The minimum valid decode token_count is lcm(spec_unit, tp_size):
+            #   - Ensure divisible by tp_size (required so TP / sequence-parallel never produces a
+            #     single-token graph when tp_size > 1).
+            #   - Ensure a multiple of (spec+1) so it accommodates an integer number of decode
+            #     requests when speculative decoding is enabled.
             if num_cuda_graphs == -1:
                 spec_unit = num_speculative_tokens + 1
                 min_decode_tokens = math.lcm(spec_unit, tp_size)
@@ -447,23 +530,28 @@ def add_if_valid(token_count: int, prefill_req_count: int, decode_req_count: int
         else:
             # Mixed prefill and decode mode.
             #
-            # Generate mixed CGs across a geometric P-grid rather than a single fixed P
-            # value. A captured graph at P=k bakes in a token layout for k prefill slots;
-            # a real batch with P != k pays for unused slot padding, which can overshoot
-            # the captured token_count and make the graph unusable. Geometric spacing
-            # bounds the relative overhead per real batch (worst case ~2x P slack) while
-            # keeping the total CG count log-bounded.
+            # Under EXPONENTIAL distribution (default): generate mixed CGs across a
+            # geometric P-grid {1, 2, 4, ..., max_requests}. This bounds the relative
+            # overhead per real batch (~2x P slack worst case) and is the structural fix
+            # that makes mixed CGs usable for real batches with P != fixed_P.
             #
-            # `cuda_graph_mixed_prefill_request_count` is now used only as an on/off
-            # toggle for mixed CGs (>0 enables, <=0 routes to decode-only above). The
-            # P value it used to specify is superseded by the grid.
-            p_values = []
-            p = 1
-            while p < max_requests:
-                p_values.append(p)
-                p *= 2
-            if not p_values or p_values[-1] != max_requests:
-                p_values.append(max_requests)
+            # Under LINEAR distribution: use the legacy fixed P value
+            # (cuda_graph_mixed_prefill_request_count) — same single-P behavior main has
+            # today, for apples-to-apples benchmarking against vLLM-style configurations.
+            if sizing_distribution == CudaGraphSizingDistribution.LINEAR:
+                p_values = [min(cuda_graph_mixed_prefill_request_count, max_requests)]
+                # In legacy mode, the prefill-only floor uses the fixed P value to match
+                # main's behavior exactly.
+                prefill_only_floor = cuda_graph_mixed_prefill_request_count
+            else:
+                p_values = []
+                p = 1
+                while p < max_requests:
+                    p_values.append(p)
+                    p *= 2
+                if not p_values or p_values[-1] != max_requests:
+                    p_values.append(max_requests)
+                prefill_only_floor = 1
 
             # Create prefill and mixed dimensions with full token counts
             for size in cuda_graph_prefill_token_counts:
@@ -489,7 +577,9 @@ def add_if_valid(token_count: int, prefill_req_count: int, decode_req_count: int
                     )
                 # We need to ensure the prefill requests are shorter than the max sequence length,
                 # considering the one decode token is used for prefill request construction
-                prefill_only_minimal_num = max(1, math.ceil(size / max(1, max_sequence_length - 1)))
+                prefill_only_minimal_num = max(
+                    prefill_only_floor, math.ceil(size / max(1, max_sequence_length - 1))
+                )
                 if prefill_only_minimal_num < max_requests and size >= 2:
                     prefill_req_count = max(prefill_only_minimal_num, min(max_requests, size))
                     add_if_valid(
diff --git a/megatron/core/inference/config.py b/megatron/core/inference/config.py
index e8769f3d6e7..ea4b08e5183 100644
--- a/megatron/core/inference/config.py
+++ b/megatron/core/inference/config.py
@@ -117,6 +117,22 @@ class KVCacheManagementMode(str, Enum):
     """Deallocate large tensors and recompute them from scratch during allocation."""
 
 
+class CudaGraphSizingDistribution(str, Enum):
+    """How CUDA graph token-count sizes are spaced when generating the captured graphs.
+
+    EXPONENTIAL (default) — token counts halve from `cuda_graph_max_tokens` down to `tp_size`,
+    giving a log-spaced distribution. Bounded relative padding (~2x worst case) at every scale and
+    `log2(max_tokens)` total graphs.
+
+    LINEAR — Include size-1 and size-2 graphs where applicable, linear spacing up until 256, and
+    sparser linear spacing past 256. e.g. `[1, 2, 4] + range(8, 256, 8) + range(256, max+1, 16)`.
+    Higher graph density at the top end.
+    """
+
+    EXPONENTIAL = "exponential"
+    LINEAR = "linear"
+
+
 @dataclass
 class InferenceConfig:
     """
@@ -197,10 +213,20 @@ class InferenceConfig:
     """
 
     cuda_graph_mixed_prefill_count: Optional[int] = 16
-    """ 
+    """
     The number of mixed prefill graphs to capture if mixed prefill/decode graphs are enabled.
     """
 
+    cuda_graph_sizing_distribution: CudaGraphSizingDistribution = (
+        CudaGraphSizingDistribution.EXPONENTIAL
+    )
+    """
+    How CUDA graph token counts are spaced. EXPONENTIAL (default) halves from
+    `cuda_graph_max_tokens` down to `tp_size` (log-spaced, ~log2(max_tokens) graphs).
+    LINEAR uses piecewise-linear strides (small graphs, a fine stride through the mid-range,
+    and a coarser stride at the top end).
+    """
+
     use_cuda_graphs_for_non_decode_steps: bool = True
     """
     Whether to use CUDA graphs for non-decode steps.
diff --git a/megatron/core/inference/contexts/dynamic_context.py b/megatron/core/inference/contexts/dynamic_context.py
index 6c2dcb47340..8fe02158b1b 100644
--- a/megatron/core/inference/contexts/dynamic_context.py
+++ b/megatron/core/inference/contexts/dynamic_context.py
@@ -644,6 +644,7 @@ def __init__(self, model_config: TransformerConfig, inference_config: InferenceC
                 max_sequence_length=self.max_sequence_length,
                 use_cuda_graphs_for_non_decode_steps=self.use_cuda_graphs_for_non_decode_steps,
                 num_speculative_tokens=self.num_speculative_tokens,
+                sizing_distribution=inference_config.cuda_graph_sizing_distribution,
             )
         )
 
diff --git a/megatron/core/inference/engines/dynamic_engine.py b/megatron/core/inference/engines/dynamic_engine.py
index 8d955c0b001..8a43eb0f7ae 100644
--- a/megatron/core/inference/engines/dynamic_engine.py
+++ b/megatron/core/inference/engines/dynamic_engine.py
@@ -142,7 +142,7 @@ def format_mem_bytes(mem_bytes):
     return "%d bytes" % mem_bytes
 
 
-def _cuda_graph_mempool_bytes():
+def _cuda_graph_mempool_bytes() -> Tuple[int, int]:
     """Return (reserved, allocated) bytes belonging to the global CUDA graph mempool.
 
     PyTorch's `torch.cuda.memory_stats()` reports process-wide totals that mix in
@@ -374,7 +374,6 @@ def create_cuda_graphs(self, reset_context: bool = True):
         controller = self.controller
 
         time_start = time.time()
-        torch.cuda.reset_peak_memory_stats()
         mem_stats_start = torch.cuda.memory_stats()
 
         # Snapshot of process-wide stats for the "total memory used by capture" summary.
@@ -479,7 +478,6 @@ def create_cuda_graphs(self, reset_context: bool = True):
                 format_mem_bytes(pool_alloc - prev_pool_alloc),
             )
             prev_pool_reserved, prev_pool_alloc = pool_reserved, pool_alloc
-            torch.cuda.reset_peak_memory_stats()
 
         if mtp_warmup_enabled and mtp_seen_batch_sizes:
             logging.info("> MTP CUDA graph warmup: %d batch size(s)", len(mtp_seen_batch_sizes))
diff --git a/megatron/inference/utils.py b/megatron/inference/utils.py
index 3f06eb8a301..60b6d9bb0c0 100644
--- a/megatron/inference/utils.py
+++ b/megatron/inference/utils.py
@@ -9,6 +9,7 @@
 from gpt_builders import gpt_builder
 from hybrid_builders import hybrid_builder
 from megatron.core.inference.config import (
+    CudaGraphSizingDistribution,
     InferenceConfig,
     KVCacheManagementMode,
     MambaInferenceStateConfig,
@@ -356,6 +357,9 @@ def get_inference_config_from_model_and_args(model: MegatronModule, args):
         unified_memory_level=args.inference_dynamic_batching_unified_memory_level,
         kv_cache_management_mode=KVCacheManagementMode(args.rl_kv_cache_management_mode),
         cuda_graph_mixed_prefill_count=args.inference_dynamic_batching_cuda_graph_mixed_prefill_count,  # pylint: disable=line-too-long
+        cuda_graph_sizing_distribution=CudaGraphSizingDistribution(
+            args.inference_dynamic_batching_cuda_graph_sizing_distribution
+        ),
         use_cuda_graphs_for_non_decode_steps=not args.decode_only_cuda_graphs,
         cuda_graph_all_prefills=args.inference_cuda_graph_all_prefills,
         static_kv_memory_pointers=args.rl_persist_cuda_graphs,
diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py
index f3c2ded6907..948f8091814 100644
--- a/megatron/training/arguments.py
+++ b/megatron/training/arguments.py
@@ -1963,6 +1963,14 @@ def _add_inference_args(parser):
     group.add_argument('--inference-dynamic-batching-cuda-graph-mixed-prefill-count',
                        type=int, default=16,
                        help='Number of mixed prefill requests to capture in a cuda graph.')
+    group.add_argument('--inference-dynamic-batching-cuda-graph-sizing-distribution',
+                       type=str, default='exponential',
+                       choices=['exponential', 'linear'],
+                       dest='inference_dynamic_batching_cuda_graph_sizing_distribution',
+                       help='Spacing of CUDA graph token counts. "exponential" (default) '
+                            'halves from cuda_graph_max_tokens down to tp_size, giving a '
+                            'log-spaced distribution with bounded relative padding. '
+                            '"linear" uses varying linear strides across the range.')
     group.add_argument('--inference-dynamic-batching-sampling-backend',
                        type=str, default='torch',
                        choices=['torch', 'flashinfer'],