yashaswikarnati · yashaswikarnati · May 12, 2026 · May 13, 2026 · May 13, 2026 · May 13, 2026
diff --git a/.github/oncall_schedule.json b/.github/oncall_schedule.json
@@ -1,8 +1,4 @@
 [
-    {
-        "user": "janEbert",
-        "date": "2026-05-06"
-    },
     {
         "user": "dimapihtar",
         "date": "2026-05-13"
@@ -46,5 +42,9 @@
     {
         "user": "Phlip79",
         "date": "2026-07-22"
+    },
+    {
+        "user": "YangFei1990",
+        "date": "2026-07-29"
     }
 ]
diff --git a/megatron/core/distributed/distributed_data_parallel.py b/megatron/core/distributed/distributed_data_parallel.py
@@ -165,6 +165,8 @@ def __init__(
                     param_indices == layout.param_indices
                 ), f"param_indices for {buffer_key} do not match between grouping and layout"
 
+        self.full_param_layout = full_param_layout
+
         # Compute gradient scaling factors.
         if config.calculate_per_token_loss:
             assert (

diff --git a/megatron/core/distributed/param_and_grad_buffer.py b/megatron/core/distributed/param_and_grad_buffer.py
@@ -208,6 +208,16 @@ def __init__(
                 not self.ddp_config.reduce_scatter_with_fp32_accumulation
             ), "RS w/ FP32 accumulation not supported with num_distributed_optimizer_instances > 1"
 
+        reduction_collective = (
+            "reduce-scatter" if self.ddp_config.use_distributed_optimizer else "all-reduce"
+        )
+        log_single_rank(
+            logger,
+            logging.INFO,
+            f"Using {reduction_collective} for gradient reductions because "
+            f"{self.ddp_config.use_distributed_optimizer=}",
+        )
+
         global dist_reduce_scatter_func
         if self.ddp_config.reduce_scatter_with_fp32_accumulation:
             dist_reduce_scatter_func = reduce_scatter_with_fp32_accumulation
@@ -322,8 +332,9 @@ def start_param_sync(self, force_sync: bool = False):
         async_op = self.ddp_config.overlap_param_gather and not force_sync
 
         if not self.ddp_config.use_distributed_optimizer:
-            # Layer-wise optimizer path: use all_gather for variable-size
-            # param gather.
+            # Legacy layer-wise optimizer path: use all_gather for variable-size
+            # param gather.  Once all layerwise call sites set
+            # ddp_config.use_distributed_optimizer=True, this branch can be removed.
             #
             # Each rank may own a different number of params per bucket, so
             # layerwise_param_flat_sizes can vary across ranks.  PyTorch's NCCL

diff --git a/megatron/core/inference/batch_dimensions_utils.py b/megatron/core/inference/batch_dimensions_utils.py
@@ -271,6 +271,8 @@ def _calculate_cuda_graph_token_counts(
         )
         # Make sure divisible by TP size
         cuda_graph_step_size = round_up_to_nearest_multiple(cuda_graph_step_size, tp_size)
+        # Clamp to >= tp_size; step can be zero when max_tokens < num_cuda_graphs.
+        cuda_graph_step_size = max(cuda_graph_step_size, tp_size)
 
         # round down cuda graph max tokens to be multiple of TP size
         cuda_graph_max_tokens = (cuda_graph_max_tokens // tp_size) * tp_size
@@ -367,11 +369,9 @@ def add_if_valid(token_count: int, prefill_req_count: int, decode_req_count: int
             ):
                 cuda_graph_max_tokens = max_tokens
 
-            assert cuda_graph_max_tokens == max_requests * (num_speculative_tokens + 1), (
-                f"cuda_graph_max_tokens ({cuda_graph_max_tokens}) must equal max_requests *"
-                f"(num_speculative_tokens + 1) ({max_requests * (num_speculative_tokens + 1)}). "
-                "This is required for correctly syncing EP ranks: "
-                f"prefill and decode graph pools must have the same token count granularity."
+            assert cuda_graph_max_tokens >= max_requests * (num_speculative_tokens + 1), (
+                f"cuda_graph_max_tokens ({cuda_graph_max_tokens}) must be >= max_requests * "
+                f"(num_speculative_tokens + 1) ({max_requests * (num_speculative_tokens + 1)})."
             )
 
             if num_cuda_graphs != -1:

diff --git a/megatron/core/inference/config.py b/megatron/core/inference/config.py
@@ -188,8 +188,12 @@ class InferenceConfig:
     # =================================
     num_cuda_graphs: Optional[int] = None
     """
-    Maximum number of cuda graphs to capture, where the cuda graph batch sizes range from 1 to
-    `max_requests`. Due to rounding, the actual number of cuda graphs may not equal this argument.
+    Maximum number of cuda graphs to capture.
+    Graph token counts are spaced from 1 up to a per-graph-type budget:
+      - Decode-only graphs are always bounded by `max_requests * (num_speculative_tokens + 1)`.
+      - Prefill/mixed graphs share that same bound by default,
+        or extend up to `max_tokens` when `cuda_graph_all_prefills` is set.
+    Due to rounding, the actual number of cuda graphs may not equal this argument.
     """
 
     cuda_graph_mixed_prefill_count: Optional[int] = 16
@@ -202,6 +206,14 @@ class InferenceConfig:
     Whether to use CUDA graphs for non-decode steps.
     """
 
+    cuda_graph_all_prefills: bool = False
+    """
+    Whether prefill/mixed CUDA graphs should span up to `max_tokens`.
+    When False (default), prefill/mixed graphs are bounded by the same token limit as decode graphs:
+    `max_requests * (num_speculative_tokens + 1)`.
+    When True, prefill/mixed graph capture is extended to cover the full `max_tokens` budget.
+    """
+
     static_kv_memory_pointers: bool = False
     """
     Whether the KV cache (and Mamba states) will reside at the same memory addresses

diff --git a/megatron/core/inference/contexts/dynamic_context.py b/megatron/core/inference/contexts/dynamic_context.py
@@ -623,12 +623,21 @@ def __init__(self, model_config: TransformerConfig, inference_config: InferenceC
             and not (force_disable_non_decode_cuda_graphs)
         )
 
+        # CUDA graph token budget for prefill/mixed graphs. Decode graphs are always
+        # capped at max_requests * (num_speculative_tokens + 1) inside the helper; this
+        # only widens the prefill/mixed range when `cuda_graph_all_prefills` is set.
+        cuda_graph_max_tokens = (
+            self.max_tokens
+            if inference_config.cuda_graph_all_prefills
+            else self.max_requests * (self.num_speculative_tokens + 1)
+        )
+
         # CUDA graph config list.
         self.cuda_graph_batch_dimensions_list, self.cuda_graph_token_counts = (
             CUDAGraphBatchDimensionBuilder.generate_cuda_graph_batch_dimensions_list(
                 tp_size=tp_size,
                 num_cuda_graphs=inference_config.num_cuda_graphs,
-                cuda_graph_max_tokens=self.max_requests * (self.num_speculative_tokens + 1),
+                cuda_graph_max_tokens=cuda_graph_max_tokens,
                 cuda_graph_mixed_prefill_request_count=inference_config.cuda_graph_mixed_prefill_count,
                 max_requests=self.max_requests,
                 max_tokens=self.max_tokens,