Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/oncall_schedule.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
[
{
"user": "janEbert",
"date": "2026-05-06"
},
{
"user": "dimapihtar",
"date": "2026-05-13"
Expand Down Expand Up @@ -46,5 +42,9 @@
{
"user": "Phlip79",
"date": "2026-07-22"
},
{
"user": "YangFei1990",
"date": "2026-07-29"
}
]
2 changes: 2 additions & 0 deletions megatron/core/distributed/distributed_data_parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,8 @@ def __init__(
param_indices == layout.param_indices
), f"param_indices for {buffer_key} do not match between grouping and layout"

self.full_param_layout = full_param_layout

# Compute gradient scaling factors.
if config.calculate_per_token_loss:
assert (
Expand Down
15 changes: 13 additions & 2 deletions megatron/core/distributed/param_and_grad_buffer.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,16 @@ def __init__(
not self.ddp_config.reduce_scatter_with_fp32_accumulation
), "RS w/ FP32 accumulation not supported with num_distributed_optimizer_instances > 1"

reduction_collective = (
"reduce-scatter" if self.ddp_config.use_distributed_optimizer else "all-reduce"
)
log_single_rank(
logger,
logging.INFO,
f"Using {reduction_collective} for gradient reductions because "
f"{self.ddp_config.use_distributed_optimizer=}",
)

global dist_reduce_scatter_func
if self.ddp_config.reduce_scatter_with_fp32_accumulation:
dist_reduce_scatter_func = reduce_scatter_with_fp32_accumulation
Expand Down Expand Up @@ -322,8 +332,9 @@ def start_param_sync(self, force_sync: bool = False):
async_op = self.ddp_config.overlap_param_gather and not force_sync

if not self.ddp_config.use_distributed_optimizer:
# Layer-wise optimizer path: use all_gather for variable-size
# param gather.
# Legacy layer-wise optimizer path: use all_gather for variable-size
# param gather. Once all layer-wise call sites set
# ddp_config.use_distributed_optimizer=True, this branch can be removed.
#
# Each rank may own a different number of params per bucket, so
# layerwise_param_flat_sizes can vary across ranks. PyTorch's NCCL
Expand Down
10 changes: 5 additions & 5 deletions megatron/core/inference/batch_dimensions_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,8 @@ def _calculate_cuda_graph_token_counts(
)
# Make sure the step size is divisible by TP size
cuda_graph_step_size = round_up_to_nearest_multiple(cuda_graph_step_size, tp_size)
# Ensure non-zero step size (can happen when max_tokens < num_cuda_graphs).
cuda_graph_step_size = max(cuda_graph_step_size, tp_size)

# Round down cuda graph max tokens to be a multiple of TP size
cuda_graph_max_tokens = (cuda_graph_max_tokens // tp_size) * tp_size
Expand Down Expand Up @@ -367,11 +369,9 @@ def add_if_valid(token_count: int, prefill_req_count: int, decode_req_count: int
):
cuda_graph_max_tokens = max_tokens

assert cuda_graph_max_tokens == max_requests * (num_speculative_tokens + 1), (
f"cuda_graph_max_tokens ({cuda_graph_max_tokens}) must equal max_requests *"
f"(num_speculative_tokens + 1) ({max_requests * (num_speculative_tokens + 1)}). "
"This is required for correctly syncing EP ranks: "
f"prefill and decode graph pools must have the same token count granularity."
assert cuda_graph_max_tokens >= max_requests * (num_speculative_tokens + 1), (
f"cuda_graph_max_tokens ({cuda_graph_max_tokens}) must be >= max_requests * "
f"(num_speculative_tokens + 1) ({max_requests * (num_speculative_tokens + 1)})."
)

if num_cuda_graphs != -1:
Expand Down
16 changes: 14 additions & 2 deletions megatron/core/inference/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,8 +188,12 @@ class InferenceConfig:
# =================================
num_cuda_graphs: Optional[int] = None
"""
Maximum number of cuda graphs to capture, where the cuda graph batch sizes range from 1 to
`max_requests`. Due to rounding, the actual number of cuda graphs may not equal this argument.
Maximum number of cuda graphs to capture.
Graph token counts are spaced from 1 up to a per-graph-type budget:
- Decode-only graphs are always bounded by `max_requests * (num_speculative_tokens + 1)`.
- Prefill/mixed graphs share that same bound by default,
or extend up to `max_tokens` when `cuda_graph_all_prefills` is set.
Due to rounding, the actual number of cuda graphs may not equal this argument.
"""

cuda_graph_mixed_prefill_count: Optional[int] = 16
Expand All @@ -202,6 +206,14 @@ class InferenceConfig:
Whether to use CUDA graphs for non-decode steps.
"""

cuda_graph_all_prefills: bool = False
"""
Whether prefill/mixed CUDA graphs should span up to `max_tokens`.
When False (default), prefill/mixed graphs are bounded by the same token limit as decode graphs:
`max_requests * (num_speculative_tokens + 1)`.
When True, prefill/mixed graph capture is extended to cover the full `max_tokens` budget.
"""

static_kv_memory_pointers: bool = False
"""
Whether the KV cache (and Mamba states) will reside at the same memory addresses
Expand Down
11 changes: 10 additions & 1 deletion megatron/core/inference/contexts/dynamic_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -623,12 +623,21 @@ def __init__(self, model_config: TransformerConfig, inference_config: InferenceC
and not (force_disable_non_decode_cuda_graphs)
)

# CUDA graph token budget for prefill/mixed graphs. Decode graphs are always
# capped at max_requests * (num_speculative_tokens + 1) inside the helper; this
# only widens the prefill/mixed range when `cuda_graph_all_prefills` is set.
cuda_graph_max_tokens = (
self.max_tokens
if inference_config.cuda_graph_all_prefills
else self.max_requests * (self.num_speculative_tokens + 1)
)

# CUDA graph config list.
self.cuda_graph_batch_dimensions_list, self.cuda_graph_token_counts = (
CUDAGraphBatchDimensionBuilder.generate_cuda_graph_batch_dimensions_list(
tp_size=tp_size,
num_cuda_graphs=inference_config.num_cuda_graphs,
cuda_graph_max_tokens=self.max_requests * (self.num_speculative_tokens + 1),
cuda_graph_max_tokens=cuda_graph_max_tokens,
cuda_graph_mixed_prefill_request_count=inference_config.cuda_graph_mixed_prefill_count,
max_requests=self.max_requests,
max_tokens=self.max_tokens,
Expand Down
Loading