From 4cd9563a20f21814dd6af5766fb98486493c5ba9 Mon Sep 17 00:00:00 2001 From: HaochenYuan <106647990+HaochenYuan@users.noreply.github.com> Date: Fri, 30 Jan 2026 16:54:13 +0800 Subject: [PATCH 001/231] Fix for PR-2142 (#3165) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Philip Petrakian Co-authored-by: oliver könig --- megatron/core/transformer/moe/moe_layer.py | 3 +- .../transformer/moe/test_moe_layer.py | 120 ++++++++++++++++++ 2 files changed, 122 insertions(+), 1 deletion(-) diff --git a/megatron/core/transformer/moe/moe_layer.py b/megatron/core/transformer/moe/moe_layer.py index 98d4d5fa505..990d13b98d9 100644 --- a/megatron/core/transformer/moe/moe_layer.py +++ b/megatron/core/transformer/moe/moe_layer.py @@ -437,11 +437,12 @@ def custom_forward(hidden_states, intermediate_tensors, padding_mask=None): tensor_parallel.random.get_cuda_rng_tracker, parallel_state.get_tensor_model_parallel_group(), hidden_states, + intermediate_tensors, padding_mask, ) else: outputs = tensor_parallel.checkpoint( - custom_forward, False, hidden_states, padding_mask + custom_forward, False, hidden_states, intermediate_tensors, padding_mask ) else: outputs = custom_forward(hidden_states, intermediate_tensors, padding_mask) diff --git a/tests/unit_tests/transformer/moe/test_moe_layer.py b/tests/unit_tests/transformer/moe/test_moe_layer.py index 2a2c995257e..11bd09f8449 100644 --- a/tests/unit_tests/transformer/moe/test_moe_layer.py +++ b/tests/unit_tests/transformer/moe/test_moe_layer.py @@ -276,3 +276,123 @@ def test_moe_layer_fp16_forward_backward( def teardown_method(self, method): Utils.destroy_model_parallel() + + +class TestMoELayerRecompute: + """Test MoE layer with recompute enabled (activation checkpointing). 
+ + Tests both code paths: + - fp8=False: uses tensor_parallel.checkpoint + - fp8=True: uses te_checkpoint (requires TE >= 1.7.0) + """ + + def setup_method(self, method): + pass + + @pytest.mark.parametrize("moe_token_dispatcher_type", ["allgather", "alltoall"]) + @pytest.mark.parametrize("num_moe_experts", [2, 4]) + @pytest.mark.parametrize("with_padding_mask", [True, False]) + @pytest.mark.parametrize("tp_size,ep_size", [(1, 1), (4, 2)]) + @pytest.mark.parametrize("fp8", [False, True]) + def test_moe_layer_recompute_forward_backward( + self, num_moe_experts, moe_token_dispatcher_type, with_padding_mask, tp_size, ep_size, fp8 + ): + """Test MoE layer forward and backward pass with recompute enabled. + + When fp8=False, uses tensor_parallel.checkpoint. + When fp8=True, uses te_checkpoint (requires TE >= 1.7.0). + """ + # Skip fp8 tests if TE version is not sufficient + if fp8 and not is_te_min_version("1.7.0.dev0"): + pytest.skip("FP8 MoE recompute requires TE 1.7.0 and later.") + + Utils.initialize_model_parallel( + tensor_model_parallel_size=tp_size, expert_model_parallel_size=ep_size + ) + _set_random_seed(seed_=123, data_parallel_random_init=False) + + hidden_size = 64 + sequence_length = 32 + micro_batch_size = 2 + + transformer_config = TransformerConfig( + num_layers=1, + hidden_size=hidden_size, + num_attention_heads=4, + num_moe_experts=num_moe_experts, + use_cpu_initialization=False, + moe_token_dispatcher_type=moe_token_dispatcher_type, + moe_router_load_balancing_type="aux_loss", + moe_router_topk=2, + moe_aux_loss_coeff=0.01, + moe_grouped_gemm=False, + moe_ffn_hidden_size=256, + add_bias_linear=False, + # Enable recompute for MoE layer + recompute_granularity="selective", + recompute_modules=["moe"], + tensor_model_parallel_size=tp_size, + expert_model_parallel_size=ep_size, + sequence_parallel=tp_size > 1, + fp8=fp8, + bf16=True, + params_dtype=torch.bfloat16, + ) + + # Use TE spec for fp8, local spec otherwise + if fp8: + transformer_layer_spec = 
get_gpt_layer_with_transformer_engine_spec( + num_experts=num_moe_experts, moe_grouped_gemm=False + ) + else: + transformer_layer_spec = get_gpt_layer_local_spec( + num_experts=num_moe_experts, moe_grouped_gemm=False + ) + + moe_layer = MoELayer( + transformer_config, transformer_layer_spec.submodules.mlp.submodules + ).cuda() + + hidden_states = torch.randn( + sequence_length, + micro_batch_size, + hidden_size, + device=torch.cuda.current_device(), + dtype=torch.bfloat16, + requires_grad=True, + ) + + # Create padding mask if needed: shape [batch_size, sequence_length] + padding_mask = None + if with_padding_mask: + padding_mask = torch.ones( + micro_batch_size, + sequence_length, + device=torch.cuda.current_device(), + dtype=torch.bool, + ) + # Mark last 4 tokens as padding for each batch + padding_mask[:, -4:] = False + + output, _ = moe_layer(hidden_states, padding_mask=padding_mask) + + assert output.dtype == torch.bfloat16, f"Expected bf16 output, got {output.dtype}" + assert output.shape == hidden_states.shape, f"Output shape mismatch" + + # Backward pass - this is where recompute/checkpoint is actually used + loss = output.sum() + loss.backward() + + assert hidden_states.grad is not None, "Input gradients should exist" + assert ( + hidden_states.grad.dtype == torch.bfloat16 + ), f"Expected bf16 gradients, got {hidden_states.grad.dtype}" + + for name, param in moe_layer.named_parameters(): + if param.requires_grad: + assert param.grad is not None, f"Gradient for {name} should exist" + + Utils.destroy_model_parallel() + + def teardown_method(self, method): + Utils.destroy_model_parallel() From 6de6362904806ed925122b97de6a9bf0e8e3c611 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 30 Jan 2026 10:46:07 +0100 Subject: [PATCH 002/231] ci: Onboard more GB200 tests (#3145) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/cicd-main.yml | 2 +- 
.../golden_values_dev_dgx_gb200.json | 24 +- .../golden_values_dev_dgx_gb200.json | 24 +- .../golden_values_dev_dgx_gb200.json | 100 +- .../golden_values_dev_dgx_gb200.json | 100 +- .../golden_values_dev_dgx_gb200.json | 200 ++-- .../golden_values_dev_dgx_gb200_2nd.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 200 ++-- .../golden_values_dev_dgx_gb200_2nd.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 198 ++-- .../golden_values_dev_dgx_gb200_2nd.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 200 ++-- .../golden_values_dev_dgx_gb200_2nd.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 200 ++-- .../golden_values_dev_dgx_gb200_2nd.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 200 ++-- .../golden_values_dev_dgx_gb200_2nd.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 200 ++-- .../golden_values_dev_dgx_gb200_2nd.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 200 ++-- .../golden_values_dev_dgx_gb200_2nd.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 100 +- .../golden_values_dev_dgx_gb200.json | 298 +++--- .../golden_values_dev_dgx_gb200.json | 200 ++-- .../golden_values_dev_dgx_gb200.json | 200 ++-- .../golden_values_dev_dgx_gb200_2nd.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 204 ++-- .../golden_values_dev_dgx_gb200_2nd.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 100 +- .../golden_values_dev_dgx_gb200.json | 100 +- .../golden_values_dev_dgx_gb200.json | 98 +- .../golden_values_dev_dgx_gb200.json | 962 ++++++++--------- .../golden_values_dev_dgx_gb200_2nd.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 200 ++-- .../golden_values_dev_dgx_gb200_2nd.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 300 +++--- .../golden_values_dev_dgx_gb200.json | 300 +++--- .../golden_values_dev_dgx_gb200.json | 484 ++++----- .../golden_values_dev_dgx_gb200.json | 300 +++--- .../golden_values_dev_dgx_gb200.json | 482 
++++----- .../golden_values_dev_dgx_gb200.json | 300 +++--- .../golden_values_dev_dgx_gb200.json | 480 ++++----- .../golden_values_dev_dgx_gb200.json | 484 ++++----- .../golden_values_dev_dgx_gb200.json | 484 ++++----- .../golden_values_dev_dgx_gb200.json | 100 +- .../golden_values_dev_dgx_gb200_2nd.json | 287 ++++++ .../golden_values_dev_dgx_gb200.json | 200 ++-- .../golden_values_dev_dgx_gb200_2nd.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 974 +++++++++--------- .../golden_values_dev_dgx_gb200_2nd.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 964 ++++++++--------- .../golden_values_dev_dgx_gb200_2nd.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 200 ++-- .../golden_values_dev_dgx_gb200_2nd.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 200 ++-- .../golden_values_dev_dgx_gb200_2nd.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 960 ++++++++--------- .../golden_values_dev_dgx_gb200_2nd.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 600 +++++------ .../golden_values_dev_dgx_gb200.json | 100 +- .../golden_values_dev_dgx_gb200.json | 200 ++-- .../golden_values_dev_dgx_gb200_2nd.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 200 ++-- .../golden_values_dev_dgx_gb200_2nd.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 200 ++-- .../golden_values_dev_dgx_gb200_2nd.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 768 +++++++------- .../golden_values_dev_dgx_gb200_2nd.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 200 ++-- .../golden_values_dev_dgx_gb200_2nd.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 100 +- .../golden_values_dev_dgx_gb200.json | 200 ++-- .../golden_values_dev_dgx_gb200_2nd.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 100 +- .../golden_values_dev_dgx_gb200.json | 200 ++-- .../golden_values_dev_dgx_gb200_2nd.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 200 ++-- 
.../golden_values_dev_dgx_gb200_2nd.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 200 ++-- .../golden_values_dev_dgx_gb200_2nd.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 287 ++++++ .../golden_values_dev_dgx_gb200.json | 287 ++++++ .../golden_values_dev_dgx_gb200.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 287 ++++++ .../golden_values_dev_dgx_gb200.json | 287 ++++++ .../golden_values_dev_dgx_gb200.json | 287 ++++++ .../golden_values_dev_dgx_gb200.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 287 ++++++ .../golden_values_dev_dgx_gb200.json | 287 ++++++ .../golden_values_dev_dgx_gb200.json | 287 ++++++ .../golden_values_dev_dgx_gb200.json | 287 ++++++ .../golden_values_dev_dgx_gb200.json | 287 ++++++ .../golden_values_dev_dgx_gb200.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 344 +++++++ .../golden_values_dev_dgx_gb200.json | 287 ++++++ .../golden_values_dev_dgx_gb200.json | 537 ++++++++++ .../golden_values_dev_dgx_gb200.json | 644 ++++++++++++ .../python_scripts/recipe_parser.py | 28 +- .../{gpt-gb200.yaml => gb200/gpt.yaml} | 207 ++-- tests/test_utils/recipes/gb200/moe.yaml | 220 ++++ .../test_utils/recipes/gb200/unit-tests.yaml | 153 +++ tests/test_utils/recipes/{ => h100}/bert.yaml | 0 .../recipes/{ => h100}/ckpt_converter.yaml | 0 .../gpt-dynamic-inference-cuda-graphs.yaml | 0 ...pt-dynamic-inference-with-coordinator.yaml | 0 .../{ => h100}/gpt-dynamic-inference.yaml | 0 .../recipes/{ => h100}/gpt-grads.yaml | 0 .../recipes/{ => h100}/gpt-grpo.yaml | 0 .../recipes/{ => h100}/gpt-nemo.yaml | 0 .../{ => h100}/gpt-static-inference.yaml | 0 tests/test_utils/recipes/{ => h100}/gpt.yaml | 0 .../{ => h100}/mamba-dynamic-inference.yaml | 0 .../{ => 
h100}/mamba-static-inference.yaml | 0 .../test_utils/recipes/{ => h100}/mamba.yaml | 0 tests/test_utils/recipes/{ => h100}/mimo.yaml | 0 .../{ => h100}/module_performance.yaml | 0 ...oe-dynamic-inference-with-coordinator.yaml | 0 .../{ => h100}/moe-dynamic-inference.yaml | 0 .../recipes/{ => h100}/moe-grpo.yaml | 0 .../{ => h100}/moe-static-inference.yaml | 0 tests/test_utils/recipes/{ => h100}/moe.yaml | 0 .../recipes/{ => h100}/multimodal-llava.yaml | 0 tests/test_utils/recipes/{ => h100}/t5.yaml | 0 .../recipes/{ => h100}/unit-tests.yaml | 0 tests/unit_tests/find_test_cases.py | 3 +- tests/unit_tests/run_ci_test.sh | 2 +- 130 files changed, 31154 insertions(+), 7476 deletions(-) create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_gb200_2nd.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_dev_dgx_gb200_2nd.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/golden_values_dev_dgx_gb200_2nd.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_dev_dgx_gb200_2nd.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/golden_values_dev_dgx_gb200_2nd.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_dev_dgx_gb200_2nd.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/golden_values_dev_dgx_gb200_2nd.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_dev_dgx_gb200_2nd.json create mode 100644 
tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_dev_dgx_gb200_2nd.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_gb200_2nd.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_gb200_2nd.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_dev_dgx_gb200_2nd.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mla/golden_values_dev_dgx_gb200_2nd.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_gb200_2nd.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_gb200_2nd.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_dev_dgx_gb200_2nd.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_dev_dgx_gb200_2nd.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_dev_dgx_gb200_2nd.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_dev_dgx_gb200_2nd.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_gb200_2nd.json create mode 100644 
tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200_2nd.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_dev_dgx_gb200_2nd.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgx_gb200_2nd.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200_2nd.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgx_gb200_2nd.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgx_gb200_2nd.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_dev_dgx_gb200_2nd.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_gb200_2nd.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgx_gb200.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgx_gb200.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_gb200.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_gb200.json create mode 100644 
tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances/golden_values_dev_dgx_gb200.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/golden_values_dev_dgx_gb200.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/golden_values_dev_dgx_gb200.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4/golden_values_dev_dgx_gb200.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_gb200.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_gb200.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgx_gb200.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgx_gb200.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgx_gb200.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_gb200.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_muon/golden_values_dev_dgx_gb200.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_gb200.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_muon/golden_values_dev_dgx_gb200.json create mode 100644 
tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgx_gb200.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_gb200.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_gb200.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/golden_values_dev_dgx_gb200.json rename tests/test_utils/recipes/{gpt-gb200.yaml => gb200/gpt.yaml} (68%) create mode 100644 tests/test_utils/recipes/gb200/moe.yaml create mode 100644 tests/test_utils/recipes/gb200/unit-tests.yaml rename tests/test_utils/recipes/{ => h100}/bert.yaml (100%) rename tests/test_utils/recipes/{ => h100}/ckpt_converter.yaml (100%) rename tests/test_utils/recipes/{ => h100}/gpt-dynamic-inference-cuda-graphs.yaml (100%) rename tests/test_utils/recipes/{ => h100}/gpt-dynamic-inference-with-coordinator.yaml (100%) rename tests/test_utils/recipes/{ => h100}/gpt-dynamic-inference.yaml (100%) rename tests/test_utils/recipes/{ => h100}/gpt-grads.yaml (100%) rename tests/test_utils/recipes/{ => h100}/gpt-grpo.yaml (100%) rename tests/test_utils/recipes/{ => h100}/gpt-nemo.yaml (100%) rename tests/test_utils/recipes/{ => h100}/gpt-static-inference.yaml (100%) rename tests/test_utils/recipes/{ => h100}/gpt.yaml (100%) rename tests/test_utils/recipes/{ => h100}/mamba-dynamic-inference.yaml (100%) rename tests/test_utils/recipes/{ => h100}/mamba-static-inference.yaml (100%) rename tests/test_utils/recipes/{ => h100}/mamba.yaml (100%) rename tests/test_utils/recipes/{ => h100}/mimo.yaml (100%) rename tests/test_utils/recipes/{ => h100}/module_performance.yaml (100%) rename tests/test_utils/recipes/{ => h100}/moe-dynamic-inference-with-coordinator.yaml (100%) rename tests/test_utils/recipes/{ => 
h100}/moe-dynamic-inference.yaml (100%) rename tests/test_utils/recipes/{ => h100}/moe-grpo.yaml (100%) rename tests/test_utils/recipes/{ => h100}/moe-static-inference.yaml (100%) rename tests/test_utils/recipes/{ => h100}/moe.yaml (100%) rename tests/test_utils/recipes/{ => h100}/multimodal-llava.yaml (100%) rename tests/test_utils/recipes/{ => h100}/t5.yaml (100%) rename tests/test_utils/recipes/{ => h100}/unit-tests.yaml (100%) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index ad26a5ba0f6..16e2051e4e2 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -345,7 +345,7 @@ jobs: - name: Parse unit tests id: parse-unit-tests run: | - cat tests/test_utils/recipes/unit-tests.yaml | yq -o json '[.products[].test_case[] | { "bucket": .}] | sort_by(.model, .test_case)' | jq -c > unit-tests.json + cat tests/test_utils/recipes/h100/unit-tests.yaml | yq -o json '[.products[].test_case[] | { "bucket": .}] | sort_by(.model, .test_case)' | jq -c > unit-tests.json echo "unit-tests=$(cat unit-tests.json)" | tee -a $GITHUB_OUTPUT cicd-unit-tests-latest: diff --git a/tests/functional_tests/test_cases/gpt/gpt3_7b_tp1_pp4_memory_speed/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_7b_tp1_pp4_memory_speed/golden_values_dev_dgx_gb200.json index 4770792474b..0174aaf4684 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_7b_tp1_pp4_memory_speed/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_7b_tp1_pp4_memory_speed/golden_values_dev_dgx_gb200.json @@ -133,29 +133,29 @@ "step_interval": 1, "values": { "1": "nan", - "2": 5.8109, + "2": 3.93489, "3": "nan", - "4": 0.8316, + "4": 0.83885, "5": "nan", - "6": 0.83072, + "6": 0.86101, "7": "nan", - "8": 0.82637, + "8": 0.82617, "9": "nan", - "10": 0.823, + "10": 0.8264, "11": "nan", - "12": 0.82386, + "12": 0.82456, "13": "nan", - "14": 0.82343, + "14": 0.82414, "15": "nan", - "16": 0.82487, + "16": 0.82604, 
"17": "nan", - "18": 0.82227, + "18": 0.83002, "19": "nan", - "20": 0.82121, + "20": 0.8234, "21": "nan", - "22": 0.82248, + "22": 0.82298, "23": "nan", - "24": 0.81939, + "24": 0.82311, "25": "nan" } } diff --git a/tests/functional_tests/test_cases/gpt/gpt3_7b_tp4_pp1_memory_speed/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_7b_tp4_pp1_memory_speed/golden_values_dev_dgx_gb200.json index 5c13c9d624f..ca51cd1bcb3 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_7b_tp4_pp1_memory_speed/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_7b_tp4_pp1_memory_speed/golden_values_dev_dgx_gb200.json @@ -133,29 +133,29 @@ "step_interval": 1, "values": { "1": "nan", - "2": 3.75605, + "2": 2.88247, "3": "nan", - "4": 1.05448, + "4": 0.98359, "5": "nan", - "6": 1.24087, + "6": 0.91373, "7": "nan", - "8": 0.89299, + "8": 1.07044, "9": "nan", - "10": 0.89376, + "10": 0.91309, "11": "nan", - "12": 0.8965, + "12": 0.91579, "13": "nan", - "14": 0.89831, + "14": 0.90609, "15": "nan", - "16": 0.89733, + "16": 0.90906, "17": "nan", - "18": 1.02538, + "18": 0.91134, "19": "nan", - "20": 0.89305, + "20": 0.90623, "21": "nan", - "22": 0.89255, + "22": 0.91236, "23": "nan", - "24": 0.91075, + "24": 0.9145, "25": "nan" } } diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset/golden_values_dev_dgx_gb200.json index 7650494228d..bac18297ae6 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset/golden_values_dev_dgx_gb200.json @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 5.51862, - "2": 0.11775, - "3": 0.11561, - "4": 0.1042, - "5": 0.10208, - "6": 0.09598, - 
"7": 0.09542, - "8": 0.095, - "9": 0.09404, - "10": 0.09596, - "11": 0.09825, - "12": 0.09507, - "13": 0.0943, - "14": 0.09595, - "15": 0.09454, - "16": 0.09354, - "17": 0.09423, - "18": 0.09638, - "19": 0.09698, - "20": 0.09656, - "21": 0.09629, - "22": 0.09731, - "23": 0.09913, - "24": 0.09535, - "25": 0.09314, - "26": 0.09324, - "27": 0.09374, - "28": 0.0992, - "29": 0.09647, - "30": 0.11416, - "31": 0.09524, - "32": 0.09418, - "33": 0.09544, - "34": 0.09428, - "35": 0.09432, - "36": 0.09584, - "37": 0.096, - "38": 0.09539, - "39": 0.09482, - "40": 0.09568, - "41": 0.09682, - "42": 0.0964, - "43": 0.09675, - "44": 0.09583, - "45": 0.09482, - "46": 0.09426, - "47": 0.09537, - "48": 0.09383, - "49": 0.09397, - "50": 0.09592 + "1": "nan", + "2": 2.46491, + "3": 0.11014, + "4": 0.09722, + "5": 0.09703, + "6": 0.09705, + "7": 0.09593, + "8": 0.09584, + "9": 0.09505, + "10": 0.0949, + "11": 0.09504, + "12": 0.09589, + "13": 0.09506, + "14": 0.09425, + "15": 0.09404, + "16": 0.09465, + "17": 0.09237, + "18": 0.09201, + "19": 0.09159, + "20": 0.09124, + "21": 0.09092, + "22": 0.09028, + "23": 0.08966, + "24": 0.08893, + "25": 0.09042, + "26": 0.09055, + "27": 0.08889, + "28": 0.08857, + "29": 0.0884, + "30": 0.08807, + "31": 0.08777, + "32": 0.08747, + "33": 0.0876, + "34": 0.08733, + "35": 0.0886, + "36": 0.08828, + "37": 0.08789, + "38": 0.08768, + "39": 0.08819, + "40": 0.08922, + "41": 0.08797, + "42": 0.0876, + "43": 0.0868, + "44": 0.08693, + "45": 0.08661, + "46": 0.08657, + "47": 0.08769, + "48": 0.08644, + "49": 0.08681, + "50": 0.08702 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/golden_values_dev_dgx_gb200.json index 0405b9dc312..8bcd3aa91d5 100644 --- 
a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/golden_values_dev_dgx_gb200.json @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 5.87663, - "2": 0.11967, - "3": 0.10376, - "4": 0.09966, - "5": 0.0967, - "6": 0.09666, - "7": 0.09702, - "8": 0.09962, - "9": 0.10053, - "10": 0.10019, - "11": 0.09818, - "12": 0.37487, - "13": 0.10166, - "14": 0.10015, - "15": 0.10189, - "16": 0.09883, - "17": 0.10229, - "18": 0.09859, - "19": 0.09957, - "20": 0.09987, - "21": 0.09747, - "22": 0.09678, - "23": 0.09865, - "24": 0.09988, - "25": 0.11712, - "26": 0.11559, - "27": 0.11626, - "28": 0.11634, - "29": 0.11701, - "30": 0.13544, - "31": 0.13258, - "32": 0.12643, - "33": 0.12858, - "34": 0.18682, - "35": 0.12702, - "36": 0.09639, - "37": 0.09478, - "38": 0.09349, - "39": 0.09417, - "40": 0.09272, - "41": 0.09563, - "42": 0.09369, - "43": 0.09427, - "44": 0.09501, - "45": 0.09141, - "46": 0.09367, - "47": 0.0929, - "48": 0.09322, - "49": 0.09223, - "50": 0.0936 + "1": "nan", + "2": 2.34829, + "3": 0.10511, + "4": 0.09797, + "5": 0.09705, + "6": 0.09665, + "7": 0.09616, + "8": 0.09616, + "9": 0.0968, + "10": 0.09708, + "11": 0.09598, + "12": 0.09533, + "13": 0.09549, + "14": 0.09665, + "15": 0.09303, + "16": 0.0963, + "17": 0.10058, + "18": 0.09955, + "19": 0.10067, + "20": 0.10221, + "21": 0.09941, + "22": 0.09872, + "23": 0.09975, + "24": 0.10322, + "25": 0.09837, + "26": 0.09834, + "27": 0.09843, + "28": 0.09692, + "29": 0.09907, + "30": 0.09889, + "31": 0.10064, + "32": 0.09748, + "33": 0.09927, + "34": 0.09831, + "35": 0.09862, + "36": 0.09852, + "37": 0.09869, + "38": 0.09941, + "39": 0.09945, + "40": 0.10014, + "41": 0.09934, + "42": 0.10081, + "43": 0.10148, + "44": 0.09766, + "45": 0.09746, + "46": 0.09842, + "47": 0.09924, + "48": 0.09864, + "49": 0.09829, + "50": 
0.09685 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_gb200.json index b280d123468..eb5a06ac1fc 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_gb200.json @@ -432,106 +432,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 5.16302, - "2": 0.10393, - "3": 0.10318, - "4": 0.08757, - "5": 0.08719, - "6": 0.08686, - "7": 0.08532, - "8": 0.0858, - "9": 0.08669, - "10": 0.08615, - "11": 0.08684, - "12": 0.08786, - "13": 0.15333, - "14": 0.08821, - "15": 0.18235, - "16": 0.08981, - "17": 0.08651, - "18": 0.0876, - "19": 0.08798, - "20": 0.08911, - "21": 0.08738, - "22": 0.08768, - "23": 0.08719, - "24": 0.087, - "25": 0.08861, - "26": 0.08768, - "27": 0.08826, - "28": 0.08976, - "29": 0.0886, - "30": 0.08951, - "31": 0.08933, - "32": 0.08963, - "33": 0.09543, - "34": 0.10061, - "35": 0.10664, - "36": 0.09906, - "37": 0.11365, - "38": 0.82081, - "39": 0.08864, - "40": 0.08743, - "41": 0.08722, - "42": 0.08656, - "43": 0.09145, - "44": 0.08801, - "45": 0.17031, - "46": 0.0894, - "47": 0.08943, - "48": 0.08707, - "49": 0.08683, - "50": 0.08738, - "51": 0.11089, - "52": 0.08833, - "53": 0.08713, - "54": 0.08847, - "55": 0.09031, - "56": 0.08636, - "57": 0.08753, - "58": 0.08716, - "59": 0.08699, - "60": 0.08807, - "61": 0.6943, - "62": 0.09219, - "63": 0.08631, - "64": 0.0882, - "65": 0.08874, - "66": 0.08909, - "67": 0.08792, - "68": 0.08836, - "69": 0.08825, - "70": 0.08851, - "71": 0.08764, - "72": 0.08728, - "73": 0.08806, - "74": 0.08749, - "75": 0.09031, - "76": 0.08768, - "77": 0.08844, - "78": 0.08914, - 
"79": 0.08957, - "80": 0.08909, - "81": 0.08925, - "82": 0.09031, - "83": 0.08817, - "84": 0.08786, - "85": 0.08912, - "86": 0.08785, - "87": 0.08907, - "88": 0.08837, - "89": 0.08812, - "90": 0.0872, - "91": 0.08931, - "92": 0.0876, - "93": 0.16836, - "94": 0.09054, - "95": 0.09081, - "96": 0.09078, - "97": 0.09068, - "98": 0.09042, - "99": 0.09008, - "100": 0.08863 + "1": "nan", + "2": 2.38769, + "3": 0.09884, + "4": 0.08514, + "5": 0.08435, + "6": 0.08412, + "7": 0.08558, + "8": 0.08425, + "9": 0.08436, + "10": 0.08457, + "11": 0.08469, + "12": 0.0848, + "13": 0.08487, + "14": 0.08571, + "15": 0.08487, + "16": 0.08529, + "17": 0.08559, + "18": 0.0898, + "19": 0.08482, + "20": 0.08509, + "21": 0.08527, + "22": 0.08597, + "23": 0.08592, + "24": 0.08654, + "25": 0.08608, + "26": 0.08574, + "27": 0.08542, + "28": 0.0856, + "29": 0.08581, + "30": 0.08539, + "31": 0.08675, + "32": 0.08679, + "33": 0.08699, + "34": 0.08599, + "35": 0.08568, + "36": 0.08528, + "37": 0.08527, + "38": 0.08526, + "39": 0.08614, + "40": 0.08507, + "41": 0.08552, + "42": 0.08586, + "43": 0.08568, + "44": 0.0866, + "45": 0.08692, + "46": 0.08614, + "47": 0.0859, + "48": 0.0863, + "49": 0.08723, + "50": 0.08703, + "51": 0.09195, + "52": 0.0775, + "53": 0.07822, + "54": 0.07813, + "55": 0.0784, + "56": 0.07871, + "57": 0.07816, + "58": 0.07787, + "59": 0.07958, + "60": 0.07893, + "61": 0.07873, + "62": 0.07887, + "63": 0.07945, + "64": 0.07879, + "65": 0.08059, + "66": 0.08041, + "67": 0.08127, + "68": 0.07996, + "69": 0.0799, + "70": 0.07821, + "71": 0.07712, + "72": 0.07745, + "73": 0.07774, + "74": 0.07859, + "75": 0.07741, + "76": 0.07753, + "77": 0.07725, + "78": 0.07676, + "79": 0.07838, + "80": 0.07786, + "81": 0.07743, + "82": 0.07732, + "83": 0.0773, + "84": 0.07664, + "85": 0.07753, + "86": 0.07826, + "87": 0.07764, + "88": 0.07681, + "89": 0.07911, + "90": 0.07799, + "91": 0.07796, + "92": 0.07853, + "93": 0.07736, + "94": 0.07777, + "95": 0.07791, + "96": 0.07723, + "97": 0.07753, + 
"98": 0.07789, + "99": 0.07782, + "100": 0.07733 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_gb200_2nd.json new file mode 100644 index 00000000000..5c26c56ee5a --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_gb200_2nd.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 9.83614, + "52": 9.72315, + "53": 10.05318, + "54": 9.93747, + "55": 9.87384, + "56": 9.60449, + "57": 9.4523, + "58": 9.8188, + "59": 9.5772, + "60": 9.48534, + "61": 9.68548, + "62": 9.97906, + "63": 9.36419, + "64": 9.76203, + "65": 8.94097, + "66": 9.69475, + "67": 9.36656, + "68": 9.77745, + "69": 9.79001, + "70": 9.72374, + "71": 9.62037, + "72": 9.57423, + "73": 9.48575, + "74": 8.92729, + "75": 9.41651, + "76": 9.07747, + "77": 10.05444, + "78": 9.71914, + "79": 9.37306, + "80": 9.40003, + "81": 9.47844, + "82": 9.69867, + "83": 9.31155, + "84": 9.41457, + "85": 
9.61163, + "86": 9.07418, + "87": 9.5939, + "88": 9.74928, + "89": 9.5985, + "90": 9.82761, + "91": 9.33631, + "92": 9.35805, + "93": 9.08552, + "94": 8.82786, + "95": 9.5303, + "96": 9.52663, + "97": 9.30483, + "98": 9.67007, + "99": 8.89606, + "100": 9.40702 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 2650.0, + "52": 2700.0, + "53": 2863.0, + "54": 2676.0, + "55": 2390.0, + "56": 2753.0, + "57": 2430.0, + "58": 2919.0, + "59": 2831.0, + "60": 2428.0, + "61": 2932.0, + "62": 2724.0, + "63": 2579.0, + "64": 2987.0, + "65": 2506.0, + "66": 2886.0, + "67": 2871.0, + "68": 2870.0, + "69": 3001.0, + "70": 3294.0, + "71": 3043.0, + "72": 2614.0, + "73": 3054.0, + "74": 2024.0, + "75": 2507.0, + "76": 3020.0, + "77": 3253.0, + "78": 3230.0, + "79": 3210.0, + "80": 3252.0, + "81": 3614.0, + "82": 3395.0, + "83": 2919.0, + "84": 3296.0, + "85": 3320.0, + "86": 2865.0, + "87": 3931.0, + "88": 3240.0, + "89": 3428.0, + "90": 3127.0, + "91": 2815.0, + "92": 3098.0, + "93": 2796.0, + "94": 3324.0, + "95": 3428.0, + "96": 3541.0, + "97": 3216.0, + "98": 3705.0, + "99": 3184.0, + "100": 3073.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + 
"values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 581489664.0, + "52": 581489664.0, + "53": 581489664.0, + "54": 581489664.0, + "55": 581489664.0, + "56": 581489664.0, + "57": 581489664.0, + "58": 581489664.0, + "59": 581489664.0, + "60": 581489664.0, + "61": 581489664.0, + "62": 581489664.0, + "63": 581489664.0, + "64": 581489664.0, + "65": 581489664.0, + "66": 581489664.0, + "67": 581489664.0, + "68": 581489664.0, + "69": 581489664.0, + "70": 581489664.0, + "71": 581489664.0, + "72": 581489664.0, + "73": 581489664.0, + "74": 581489664.0, + "75": 581489664.0, + "76": 581489664.0, + "77": 581489664.0, + "78": 581489664.0, + "79": 581489664.0, + "80": 581489664.0, + "81": 581489664.0, + "82": 581489664.0, + "83": 581489664.0, + "84": 581489664.0, + "85": 581489664.0, + "86": 581489664.0, + "87": 581489664.0, + "88": 581489664.0, + "89": 581489664.0, + "90": 581489664.0, + "91": 581489664.0, + "92": 581489664.0, + "93": 581489664.0, + "94": 581489664.0, + "95": 581489664.0, + "96": 581489664.0, + "97": 581489664.0, + "98": 581489664.0, + "99": 581489664.0, + "100": 581489664.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": 
"nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 2708568576.0, + "52": 2742124032.0, + "53": 2742124032.0, + "54": 2742124032.0, + "55": 2742124032.0, + "56": 2742124032.0, + "57": 2742124032.0, + "58": 2742124032.0, + "59": 2742124032.0, + "60": 2742124032.0, + "61": 2742124032.0, + "62": 2742124032.0, + "63": 2742124032.0, + "64": 2742124032.0, + "65": 2742124032.0, + "66": 2742124032.0, + "67": 2742124032.0, + "68": 2742124032.0, + "69": 2742124032.0, + "70": 2742124032.0, + "71": 2742124032.0, + "72": 2742124032.0, + "73": 2742124032.0, + "74": 2742124032.0, + "75": 2742124032.0, + "76": 2742124032.0, + "77": 2742124032.0, + "78": 2742124032.0, + "79": 2742124032.0, + "80": 2742124032.0, + "81": 2742124032.0, + "82": 2742124032.0, + "83": 2742124032.0, + "84": 2742124032.0, + "85": 2742124032.0, + "86": 2742124032.0, + "87": 2742124032.0, + "88": 2742124032.0, + "89": 2742124032.0, + "90": 2742124032.0, + "91": 2742124032.0, + "92": 2742124032.0, + "93": 2742124032.0, + "94": 2742124032.0, + "95": 2742124032.0, + "96": 2742124032.0, + "97": 2742124032.0, + "98": 2742124032.0, + "99": 2742124032.0, + "100": 2742124032.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + 
"10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": "nan", + "52": 2.1716, + "53": 0.09643, + "54": 0.08435, + "55": 0.08492, + "56": 0.08409, + "57": 0.08624, + "58": 0.08522, + "59": 0.08521, + "60": 0.08445, + "61": 0.08447, + "62": 0.08412, + "63": 0.08534, + "64": 0.08529, + "65": 0.08566, + "66": 0.08409, + "67": 0.08468, + "68": 0.08268, + "69": 0.08161, + "70": 0.08416, + "71": 0.08383, + "72": 0.08425, + "73": 0.08363, + "74": 0.08451, + "75": 0.08423, + "76": 0.08453, + "77": 0.08475, + "78": 0.08435, + "79": 0.0844, + "80": 0.08466, + "81": 0.08777, + "82": 0.08524, + "83": 0.08559, + "84": 0.08524, + "85": 0.08501, + "86": 0.08518, + "87": 0.08503, + "88": 0.08555, + "89": 0.0855, + "90": 0.08584, + "91": 0.08419, + "92": 0.08467, + "93": 0.08514, + "94": 0.08518, + "95": 0.08444, + "96": 0.08484, + "97": 0.08521, + "98": 0.08697, + "99": 0.08772, + "100": 0.08544 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_dev_dgx_gb200.json index ccf3054dcf0..a084bf35662 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_dev_dgx_gb200.json +++ 
b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_dev_dgx_gb200.json @@ -432,106 +432,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 4.35869, - "2": 0.14764, - "3": 0.13356, - "4": 0.12382, - "5": 0.1223, - "6": 0.1198, - "7": 0.12014, - "8": 0.12507, - "9": 0.12529, - "10": 0.13316, - "11": 0.13102, - "12": 0.13889, - "13": 0.13638, - "14": 0.14898, - "15": 0.16074, - "16": 0.14966, - "17": 0.17711, - "18": 0.17201, - "19": 0.14817, - "20": 0.14956, - "21": 0.17491, - "22": 0.29045, - "23": 0.49855, - "24": 0.12704, - "25": 0.12527, - "26": 0.12833, - "27": 0.12762, - "28": 0.12497, - "29": 0.1258, - "30": 0.12747, - "31": 0.1272, - "32": 0.12749, - "33": 0.12753, - "34": 0.12763, - "35": 0.12697, - "36": 0.12734, - "37": 0.12802, - "38": 0.12925, - "39": 0.1278, - "40": 0.1273, - "41": 0.1284, - "42": 0.12646, - "43": 0.12669, - "44": 0.12781, - "45": 0.12751, - "46": 0.12772, - "47": 0.12712, - "48": 0.12664, - "49": 0.12679, - "50": 0.13142, - "51": 0.13902, - "52": 0.12241, - "53": 0.12407, - "54": 0.12462, - "55": 0.1225, - "56": 0.12498, - "57": 0.12564, - "58": 0.12627, - "59": 0.12399, - "60": 0.12468, - "61": 0.12629, - "62": 0.12645, - "63": 0.12377, - "64": 0.12505, - "65": 0.1271, - "66": 0.12603, - "67": 0.12556, - "68": 0.12634, - "69": 0.1332, - "70": 0.13504, - "71": 0.13164, - "72": 0.13511, - "73": 0.14002, - "74": 0.14488, - "75": 0.14064, - "76": 0.14236, - "77": 0.14155, - "78": 0.14042, - "79": 0.14188, - "80": 0.14414, - "81": 0.14147, - "82": 0.14264, - "83": 0.14126, - "84": 0.1423, - "85": 0.14311, - "86": 0.144, - "87": 0.1445, - "88": 0.14401, - "89": 0.14198, - "90": 0.14227, - "91": 0.14119, - "92": 0.14076, - "93": 0.14281, - "94": 0.14283, - "95": 0.1438, - "96": 0.14188, - "97": 0.14623, - "98": 0.14374, - "99": 0.14361, - "100": 0.14481 + "1": "nan", + "2": 2.02256, + "3": 0.13455, + "4": 0.12293, + "5": 0.12302, + "6": 0.1233, + "7": 0.12328, + 
"8": 0.12248, + "9": 0.12446, + "10": 0.12285, + "11": 0.12255, + "12": 0.12296, + "13": 0.12411, + "14": 0.12369, + "15": 0.12438, + "16": 0.12387, + "17": 0.12481, + "18": 0.12591, + "19": 0.12445, + "20": 0.12257, + "21": 0.12141, + "22": 0.12289, + "23": 0.12296, + "24": 0.12246, + "25": 0.12246, + "26": 0.12219, + "27": 0.12283, + "28": 0.12209, + "29": 0.12164, + "30": 0.12236, + "31": 0.1236, + "32": 0.12251, + "33": 0.12372, + "34": 0.12054, + "35": 0.12166, + "36": 0.12052, + "37": 0.12268, + "38": 0.12181, + "39": 0.12231, + "40": 0.1195, + "41": 0.12001, + "42": 0.12145, + "43": 0.12238, + "44": 0.12054, + "45": 0.11842, + "46": 0.11812, + "47": 0.11785, + "48": 0.11631, + "49": 0.11798, + "50": 0.11707, + "51": 0.12234, + "52": 0.11424, + "53": 0.11577, + "54": 0.11058, + "55": 0.11651, + "56": 0.12356, + "57": 0.12837, + "58": 0.1238, + "59": 0.13093, + "60": 0.14556, + "61": 0.1747, + "62": 0.14486, + "63": 0.15679, + "64": 0.14116, + "65": 0.13574, + "66": 0.16023, + "67": 0.14862, + "68": 0.14163, + "69": 0.14244, + "70": 0.13512, + "71": 0.1407, + "72": 0.14689, + "73": 0.13238, + "74": 0.1279, + "75": 0.12535, + "76": 0.12172, + "77": 0.12314, + "78": 0.12089, + "79": 0.11925, + "80": 0.11854, + "81": 0.11618, + "82": 0.11706, + "83": 0.11632, + "84": 0.11839, + "85": 0.11763, + "86": 0.11977, + "87": 0.12013, + "88": 0.11954, + "89": 0.11859, + "90": 0.11546, + "91": 0.11494, + "92": 0.1154, + "93": 0.11866, + "94": 0.25826, + "95": 0.11359, + "96": 0.11427, + "97": 0.11526, + "98": 0.11269, + "99": 0.11239, + "100": 0.11374 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_dev_dgx_gb200_2nd.json new file mode 100644 index 00000000000..9853cec1655 --- /dev/null +++ 
b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_dev_dgx_gb200_2nd.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 9.83617, + "52": 9.72317, + "53": 10.05321, + "54": 9.93744, + "55": 9.87386, + "56": 9.60451, + "57": 9.45231, + "58": 9.81883, + "59": 9.57722, + "60": 9.48536, + "61": 9.68547, + "62": 9.97907, + "63": 9.36417, + "64": 9.76205, + "65": 8.94102, + "66": 9.69479, + "67": 9.36657, + "68": 9.77743, + "69": 9.78996, + "70": 9.72377, + "71": 9.62042, + "72": 9.57421, + "73": 9.48574, + "74": 8.92728, + "75": 9.41652, + "76": 9.07749, + "77": 10.05445, + "78": 9.71913, + "79": 9.37304, + "80": 9.40003, + "81": 9.47846, + "82": 9.69869, + "83": 9.31156, + "84": 9.41458, + "85": 9.61162, + "86": 9.07419, + "87": 9.59392, + "88": 9.74925, + "89": 9.59851, + "90": 9.82763, + "91": 9.33629, + "92": 9.35804, + "93": 9.08549, + "94": 8.8279, + "95": 9.53033, + "96": 9.52662, + "97": 9.30484, + "98": 9.67007, + "99": 8.89604, + "100": 9.407 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + 
"5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 2708.0, + "52": 2707.0, + "53": 2812.0, + "54": 2620.0, + "55": 2399.0, + "56": 2790.0, + "57": 2301.0, + "58": 3008.0, + "59": 2863.0, + "60": 2465.0, + "61": 2808.0, + "62": 2607.0, + "63": 2442.0, + "64": 2977.0, + "65": 2646.0, + "66": 3061.0, + "67": 2818.0, + "68": 2891.0, + "69": 3036.0, + "70": 3160.0, + "71": 3064.0, + "72": 2618.0, + "73": 2978.0, + "74": 2000.0, + "75": 2580.0, + "76": 2967.0, + "77": 3281.0, + "78": 3131.0, + "79": 3108.0, + "80": 3217.0, + "81": 3614.0, + "82": 3411.0, + "83": 2834.0, + "84": 3191.0, + "85": 3306.0, + "86": 2806.0, + "87": 3808.0, + "88": 3237.0, + "89": 3425.0, + "90": 3202.0, + "91": 2829.0, + "92": 3105.0, + "93": 2882.0, + "94": 3303.0, + "95": 3310.0, + "96": 3499.0, + "97": 3211.0, + "98": 3741.0, + "99": 3167.0, + "100": 3049.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": 
"nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 1261849088.0, + "52": 1261849088.0, + "53": 1261849088.0, + "54": 1261849088.0, + "55": 1261849088.0, + "56": 1261849088.0, + "57": 1261849088.0, + "58": 1261849088.0, + "59": 1261849088.0, + "60": 1261849088.0, + "61": 1261849088.0, + "62": 1261849088.0, + "63": 1261849088.0, + "64": 1261849088.0, + "65": 1261849088.0, + "66": 1261849088.0, + "67": 1261849088.0, + "68": 1261849088.0, + "69": 1261849088.0, + "70": 1261849088.0, + "71": 1261849088.0, + "72": 1261849088.0, + "73": 1261849088.0, + "74": 1261849088.0, + "75": 1261849088.0, + "76": 1261849088.0, + "77": 1261849088.0, + "78": 1261849088.0, + "79": 1261849088.0, + "80": 1261849088.0, + "81": 1261849088.0, + "82": 1261849088.0, + "83": 1261849088.0, + "84": 1261849088.0, + "85": 1261849088.0, + "86": 1261849088.0, + "87": 1261849088.0, + "88": 1261849088.0, + "89": 1261849088.0, + "90": 1261849088.0, + "91": 1261849088.0, + "92": 1261849088.0, + "93": 1261849088.0, + "94": 1261849088.0, + "95": 1261849088.0, + "96": 1261849088.0, + "97": 1261849088.0, + "98": 1261849088.0, + "99": 1261849088.0, + "100": 1261849088.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": 
"nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 2530924544.0, + "52": 2564480000.0, + "53": 2564480000.0, + "54": 2564480000.0, + "55": 2564480000.0, + "56": 2564480000.0, + "57": 2564480000.0, + "58": 2564480000.0, + "59": 2564480000.0, + "60": 2564480000.0, + "61": 2564480000.0, + "62": 2564480000.0, + "63": 2564480000.0, + "64": 2564480000.0, + "65": 2564480000.0, + "66": 2564480000.0, + "67": 2564480000.0, + "68": 2564480000.0, + "69": 2564480000.0, + "70": 2564480000.0, + "71": 2564480000.0, + "72": 2564480000.0, + "73": 2564480000.0, + "74": 2564480000.0, + "75": 2564480000.0, + "76": 2564480000.0, + "77": 2564480000.0, + "78": 2564480000.0, + "79": 2564480000.0, + "80": 2564480000.0, + "81": 2564480000.0, + "82": 2564480000.0, + "83": 2564480000.0, + "84": 2564480000.0, + "85": 2564480000.0, + "86": 2564480000.0, + "87": 2564480000.0, + "88": 2564480000.0, + "89": 2564480000.0, + "90": 2564480000.0, + "91": 2564480000.0, + "92": 2564480000.0, + "93": 2564480000.0, + "94": 2564480000.0, + "95": 2564480000.0, + "96": 2564480000.0, + "97": 2564480000.0, + "98": 2564480000.0, + "99": 2564480000.0, + "100": 2564480000.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + 
"35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": "nan", + "52": 1.73474, + "53": 0.13329, + "54": 0.12193, + "55": 0.12308, + "56": 0.12634, + "57": 0.12745, + "58": 0.12425, + "59": 0.12199, + "60": 0.12359, + "61": 0.11982, + "62": 0.12161, + "63": 0.11993, + "64": 0.12221, + "65": 0.12364, + "66": 0.12245, + "67": 0.12126, + "68": 0.12211, + "69": 0.11961, + "70": 0.12166, + "71": 0.11999, + "72": 0.12512, + "73": 0.12157, + "74": 0.11996, + "75": 0.12183, + "76": 0.11982, + "77": 0.1205, + "78": 0.12225, + "79": 0.12245, + "80": 0.12222, + "81": 0.12087, + "82": 0.11834, + "83": 0.11849, + "84": 0.11754, + "85": 0.1168, + "86": 0.11739, + "87": 0.11786, + "88": 0.1178, + "89": 0.11801, + "90": 0.11757, + "91": 0.11668, + "92": 0.11659, + "93": 0.11656, + "94": 0.11741, + "95": 0.11613, + "96": 0.11801, + "97": 0.11735, + "98": 0.1168, + "99": 0.11724, + "100": 0.1175 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/golden_values_dev_dgx_gb200.json index 5470d60dcdb..684fd4831fe 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/golden_values_dev_dgx_gb200.json @@ -433,105 +433,105 @@ "step_interval": 1, "values": { "1": "nan", - "2": 3.09607, - "3": 0.15089, - "4": 0.16387, - "5": 0.13417, - "6": 0.12738, - "7": 0.12788, - "8": 0.132, - "9": 0.28261, - "10": 0.12697, - "11": 0.13182, - "12": 0.13355, - "13": 0.13045, - "14": 0.13241, - "15": 0.1311, - "16": 
0.13108, - "17": 0.13531, - "18": 0.13102, - "19": 0.13307, - "20": 0.13285, - "21": 0.13577, - "22": 0.13581, - "23": 0.13315, - "24": 0.13227, - "25": 0.13286, - "26": 0.13451, - "27": 0.13303, - "28": 0.13253, - "29": 0.29925, - "30": 0.13379, - "31": 0.13315, - "32": 0.13374, - "33": 0.13205, - "34": 0.13144, - "35": 0.13199, - "36": 0.13191, - "37": 0.13367, - "38": 0.13204, - "39": 0.13375, - "40": 0.1347, - "41": 0.13056, - "42": 0.13244, - "43": 0.13361, - "44": 0.13216, - "45": 0.13279, - "46": 0.12873, - "47": 0.13055, - "48": 0.13334, - "49": 0.1341, - "50": 0.13588, - "51": 0.1385, - "52": 0.12954, - "53": 0.1309, - "54": 0.12956, - "55": 0.12942, - "56": 0.12835, - "57": 0.13126, - "58": 0.13085, - "59": 0.17194, - "60": 0.12864, - "61": 0.13121, - "62": 0.13254, - "63": 0.17379, - "64": 0.1288, - "65": 0.13106, - "66": 0.13033, - "67": 0.13051, - "68": 0.12867, - "69": 0.13001, - "70": 0.12842, - "71": 0.13086, - "72": 0.13042, - "73": 0.13305, - "74": 0.13253, - "75": 0.13136, - "76": 0.13325, - "77": 0.13253, - "78": 0.13157, - "79": 0.13256, - "80": 0.13095, - "81": 0.13101, - "82": 0.13389, - "83": 0.13228, - "84": 0.13283, - "85": 0.13274, - "86": 0.13308, - "87": 0.13089, - "88": 0.13159, - "89": 0.13218, - "90": 0.13253, - "91": 0.13284, - "92": 0.13376, - "93": 0.13307, - "94": 0.13459, - "95": 0.13415, - "96": 0.13629, - "97": 0.13635, - "98": 0.1381, - "99": 0.13441, - "100": 0.1359 + "2": 2.22064, + "3": 0.13024, + "4": 0.11768, + "5": 0.11875, + "6": 0.11742, + "7": 0.11821, + "8": 0.11878, + "9": 0.11922, + "10": 0.11834, + "11": 0.11707, + "12": 0.1171, + "13": 0.11874, + "14": 0.12245, + "15": 0.11821, + "16": 0.1177, + "17": 0.11857, + "18": 0.11778, + "19": 0.1187, + "20": 0.11835, + "21": 0.12351, + "22": 0.11771, + "23": 0.11773, + "24": 0.11819, + "25": 0.11705, + "26": 0.12602, + "27": 0.12585, + "28": 0.12677, + "29": 0.12752, + "30": 0.12847, + "31": 0.12883, + "32": 0.12586, + "33": 0.12437, + "34": 0.12277, + "35": 0.12212, + 
"36": 0.12255, + "37": 0.12084, + "38": 0.12104, + "39": 0.12124, + "40": 0.12086, + "41": 0.12101, + "42": 0.11969, + "43": 0.1197, + "44": 0.11956, + "45": 0.11977, + "46": 0.11865, + "47": 0.11795, + "48": 0.11928, + "49": 0.11794, + "50": 0.11851, + "51": 0.12726, + "52": 0.11929, + "53": 0.11813, + "54": 0.11818, + "55": 0.11748, + "56": 0.11707, + "57": 0.11896, + "58": 0.11832, + "59": 0.11799, + "60": 0.11784, + "61": 0.11888, + "62": 0.11879, + "63": 0.11819, + "64": 0.1185, + "65": 0.11926, + "66": 0.11924, + "67": 0.11982, + "68": 0.11873, + "69": 0.11986, + "70": 0.11895, + "71": 0.11964, + "72": 0.11906, + "73": 0.12017, + "74": 0.11976, + "75": 0.11759, + "76": 0.11921, + "77": 0.11907, + "78": 0.11823, + "79": 0.11867, + "80": 0.11934, + "81": 0.11888, + "82": 0.11988, + "83": 0.1213, + "84": 0.11913, + "85": 0.12002, + "86": 0.12046, + "87": 0.11952, + "88": 0.11819, + "89": 0.11901, + "90": 0.11918, + "91": 0.11919, + "92": 0.11824, + "93": 0.12018, + "94": 0.11929, + "95": 0.11974, + "96": 0.11767, + "97": 0.11845, + "98": 0.11695, + "99": 0.11892, + "100": 0.11948 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/golden_values_dev_dgx_gb200_2nd.json new file mode 100644 index 00000000000..f92f1c4672e --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/golden_values_dev_dgx_gb200_2nd.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", 
+ "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 9.76757, + "52": 9.64732, + "53": 9.995, + "54": 9.88603, + "55": 9.81763, + "56": 9.53914, + "57": 9.38192, + "58": 9.75896, + "59": 9.52106, + "60": 9.42443, + "61": 9.63665, + "62": 9.92974, + "63": 9.29595, + "64": 9.70631, + "65": 8.88066, + "66": 9.64072, + "67": 9.32146, + "68": 9.73692, + "69": 9.75346, + "70": 9.68289, + "71": 9.58117, + "72": 9.52491, + "73": 9.44094, + "74": 8.86077, + "75": 9.36671, + "76": 9.01691, + "77": 10.02224, + "78": 9.68354, + "79": 9.33325, + "80": 9.3582, + "81": 9.43786, + "82": 9.66102, + "83": 9.26223, + "84": 9.37189, + "85": 9.56652, + "86": 9.04493, + "87": 9.5575, + "88": 9.70541, + "89": 9.55092, + "90": 9.79196, + "91": 9.29173, + "92": 9.31225, + "93": 9.0433, + "94": 8.78683, + "95": 9.49525, + "96": 9.48391, + "97": 9.25966, + "98": 9.62611, + "99": 8.85031, + "100": 9.36043 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": 
"nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 2613.0, + "52": 2647.0, + "53": 2908.0, + "54": 2580.0, + "55": 2486.0, + "56": 2687.0, + "57": 2577.0, + "58": 2824.0, + "59": 2720.0, + "60": 2410.0, + "61": 2744.0, + "62": 2536.0, + "63": 2652.0, + "64": 2918.0, + "65": 2742.0, + "66": 2927.0, + "67": 2920.0, + "68": 2652.0, + "69": 3019.0, + "70": 2996.0, + "71": 2835.0, + "72": 2664.0, + "73": 3211.0, + "74": 2311.0, + "75": 2658.0, + "76": 3155.0, + "77": 3051.0, + "78": 3073.0, + "79": 3116.0, + "80": 3191.0, + "81": 3237.0, + "82": 3218.0, + "83": 2689.0, + "84": 3294.0, + "85": 3209.0, + "86": 2558.0, + "87": 3462.0, + "88": 3287.0, + "89": 3201.0, + "90": 3331.0, + "91": 3183.0, + "92": 3201.0, + "93": 2942.0, + "94": 3274.0, + "95": 3132.0, + "96": 3200.0, + "97": 3054.0, + "98": 3544.0, + "99": 3387.0, + "100": 3192.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 917459968.0, + "52": 917459968.0, + "53": 917459968.0, + "54": 917459968.0, + "55": 917459968.0, + "56": 917459968.0, + "57": 917459968.0, + "58": 917459968.0, + "59": 917459968.0, + 
"60": 917459968.0, + "61": 917459968.0, + "62": 917459968.0, + "63": 917459968.0, + "64": 917459968.0, + "65": 917459968.0, + "66": 917459968.0, + "67": 917459968.0, + "68": 917459968.0, + "69": 917459968.0, + "70": 917459968.0, + "71": 917459968.0, + "72": 917459968.0, + "73": 917459968.0, + "74": 917459968.0, + "75": 917459968.0, + "76": 917459968.0, + "77": 917459968.0, + "78": 917459968.0, + "79": 917459968.0, + "80": 917459968.0, + "81": 917459968.0, + "82": 917459968.0, + "83": 917459968.0, + "84": 917459968.0, + "85": 917459968.0, + "86": 917459968.0, + "87": 917459968.0, + "88": 917459968.0, + "89": 917459968.0, + "90": 917459968.0, + "91": 917459968.0, + "92": 917459968.0, + "93": 917459968.0, + "94": 917459968.0, + "95": 917459968.0, + "96": 917459968.0, + "97": 917459968.0, + "98": 917459968.0, + "99": 917459968.0, + "100": 917459968.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 2622354432.0, + "52": 2622355456.0, + "53": 2622355456.0, + "54": 2622355456.0, + "55": 2622355456.0, + "56": 2622355456.0, + "57": 2622355456.0, + "58": 2622355456.0, + "59": 2622355456.0, + "60": 2622355456.0, + "61": 2622355456.0, + "62": 2622355456.0, + "63": 2622355456.0, 
+ "64": 2622355456.0, + "65": 2622355456.0, + "66": 2622355456.0, + "67": 2622355456.0, + "68": 2622355456.0, + "69": 2622355456.0, + "70": 2622355456.0, + "71": 2622355456.0, + "72": 2622355456.0, + "73": 2622355456.0, + "74": 2622355456.0, + "75": 2622355456.0, + "76": 2622355456.0, + "77": 2622355456.0, + "78": 2622355456.0, + "79": 2622355456.0, + "80": 2622355456.0, + "81": 2622355456.0, + "82": 2622355456.0, + "83": 2622355456.0, + "84": 2622355456.0, + "85": 2622355456.0, + "86": 2622355456.0, + "87": 2622355456.0, + "88": 2622355456.0, + "89": 2622355456.0, + "90": 2622355456.0, + "91": 2622355456.0, + "92": 2622355456.0, + "93": 2622355456.0, + "94": 2622355456.0, + "95": 2622355456.0, + "96": 2622355456.0, + "97": 2622355456.0, + "98": 2622355456.0, + "99": 2622355456.0, + "100": 2622355456.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": "nan", + "52": 2.13245, + "53": 0.1297, + "54": 0.11767, + "55": 0.11927, + "56": 0.12061, + "57": 0.12305, + "58": 0.12043, + "59": 0.11822, + "60": 0.11725, + "61": 0.11813, + "62": 0.11746, + "63": 0.11736, + "64": 0.11897, + "65": 0.12036, + "66": 0.11746, + "67": 0.11937, + "68": 0.11862, + "69": 0.11914, + "70": 0.11949, + 
"71": 0.11638, + "72": 0.11794, + "73": 0.11866, + "74": 0.11751, + "75": 0.11637, + "76": 0.11834, + "77": 0.11768, + "78": 0.11854, + "79": 0.11727, + "80": 0.11732, + "81": 0.11811, + "82": 0.11878, + "83": 0.11805, + "84": 0.11921, + "85": 0.11932, + "86": 0.11908, + "87": 0.12476, + "88": 0.12628, + "89": 0.12876, + "90": 0.12617, + "91": 0.12743, + "92": 0.12783, + "93": 0.12812, + "94": 0.12493, + "95": 0.12584, + "96": 0.12791, + "97": 0.12455, + "98": 0.1269, + "99": 0.12715, + "100": 0.12714 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_dev_dgx_gb200.json index 13a8f84be7c..d7fe14d5d09 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_dev_dgx_gb200.json @@ -432,106 +432,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 6.53475, - "2": 0.15984, - "3": 0.14377, - "4": 0.29674, - "5": 0.13063, - "6": 0.13043, - "7": 0.13235, - "8": 0.13474, - "9": 0.13363, - "10": 0.13339, - "11": 0.13581, - "12": 0.13012, - "13": 0.13019, - "14": 0.13252, - "15": 0.13313, - "16": 0.13357, - "17": 0.13327, - "18": 0.13417, - "19": 0.1331, - "20": 0.1329, - "21": 0.13223, - "22": 0.32857, - "23": 0.13492, - "24": 0.13386, - "25": 0.13206, - "26": 0.13477, - "27": 0.13149, - "28": 0.13502, - "29": 0.13417, - "30": 0.13168, - "31": 0.13588, - "32": 0.13436, - "33": 0.13143, - "34": 0.13205, - "35": 0.13068, - "36": 0.13116, - "37": 0.13337, - "38": 0.22586, - "39": 0.13222, - "40": 0.13032, - "41": 0.13333, - "42": 0.13093, - "43": 0.13146, 
- "44": 0.17904, - "45": 0.13291, - "46": 0.13299, - "47": 0.13217, - "48": 0.19742, - "49": 0.24879, - "50": 0.13041, - "51": 0.17217, - "52": 0.14728, - "53": 0.14883, - "54": 0.15217, - "55": 0.15333, - "56": 0.15162, - "57": 0.14349, - "58": 0.5576, - "59": 0.13842, - "60": 0.13366, - "61": 0.13505, - "62": 0.13481, - "63": 0.13475, - "64": 0.13594, - "65": 0.13184, - "66": 0.13558, - "67": 0.1672, - "68": 0.13268, - "69": 0.13176, - "70": 0.13495, - "71": 0.13816, - "72": 0.13681, - "73": 0.13679, - "74": 0.13748, - "75": 0.13564, - "76": 0.13376, - "77": 0.13018, - "78": 0.13137, - "79": 0.13475, - "80": 0.1358, - "81": 0.1337, - "82": 0.13153, - "83": 0.13119, - "84": 0.13428, - "85": 0.15135, - "86": 0.13542, - "87": 0.13379, - "88": 0.13317, - "89": 0.13159, - "90": 0.1344, - "91": 0.13415, - "92": 0.1338, - "93": 0.13311, - "94": 0.13567, - "95": 0.13426, - "96": 0.13525, - "97": 0.13575, - "98": 0.133, - "99": 0.13093, - "100": 0.13623 + "1": "nan", + "2": 2.44626, + "3": 0.14544, + "4": 0.14069, + "5": 0.13132, + "6": 0.13447, + "7": 0.13519, + "8": 0.13562, + "9": 0.13513, + "10": 0.13387, + "11": 0.13378, + "12": 0.13575, + "13": 0.13462, + "14": 0.13384, + "15": 0.13412, + "16": 0.13347, + "17": 0.13555, + "18": 0.13515, + "19": 0.13443, + "20": 0.14433, + "21": 0.14638, + "22": 0.14561, + "23": 0.13968, + "24": 0.13694, + "25": 0.14479, + "26": 0.14038, + "27": 0.1473, + "28": 0.14099, + "29": 0.13829, + "30": 0.13782, + "31": 0.13746, + "32": 0.13573, + "33": 0.13325, + "34": 0.14385, + "35": 0.14367, + "36": 0.14113, + "37": 0.1394, + "38": 0.136, + "39": 0.13678, + "40": 0.13539, + "41": 0.1364, + "42": 0.13593, + "43": 0.13738, + "44": 0.13238, + "45": 0.13667, + "46": 0.14472, + "47": 0.1358, + "48": 0.13697, + "49": 0.13391, + "50": 0.13536, + "51": 0.16637, + "52": 0.15213, + "53": 0.14685, + "54": 0.14134, + "55": 0.14007, + "56": 0.13524, + "57": 0.13779, + "58": 0.13841, + "59": 0.13821, + "60": 0.13687, + "61": 0.13663, + "62": 0.13401, + 
"63": 0.13389, + "64": 0.13289, + "65": 0.13362, + "66": 0.13754, + "67": 0.13473, + "68": 0.13402, + "69": 0.13491, + "70": 0.13536, + "71": 0.13258, + "72": 0.13482, + "73": 0.13371, + "74": 0.13507, + "75": 0.13595, + "76": 0.13613, + "77": 0.13395, + "78": 0.13252, + "79": 0.13394, + "80": 0.13329, + "81": 0.13388, + "82": 0.13407, + "83": 0.13522, + "84": 0.13579, + "85": 0.13452, + "86": 0.13422, + "87": 0.13388, + "88": 0.1343, + "89": 0.13546, + "90": 0.13522, + "91": 0.13458, + "92": 0.1341, + "93": 0.13519, + "94": 0.13534, + "95": 0.13521, + "96": 0.13886, + "97": 0.13832, + "98": 0.14048, + "99": 0.14022, + "100": 0.13732 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_dev_dgx_gb200_2nd.json new file mode 100644 index 00000000000..13e04822669 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_dev_dgx_gb200_2nd.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + 
"46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 9.76773, + "52": 9.64757, + "53": 9.99521, + "54": 9.88624, + "55": 9.81783, + "56": 9.53944, + "57": 9.38198, + "58": 9.75913, + "59": 9.52125, + "60": 9.42463, + "61": 9.63669, + "62": 9.93001, + "63": 9.29627, + "64": 9.70638, + "65": 8.88076, + "66": 9.64079, + "67": 9.32154, + "68": 9.737, + "69": 9.75369, + "70": 9.68294, + "71": 9.58129, + "72": 9.52492, + "73": 9.44113, + "74": 8.86077, + "75": 9.3667, + "76": 9.01682, + "77": 10.0224, + "78": 9.68369, + "79": 9.33323, + "80": 9.35819, + "81": 9.43805, + "82": 9.66108, + "83": 9.26227, + "84": 9.37195, + "85": 9.56661, + "86": 9.04515, + "87": 9.55767, + "88": 9.70545, + "89": 9.55104, + "90": 9.79186, + "91": 9.29174, + "92": 9.31247, + "93": 9.04313, + "94": 8.7869, + "95": 9.49543, + "96": 9.48418, + "97": 9.25973, + "98": 9.62635, + "99": 8.85054, + "100": 9.36076 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 2647.0, + "52": 2648.0, + "53": 2878.0, + "54": 2654.0, + "55": 2580.0, + "56": 2658.0, + "57": 2547.0, + "58": 2739.0, + "59": 2779.0, + "60": 2349.0, + "61": 2741.0, + "62": 2617.0, + "63": 2512.0, + "64": 2800.0, + "65": 2697.0, 
+ "66": 2966.0, + "67": 2952.0, + "68": 2833.0, + "69": 3029.0, + "70": 2977.0, + "71": 2813.0, + "72": 2664.0, + "73": 3085.0, + "74": 2292.0, + "75": 2810.0, + "76": 3025.0, + "77": 3025.0, + "78": 3037.0, + "79": 3181.0, + "80": 3234.0, + "81": 3273.0, + "82": 3294.0, + "83": 2707.0, + "84": 3332.0, + "85": 3336.0, + "86": 2585.0, + "87": 3448.0, + "88": 3239.0, + "89": 3137.0, + "90": 3341.0, + "91": 3188.0, + "92": 3246.0, + "93": 2823.0, + "94": 3358.0, + "95": 3202.0, + "96": 3118.0, + "97": 3163.0, + "98": 3645.0, + "99": 3345.0, + "100": 3201.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 917459968.0, + "52": 917459968.0, + "53": 917459968.0, + "54": 917459968.0, + "55": 917459968.0, + "56": 917459968.0, + "57": 917459968.0, + "58": 917459968.0, + "59": 917459968.0, + "60": 917459968.0, + "61": 917459968.0, + "62": 917459968.0, + "63": 917459968.0, + "64": 917459968.0, + "65": 917459968.0, + "66": 917459968.0, + "67": 917459968.0, + "68": 917459968.0, + "69": 917459968.0, + "70": 917459968.0, + "71": 917459968.0, + "72": 917459968.0, + "73": 917459968.0, + "74": 917459968.0, + "75": 917459968.0, + "76": 917459968.0, + "77": 917459968.0, + "78": 917459968.0, 
+ "79": 917459968.0, + "80": 917459968.0, + "81": 917459968.0, + "82": 917459968.0, + "83": 917459968.0, + "84": 917459968.0, + "85": 917459968.0, + "86": 917459968.0, + "87": 917459968.0, + "88": 917459968.0, + "89": 917459968.0, + "90": 917459968.0, + "91": 917459968.0, + "92": 917459968.0, + "93": 917459968.0, + "94": 917459968.0, + "95": 917459968.0, + "96": 917459968.0, + "97": 917459968.0, + "98": 917459968.0, + "99": 917459968.0, + "100": 917459968.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 2625500160.0, + "52": 2625501184.0, + "53": 2625501184.0, + "54": 2625501184.0, + "55": 2625501184.0, + "56": 2625501184.0, + "57": 2625501184.0, + "58": 2625501184.0, + "59": 2625501184.0, + "60": 2625501184.0, + "61": 2625501184.0, + "62": 2625501184.0, + "63": 2625501184.0, + "64": 2625501184.0, + "65": 2625501184.0, + "66": 2625501184.0, + "67": 2625501184.0, + "68": 2625501184.0, + "69": 2625501184.0, + "70": 2625501184.0, + "71": 2625501184.0, + "72": 2625501184.0, + "73": 2625501184.0, + "74": 2625501184.0, + "75": 2625501184.0, + "76": 2625501184.0, + "77": 2625501184.0, + "78": 2625501184.0, + "79": 2625501184.0, + "80": 2625501184.0, + "81": 2625501184.0, 
+ "82": 2625501184.0, + "83": 2625501184.0, + "84": 2625501184.0, + "85": 2625501184.0, + "86": 2625501184.0, + "87": 2625501184.0, + "88": 2625501184.0, + "89": 2625501184.0, + "90": 2625501184.0, + "91": 2625501184.0, + "92": 2625501184.0, + "93": 2625501184.0, + "94": 2625501184.0, + "95": 2625501184.0, + "96": 2625501184.0, + "97": 2625501184.0, + "98": 2625501184.0, + "99": 2625501184.0, + "100": 2625501184.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": "nan", + "52": 2.60445, + "53": 0.14686, + "54": 0.13325, + "55": 0.13174, + "56": 0.13234, + "57": 0.13268, + "58": 0.13337, + "59": 0.13324, + "60": 0.13107, + "61": 0.13206, + "62": 0.1329, + "63": 0.13379, + "64": 0.1348, + "65": 0.13602, + "66": 0.13298, + "67": 0.13465, + "68": 0.13495, + "69": 0.13454, + "70": 0.13536, + "71": 0.13494, + "72": 0.13541, + "73": 0.13997, + "74": 0.1423, + "75": 0.13785, + "76": 0.14664, + "77": 0.16548, + "78": 0.17704, + "79": 0.15011, + "80": 0.14471, + "81": 0.13952, + "82": 0.13892, + "83": 0.13568, + "84": 0.13463, + "85": 0.13878, + "86": 0.13867, + "87": 0.13899, + "88": 0.13819, + "89": 0.13945, + "90": 0.13964, + "91": 0.13862, + "92": 0.13655, + "93": 0.13587, + "94": 
0.13572, + "95": 0.1357, + "96": 0.13598, + "97": 0.13642, + "98": 0.13742, + "99": 0.13474, + "100": 0.13647 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/golden_values_dev_dgx_gb200.json index 4143efc2988..63c74381364 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/golden_values_dev_dgx_gb200.json @@ -432,106 +432,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 5.52284, - "2": 0.157, - "3": 0.14283, - "4": 0.12717, - "5": 0.23804, - "6": 0.12672, - "7": 0.23745, - "8": 0.12054, - "9": 0.21684, - "10": 0.11896, - "11": 0.13284, - "12": 0.11855, - "13": 0.11845, - "14": 0.11744, - "15": 0.11809, - "16": 0.11959, - "17": 0.11704, - "18": 0.22382, - "19": 0.30417, - "20": 0.13849, - "21": 0.11644, - "22": 0.24942, - "23": 0.11902, - "24": 0.11673, - "25": 0.11881, - "26": 0.11714, - "27": 0.26517, - "28": 0.11796, - "29": 0.11692, - "30": 0.1177, - "31": 0.1199, - "32": 0.11855, - "33": 0.20894, - "34": 0.1189, - "35": 0.11946, - "36": 0.11731, - "37": 0.11585, - "38": 0.22438, - "39": 0.11586, - "40": 0.31661, - "41": 0.27224, - "42": 0.11828, - "43": 0.11576, - "44": 0.31558, - "45": 0.11735, - "46": 0.11931, - "47": 0.2329, - "48": 0.20057, - "49": 0.11638, - "50": 0.14553, - "51": 0.15092, - "52": 0.12868, - "53": 0.29978, - "54": 0.13487, - "55": 0.1206, - "56": 0.117, - "57": 0.117, - "58": 0.11712, - "59": 0.11789, - "60": 0.11693, - "61": 0.11525, - "62": 0.24109, - "63": 0.11906, - "64": 0.12054, - "65": 0.11805, - "66": 0.11831, - "67": 0.11744, - "68": 0.11454, - "69": 0.39474, - "70": 0.11683, - "71": 0.117, - 
"72": 0.11875, - "73": 0.28446, - "74": 0.22373, - "75": 0.11573, - "76": 0.1177, - "77": 0.11707, - "78": 0.24184, - "79": 0.11755, - "80": 0.11784, - "81": 0.21803, - "82": 0.11787, - "83": 0.23349, - "84": 0.22596, - "85": 0.11587, - "86": 0.11507, - "87": 0.16522, - "88": 0.24306, - "89": 0.12003, - "90": 0.23071, - "91": 0.12051, - "92": 0.12072, - "93": 0.11991, - "94": 0.22186, - "95": 0.12105, - "96": 0.12128, - "97": 0.11916, - "98": 0.12303, - "99": 0.1197, - "100": 0.1207 + "1": "nan", + "2": 2.20838, + "3": 0.13042, + "4": 0.11826, + "5": 0.11718, + "6": 0.11797, + "7": 0.1177, + "8": 0.11717, + "9": 0.11846, + "10": 0.11778, + "11": 0.11712, + "12": 0.11866, + "13": 0.12004, + "14": 0.11788, + "15": 0.11787, + "16": 0.1181, + "17": 0.11903, + "18": 0.11843, + "19": 0.11754, + "20": 0.11834, + "21": 0.11897, + "22": 0.12726, + "23": 0.13834, + "24": 0.15039, + "25": 0.14107, + "26": 0.14586, + "27": 0.16343, + "28": 0.2297, + "29": 0.26681, + "30": 0.19748, + "31": 0.2586, + "32": 0.12068, + "33": 0.11944, + "34": 0.11896, + "35": 0.11984, + "36": 0.11823, + "37": 0.11997, + "38": 0.11949, + "39": 0.11877, + "40": 0.11898, + "41": 0.11996, + "42": 0.11893, + "43": 0.12547, + "44": 0.13195, + "45": 0.12144, + "46": 0.11997, + "47": 0.12005, + "48": 0.11855, + "49": 0.11944, + "50": 0.11842, + "51": 0.14635, + "52": 0.12016, + "53": 0.11762, + "54": 0.11802, + "55": 0.1184, + "56": 0.11774, + "57": 0.12181, + "58": 0.11784, + "59": 0.11936, + "60": 0.11831, + "61": 0.11819, + "62": 0.11807, + "63": 0.11828, + "64": 0.11663, + "65": 0.11901, + "66": 0.1168, + "67": 0.1167, + "68": 0.12002, + "69": 0.12016, + "70": 0.1186, + "71": 0.11772, + "72": 0.1189, + "73": 0.11915, + "74": 0.11908, + "75": 0.11898, + "76": 0.11863, + "77": 0.11869, + "78": 0.11971, + "79": 0.11843, + "80": 0.1198, + "81": 0.12003, + "82": 0.11885, + "83": 0.11905, + "84": 0.12002, + "85": 0.1192, + "86": 0.11872, + "87": 0.11777, + "88": 0.11801, + "89": 0.11864, + "90": 0.11769, + 
"91": 0.11692, + "92": 0.12015, + "93": 0.12072, + "94": 0.11802, + "95": 0.11798, + "96": 0.12278, + "97": 0.11941, + "98": 0.1174, + "99": 0.11816, + "100": 0.12102 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/golden_values_dev_dgx_gb200_2nd.json new file mode 100644 index 00000000000..12556b60c96 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/golden_values_dev_dgx_gb200_2nd.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 9.83807, + "52": 9.72058, + "53": 10.0568, + "54": 9.95032, + "55": 9.88328, + "56": 9.60431, + "57": 9.45518, + "58": 9.81927, + "59": 9.58262, + "60": 9.48844, + "61": 9.68577, + "62": 9.97779, + "63": 9.36765, + "64": 9.75913, + "65": 8.9376, + "66": 9.69257, + "67": 9.36621, + "68": 9.78303, + "69": 9.79318, + "70": 9.72699, + "71": 9.62875, + "72": 9.58004, + "73": 9.487, + "74": 8.92041, + "75": 9.41128, + "76": 9.07564, + "77": 
10.05848, + "78": 9.72184, + "79": 9.3732, + "80": 9.40079, + "81": 9.4792, + "82": 9.69754, + "83": 9.31037, + "84": 9.41777, + "85": 9.61194, + "86": 9.07155, + "87": 9.59661, + "88": 9.74709, + "89": 9.59667, + "90": 9.82915, + "91": 9.33725, + "92": 9.3564, + "93": 9.08552, + "94": 8.82807, + "95": 9.52842, + "96": 9.52611, + "97": 9.30632, + "98": 9.66808, + "99": 8.89461, + "100": 9.40666 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 2546.0, + "52": 2590.0, + "53": 2879.0, + "54": 2697.0, + "55": 2316.0, + "56": 2549.0, + "57": 2261.0, + "58": 2904.0, + "59": 2740.0, + "60": 2434.0, + "61": 2801.0, + "62": 2663.0, + "63": 2502.0, + "64": 2948.0, + "65": 2644.0, + "66": 2961.0, + "67": 2813.0, + "68": 2686.0, + "69": 2912.0, + "70": 3096.0, + "71": 2854.0, + "72": 2454.0, + "73": 3081.0, + "74": 1933.0, + "75": 2465.0, + "76": 3012.0, + "77": 3163.0, + "78": 2997.0, + "79": 3089.0, + "80": 3187.0, + "81": 3500.0, + "82": 3339.0, + "83": 2705.0, + "84": 3205.0, + "85": 3033.0, + "86": 2818.0, + "87": 3671.0, + "88": 3190.0, + "89": 3336.0, + "90": 3320.0, + "91": 2698.0, + "92": 3072.0, + "93": 2750.0, + "94": 3397.0, + "95": 3317.0, + "96": 3290.0, + "97": 3116.0, + "98": 
3732.0, + "99": 3049.0, + "100": 2974.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 746195456.0, + "52": 746195456.0, + "53": 746195456.0, + "54": 746195456.0, + "55": 746195456.0, + "56": 746195456.0, + "57": 746195456.0, + "58": 746195456.0, + "59": 746195456.0, + "60": 746195456.0, + "61": 746195456.0, + "62": 746195456.0, + "63": 746195456.0, + "64": 746195456.0, + "65": 746195456.0, + "66": 746195456.0, + "67": 746195456.0, + "68": 746195456.0, + "69": 746195456.0, + "70": 746195456.0, + "71": 746195456.0, + "72": 746195456.0, + "73": 746195456.0, + "74": 746195456.0, + "75": 746195456.0, + "76": 746195456.0, + "77": 746195456.0, + "78": 746195456.0, + "79": 746195456.0, + "80": 746195456.0, + "81": 746195456.0, + "82": 746195456.0, + "83": 746195456.0, + "84": 746195456.0, + "85": 746195456.0, + "86": 746195456.0, + "87": 746195456.0, + "88": 746195456.0, + "89": 746195456.0, + "90": 746195456.0, + "91": 746195456.0, + "92": 746195456.0, + "93": 746195456.0, + "94": 746195456.0, + "95": 746195456.0, + "96": 746195456.0, + "97": 746195456.0, + "98": 746195456.0, + "99": 746195456.0, + "100": 746195456.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, 
+ "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 2223482880.0, + "52": 2223483904.0, + "53": 2223483904.0, + "54": 2223483904.0, + "55": 2223483904.0, + "56": 2223483904.0, + "57": 2223483904.0, + "58": 2223483904.0, + "59": 2223483904.0, + "60": 2223483904.0, + "61": 2223483904.0, + "62": 2223483904.0, + "63": 2223483904.0, + "64": 2223483904.0, + "65": 2223483904.0, + "66": 2223483904.0, + "67": 2223483904.0, + "68": 2223483904.0, + "69": 2223483904.0, + "70": 2223483904.0, + "71": 2223483904.0, + "72": 2223483904.0, + "73": 2223483904.0, + "74": 2223483904.0, + "75": 2223483904.0, + "76": 2223483904.0, + "77": 2223483904.0, + "78": 2223483904.0, + "79": 2223483904.0, + "80": 2223483904.0, + "81": 2223483904.0, + "82": 2223483904.0, + "83": 2223483904.0, + "84": 2223483904.0, + "85": 2223483904.0, + "86": 2223483904.0, + "87": 2223483904.0, + "88": 2223483904.0, + "89": 2223483904.0, + "90": 2223483904.0, + "91": 2223483904.0, + "92": 2223483904.0, + "93": 2223483904.0, + "94": 2223483904.0, + "95": 2223483904.0, + "96": 2223483904.0, + "97": 2223483904.0, + "98": 2223483904.0, + "99": 2223483904.0, + "100": 2223483904.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": 
{ + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": "nan", + "52": 1.94458, + "53": 0.13487, + "54": 0.12133, + "55": 0.12128, + "56": 0.12059, + "57": 0.11937, + "58": 0.11813, + "59": 0.11931, + "60": 0.12225, + "61": 0.1198, + "62": 0.1197, + "63": 0.12083, + "64": 0.12132, + "65": 0.12067, + "66": 0.12047, + "67": 0.12065, + "68": 0.12005, + "69": 0.12047, + "70": 0.11977, + "71": 0.1205, + "72": 0.11909, + "73": 0.11956, + "74": 0.12277, + "75": 0.11982, + "76": 0.12087, + "77": 0.12003, + "78": 0.12188, + "79": 0.12094, + "80": 0.12076, + "81": 0.12072, + "82": 0.12053, + "83": 0.11961, + "84": 0.12306, + "85": 0.12275, + "86": 0.11989, + "87": 0.11996, + "88": 0.1294, + "89": 0.12077, + "90": 0.1204, + "91": 0.12138, + "92": 0.11998, + "93": 0.12202, + "94": 0.12092, + "95": 0.11985, + "96": 0.11995, + "97": 0.12124, + "98": 0.12243, + "99": 0.12016, + "100": 0.12049 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_dev_dgx_gb200.json index 29e5fc62d41..a9a12874e97 100644 --- 
a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_dev_dgx_gb200.json @@ -432,106 +432,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 5.46714, - "2": 0.15122, - "3": 0.25092, - "4": 0.27729, - "5": 0.24367, - "6": 0.26751, - "7": 0.2545, - "8": 0.12105, - "9": 0.24325, - "10": 0.12174, - "11": 0.12002, - "12": 0.12125, - "13": 0.25244, - "14": 0.11939, - "15": 0.24088, - "16": 0.11887, - "17": 0.44496, - "18": 0.16003, - "19": 0.25508, - "20": 0.21211, - "21": 0.3079, - "22": 0.12004, - "23": 0.43335, - "24": 0.12, - "25": 0.12101, - "26": 0.12096, - "27": 0.29192, - "28": 0.19864, - "29": 0.26692, - "30": 0.11884, - "31": 0.12045, - "32": 0.12079, - "33": 0.12032, - "34": 0.12022, - "35": 0.21894, - "36": 0.11918, - "37": 0.22006, - "38": 0.34871, - "39": 0.12088, - "40": 0.12089, - "41": 0.12159, - "42": 0.18229, - "43": 0.16394, - "44": 0.11984, - "45": 0.12064, - "46": 0.12128, - "47": 0.17743, - "48": 0.1593, - "49": 0.12034, - "50": 0.11831, - "51": 0.13446, - "52": 0.12243, - "53": 0.11866, - "54": 0.11939, - "55": 0.20902, - "56": 0.13705, - "57": 0.11709, - "58": 0.11749, - "59": 0.11871, - "60": 0.22163, - "61": 0.11825, - "62": 0.22086, - "63": 0.11702, - "64": 0.11919, - "65": 0.12009, - "66": 0.19788, - "67": 0.42941, - "68": 0.11868, - "69": 0.22718, - "70": 0.20618, - "71": 0.13003, - "72": 0.134, - "73": 0.13466, - "74": 0.14293, - "75": 0.22299, - "76": 0.12996, - "77": 0.13433, - "78": 0.13652, - "79": 0.1285, - "80": 0.13881, - "81": 0.13014, - "82": 0.12942, - "83": 0.22639, - "84": 0.1185, - "85": 0.22799, - "86": 0.23089, - "87": 0.11774, - "88": 0.22926, - "89": 0.12055, - "90": 0.11828, - "91": 0.25019, - "92": 0.11977, - "93": 0.1173, - "94": 0.11879, - "95": 0.1161, - "96": 0.34968, - "97": 0.11818, - 
"98": 0.21965, - "99": 0.12107, - "100": 0.11838 + "1": "nan", + "2": 2.22807, + "3": 0.13601, + "4": 0.12128, + "5": 0.1198, + "6": 0.1228, + "7": 0.12056, + "8": 0.11886, + "9": 0.11944, + "10": 0.11995, + "11": 0.11935, + "12": 0.11905, + "13": 0.11975, + "14": 0.12242, + "15": 0.12061, + "16": 0.12046, + "17": 0.1208, + "18": 0.12205, + "19": 0.12427, + "20": 0.12315, + "21": 0.11965, + "22": 0.12231, + "23": 0.12286, + "24": 0.12394, + "25": 0.12377, + "26": 0.12221, + "27": 0.11936, + "28": 0.11894, + "29": 0.11945, + "30": 0.12192, + "31": 0.12571, + "32": 0.12346, + "33": 0.12413, + "34": 0.12225, + "35": 0.12328, + "36": 0.12241, + "37": 0.12432, + "38": 0.12195, + "39": 0.12262, + "40": 0.12198, + "41": 0.12396, + "42": 0.12194, + "43": 0.12435, + "44": 0.12108, + "45": 0.12326, + "46": 0.1218, + "47": 0.12308, + "48": 0.12384, + "49": 0.12795, + "50": 0.12572, + "51": 0.13502, + "52": 0.13106, + "53": 0.14515, + "54": 0.12597, + "55": 0.1249, + "56": 0.12535, + "57": 0.12569, + "58": 0.12489, + "59": 0.12862, + "60": 0.12778, + "61": 0.12731, + "62": 0.12786, + "63": 0.13022, + "64": 0.12789, + "65": 0.12838, + "66": 0.12571, + "67": 0.12651, + "68": 0.12592, + "69": 0.12663, + "70": 0.12691, + "71": 0.12636, + "72": 0.12638, + "73": 0.12671, + "74": 0.12637, + "75": 0.12602, + "76": 0.12598, + "77": 0.12554, + "78": 0.12553, + "79": 0.12501, + "80": 0.13898, + "81": 0.14589, + "82": 0.14718, + "83": 0.14665, + "84": 0.16017, + "85": 0.14231, + "86": 0.15628, + "87": 0.14055, + "88": 0.13961, + "89": 0.14878, + "90": 0.14486, + "91": 0.1432, + "92": 0.14946, + "93": 0.14581, + "94": 0.1623, + "95": 0.15638, + "96": 0.12895, + "97": 0.12907, + "98": 0.12824, + "99": 0.12741, + "100": 0.12543 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_dev_dgx_gb200_2nd.json 
b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_dev_dgx_gb200_2nd.json new file mode 100644 index 00000000000..8cb40558f2c --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_dev_dgx_gb200_2nd.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 9.83807, + "52": 9.72058, + "53": 10.0568, + "54": 9.95032, + "55": 9.88328, + "56": 9.60431, + "57": 9.45518, + "58": 9.81927, + "59": 9.58262, + "60": 9.48844, + "61": 9.68577, + "62": 9.97779, + "63": 9.36765, + "64": 9.75913, + "65": 8.9376, + "66": 9.69257, + "67": 9.36621, + "68": 9.78303, + "69": 9.79318, + "70": 9.72699, + "71": 9.62875, + "72": 9.58004, + "73": 9.487, + "74": 8.92041, + "75": 9.41128, + "76": 9.07564, + "77": 10.05848, + "78": 9.72184, + "79": 9.3732, + "80": 9.40079, + "81": 9.4792, + "82": 9.69754, + "83": 9.31037, + "84": 9.41777, + "85": 9.61194, + "86": 9.07155, + "87": 9.59661, + "88": 9.74709, + "89": 9.59667, + "90": 9.82915, + "91": 9.33725, + "92": 9.3564, + "93": 9.08552, + "94": 8.82807, + "95": 9.52842, + "96": 9.52611, + 
"97": 9.30632, + "98": 9.66808, + "99": 8.89461, + "100": 9.40666 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 2546.0, + "52": 2590.0, + "53": 2879.0, + "54": 2697.0, + "55": 2316.0, + "56": 2549.0, + "57": 2261.0, + "58": 2904.0, + "59": 2740.0, + "60": 2434.0, + "61": 2801.0, + "62": 2663.0, + "63": 2502.0, + "64": 2948.0, + "65": 2644.0, + "66": 2961.0, + "67": 2813.0, + "68": 2686.0, + "69": 2912.0, + "70": 3096.0, + "71": 2854.0, + "72": 2454.0, + "73": 3081.0, + "74": 1933.0, + "75": 2465.0, + "76": 3012.0, + "77": 3163.0, + "78": 2997.0, + "79": 3089.0, + "80": 3187.0, + "81": 3500.0, + "82": 3339.0, + "83": 2705.0, + "84": 3205.0, + "85": 3033.0, + "86": 2818.0, + "87": 3671.0, + "88": 3190.0, + "89": 3336.0, + "90": 3320.0, + "91": 2698.0, + "92": 3072.0, + "93": 2750.0, + "94": 3397.0, + "95": 3317.0, + "96": 3290.0, + "97": 3116.0, + "98": 3732.0, + "99": 3049.0, + "100": 2974.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": 
"nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 746195456.0, + "52": 746195456.0, + "53": 746195456.0, + "54": 746195456.0, + "55": 746195456.0, + "56": 746195456.0, + "57": 746195456.0, + "58": 746195456.0, + "59": 746195456.0, + "60": 746195456.0, + "61": 746195456.0, + "62": 746195456.0, + "63": 746195456.0, + "64": 746195456.0, + "65": 746195456.0, + "66": 746195456.0, + "67": 746195456.0, + "68": 746195456.0, + "69": 746195456.0, + "70": 746195456.0, + "71": 746195456.0, + "72": 746195456.0, + "73": 746195456.0, + "74": 746195456.0, + "75": 746195456.0, + "76": 746195456.0, + "77": 746195456.0, + "78": 746195456.0, + "79": 746195456.0, + "80": 746195456.0, + "81": 746195456.0, + "82": 746195456.0, + "83": 746195456.0, + "84": 746195456.0, + "85": 746195456.0, + "86": 746195456.0, + "87": 746195456.0, + "88": 746195456.0, + "89": 746195456.0, + "90": 746195456.0, + "91": 746195456.0, + "92": 746195456.0, + "93": 746195456.0, + "94": 746195456.0, + "95": 746195456.0, + "96": 746195456.0, + "97": 746195456.0, + "98": 746195456.0, + "99": 746195456.0, + "100": 746195456.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": 
"nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 2223482880.0, + "52": 2223483904.0, + "53": 2223483904.0, + "54": 2223483904.0, + "55": 2223483904.0, + "56": 2223483904.0, + "57": 2223483904.0, + "58": 2223483904.0, + "59": 2223483904.0, + "60": 2223483904.0, + "61": 2223483904.0, + "62": 2223483904.0, + "63": 2223483904.0, + "64": 2223483904.0, + "65": 2223483904.0, + "66": 2223483904.0, + "67": 2223483904.0, + "68": 2223483904.0, + "69": 2223483904.0, + "70": 2223483904.0, + "71": 2223483904.0, + "72": 2223483904.0, + "73": 2223483904.0, + "74": 2223483904.0, + "75": 2223483904.0, + "76": 2223483904.0, + "77": 2223483904.0, + "78": 2223483904.0, + "79": 2223483904.0, + "80": 2223483904.0, + "81": 2223483904.0, + "82": 2223483904.0, + "83": 2223483904.0, + "84": 2223483904.0, + "85": 2223483904.0, + "86": 2223483904.0, + "87": 2223483904.0, + "88": 2223483904.0, + "89": 2223483904.0, + "90": 2223483904.0, + "91": 2223483904.0, + "92": 2223483904.0, + "93": 2223483904.0, + "94": 2223483904.0, + "95": 2223483904.0, + "96": 2223483904.0, + "97": 2223483904.0, + "98": 2223483904.0, + "99": 2223483904.0, + "100": 2223483904.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + 
"23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": "nan", + "52": 2.08357, + "53": 0.13321, + "54": 0.11949, + "55": 0.11861, + "56": 0.11817, + "57": 0.12088, + "58": 0.11937, + "59": 0.11893, + "60": 0.11961, + "61": 0.11894, + "62": 0.11953, + "63": 0.11978, + "64": 0.11983, + "65": 0.12255, + "66": 0.12188, + "67": 0.12135, + "68": 0.11972, + "69": 0.11963, + "70": 0.11929, + "71": 0.11924, + "72": 0.12023, + "73": 0.12093, + "74": 0.12082, + "75": 0.11862, + "76": 0.11797, + "77": 0.11862, + "78": 0.12219, + "79": 0.12137, + "80": 0.11873, + "81": 0.11752, + "82": 0.1208, + "83": 0.11974, + "84": 0.1182, + "85": 0.11721, + "86": 0.11748, + "87": 0.11944, + "88": 0.11934, + "89": 0.11847, + "90": 0.11837, + "91": 0.11938, + "92": 0.11761, + "93": 0.11737, + "94": 0.12142, + "95": 0.12574, + "96": 0.12197, + "97": 0.12384, + "98": 0.12251, + "99": 0.13032, + "100": 0.12305 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/golden_values_dev_dgx_gb200.json index 6d18d551f69..1fc5ef869c5 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/golden_values_dev_dgx_gb200.json @@ -432,106 +432,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 9.70264, - "2": 0.16719, - "3": 0.1517, - "4": 0.13783, - "5": 0.26129, - "6": 
0.13706, - "7": 0.13419, - "8": 0.23253, - "9": 0.27748, - "10": 0.13541, - "11": 0.2497, - "12": 0.16837, - "13": 0.18244, - "14": 0.25112, - "15": 0.13528, - "16": 0.13665, - "17": 0.1335, - "18": 0.24242, - "19": 0.13551, - "20": 0.1359, - "21": 0.23117, - "22": 0.23904, - "23": 0.14673, - "24": 0.21295, - "25": 0.13514, - "26": 0.13371, - "27": 0.27353, - "28": 0.13711, - "29": 0.13562, - "30": 0.14989, - "31": 0.13559, - "32": 0.25304, - "33": 0.13594, - "34": 0.23626, - "35": 0.21619, - "36": 0.13222, - "37": 0.22334, - "38": 0.17132, - "39": 0.13473, - "40": 0.13527, - "41": 0.13612, - "42": 0.13601, - "43": 0.13671, - "44": 0.13525, - "45": 0.13595, - "46": 0.13781, - "47": 0.13561, - "48": 0.21607, - "49": 0.13778, - "50": 0.13576, - "51": 0.15841, - "52": 0.19731, - "53": 0.13535, - "54": 0.13412, - "55": 0.13529, - "56": 0.20892, - "57": 0.136, - "58": 0.13447, - "59": 0.13492, - "60": 0.22138, - "61": 0.1371, - "62": 0.13221, - "63": 0.31035, - "64": 0.13635, - "65": 0.18383, - "66": 0.13523, - "67": 0.21619, - "68": 0.13406, - "69": 0.24552, - "70": 0.13459, - "71": 0.24237, - "72": 0.13438, - "73": 0.13314, - "74": 0.2234, - "75": 0.13466, - "76": 0.13379, - "77": 0.23131, - "78": 0.13685, - "79": 0.2198, - "80": 0.13574, - "81": 0.13541, - "82": 0.24005, - "83": 0.13618, - "84": 0.13532, - "85": 0.13462, - "86": 0.13568, - "87": 0.13402, - "88": 0.22458, - "89": 0.13468, - "90": 0.23352, - "91": 0.14917, - "92": 0.14938, - "93": 0.14799, - "94": 0.23609, - "95": 0.15009, - "96": 0.22721, - "97": 0.15604, - "98": 0.22921, - "99": 0.1552, - "100": 0.15308 + "1": "nan", + "2": 2.5568, + "3": 0.14788, + "4": 0.13602, + "5": 0.13596, + "6": 0.136, + "7": 0.13621, + "8": 0.13502, + "9": 0.13408, + "10": 0.23083, + "11": 0.14377, + "12": 0.14332, + "13": 0.15453, + "14": 0.15537, + "15": 0.15549, + "16": 0.15444, + "17": 0.15453, + "18": 0.15178, + "19": 0.21432, + "20": 0.15336, + "21": 0.1534, + "22": 0.15483, + "23": 0.15395, + "24": 0.15469, + "25": 
0.15447, + "26": 0.15509, + "27": 0.1545, + "28": 0.15527, + "29": 0.15593, + "30": 0.15688, + "31": 0.15659, + "32": 0.15629, + "33": 0.15533, + "34": 0.155, + "35": 0.15519, + "36": 0.15784, + "37": 0.15943, + "38": 0.15552, + "39": 0.15486, + "40": 0.15539, + "41": 0.15618, + "42": 0.15569, + "43": 0.15725, + "44": 0.15522, + "45": 0.1553, + "46": 0.15719, + "47": 0.15571, + "48": 0.15568, + "49": 0.15362, + "50": 0.15495, + "51": 0.18287, + "52": 0.16115, + "53": 0.15739, + "54": 0.15665, + "55": 0.15684, + "56": 0.15658, + "57": 0.15631, + "58": 0.22153, + "59": 0.15604, + "60": 0.15313, + "61": 0.15485, + "62": 0.15518, + "63": 0.15719, + "64": 0.15757, + "65": 0.15904, + "66": 0.15846, + "67": 0.15846, + "68": 0.15754, + "69": 0.15779, + "70": 0.1589, + "71": 0.16037, + "72": 0.15778, + "73": 0.15771, + "74": 0.155, + "75": 0.15611, + "76": 0.15702, + "77": 0.15564, + "78": 0.15892, + "79": 0.15669, + "80": 0.15768, + "81": 0.15805, + "82": 0.15778, + "83": 0.15674, + "84": 0.15715, + "85": 0.15834, + "86": 0.15763, + "87": 0.15855, + "88": 0.15589, + "89": 0.15616, + "90": 0.15639, + "91": 0.15722, + "92": 0.15788, + "93": 0.15597, + "94": 0.15817, + "95": 0.15819, + "96": 0.15869, + "97": 0.15875, + "98": 0.15993, + "99": 0.16297, + "100": 0.16682 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/golden_values_dev_dgx_gb200_2nd.json new file mode 100644 index 00000000000..2cf6e26ff95 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/golden_values_dev_dgx_gb200_2nd.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", 
+ "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 9.9812, + "52": 9.89198, + "53": 10.19208, + "54": 10.09574, + "55": 10.00506, + "56": 9.78714, + "57": 9.64607, + "58": 9.9862, + "59": 9.72684, + "60": 9.67172, + "61": 9.80984, + "62": 10.11126, + "63": 9.54877, + "64": 9.90929, + "65": 9.08735, + "66": 9.84659, + "67": 9.48264, + "68": 9.89439, + "69": 9.87695, + "70": 9.82469, + "71": 9.72751, + "72": 9.72911, + "73": 9.62051, + "74": 9.11601, + "75": 9.55057, + "76": 9.21504, + "77": 10.14893, + "78": 9.8138, + "79": 9.47515, + "80": 9.51582, + "81": 9.58685, + "82": 9.79026, + "83": 9.45587, + "84": 9.50503, + "85": 9.71387, + "86": 9.17463, + "87": 9.66601, + "88": 9.84354, + "89": 9.70734, + "90": 9.8955, + "91": 9.48652, + "92": 9.47023, + "93": 9.21481, + "94": 8.94327, + "95": 9.6154, + "96": 9.63634, + "97": 9.37644, + "98": 9.74975, + "99": 9.01753, + "100": 9.50515 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": 
"nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 2676.0, + "52": 2581.0, + "53": 2898.0, + "54": 2849.0, + "55": 2548.0, + "56": 2661.0, + "57": 2510.0, + "58": 2758.0, + "59": 2650.0, + "60": 2242.0, + "61": 2628.0, + "62": 2899.0, + "63": 2605.0, + "64": 2939.0, + "65": 2572.0, + "66": 2896.0, + "67": 2640.0, + "68": 2709.0, + "69": 2889.0, + "70": 3012.0, + "71": 2978.0, + "72": 2536.0, + "73": 2964.0, + "74": 2163.0, + "75": 2603.0, + "76": 2974.0, + "77": 3007.0, + "78": 3138.0, + "79": 3197.0, + "80": 2984.0, + "81": 3280.0, + "82": 3341.0, + "83": 2757.0, + "84": 3399.0, + "85": 3320.0, + "86": 2882.0, + "87": 3407.0, + "88": 3278.0, + "89": 3336.0, + "90": 3322.0, + "91": 2472.0, + "92": 3061.0, + "93": 2911.0, + "94": 3005.0, + "95": 2984.0, + "96": 2991.0, + "97": 3178.0, + "98": 3343.0, + "99": 2929.0, + "100": 2588.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 744880640.0, + "52": 744880640.0, 
+ "53": 744880640.0, + "54": 744880640.0, + "55": 744880640.0, + "56": 744880640.0, + "57": 744880640.0, + "58": 744880640.0, + "59": 744880640.0, + "60": 744880640.0, + "61": 744880640.0, + "62": 744880640.0, + "63": 744880640.0, + "64": 744880640.0, + "65": 744880640.0, + "66": 744880640.0, + "67": 744880640.0, + "68": 744880640.0, + "69": 744880640.0, + "70": 744880640.0, + "71": 744880640.0, + "72": 744880640.0, + "73": 744880640.0, + "74": 744880640.0, + "75": 744880640.0, + "76": 744880640.0, + "77": 744880640.0, + "78": 744880640.0, + "79": 744880640.0, + "80": 744880640.0, + "81": 744880640.0, + "82": 744880640.0, + "83": 744880640.0, + "84": 744880640.0, + "85": 744880640.0, + "86": 744880640.0, + "87": 744880640.0, + "88": 744880640.0, + "89": 744880640.0, + "90": 744880640.0, + "91": 744880640.0, + "92": 744880640.0, + "93": 744880640.0, + "94": 744880640.0, + "95": 744880640.0, + "96": 744880640.0, + "97": 744880640.0, + "98": 744880640.0, + "99": 744880640.0, + "100": 744880640.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 2222430208.0, + "52": 2222431232.0, + "53": 2222431232.0, + "54": 2222431232.0, + "55": 2222431232.0, + "56": 2222431232.0, + 
"57": 2222431232.0, + "58": 2222431232.0, + "59": 2222431232.0, + "60": 2222431232.0, + "61": 2222431232.0, + "62": 2222431232.0, + "63": 2222431232.0, + "64": 2222431232.0, + "65": 2222431232.0, + "66": 2222431232.0, + "67": 2222431232.0, + "68": 2222431232.0, + "69": 2222431232.0, + "70": 2222431232.0, + "71": 2222431232.0, + "72": 2222431232.0, + "73": 2222431232.0, + "74": 2222431232.0, + "75": 2222431232.0, + "76": 2222431232.0, + "77": 2222431232.0, + "78": 2222431232.0, + "79": 2222431232.0, + "80": 2222431232.0, + "81": 2222431232.0, + "82": 2222431232.0, + "83": 2222431232.0, + "84": 2222431232.0, + "85": 2222431232.0, + "86": 2222431232.0, + "87": 2222431232.0, + "88": 2222431232.0, + "89": 2222431232.0, + "90": 2222431232.0, + "91": 2222431232.0, + "92": 2222431232.0, + "93": 2222431232.0, + "94": 2222431232.0, + "95": 2222431232.0, + "96": 2222431232.0, + "97": 2222431232.0, + "98": 2222431232.0, + "99": 2222431232.0, + "100": 2222431232.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": "nan", + "52": 2.21538, + "53": 0.14615, + "54": 0.13599, + "55": 0.13518, + "56": 0.13401, + "57": 0.13944, + "58": 0.13509, + "59": 0.1377, + "60": 0.13698, + "61": 0.137, + 
"62": 0.13756, + "63": 0.14119, + "64": 0.13937, + "65": 0.13725, + "66": 0.13667, + "67": 0.13894, + "68": 0.13705, + "69": 0.1375, + "70": 0.13655, + "71": 0.13624, + "72": 0.13743, + "73": 0.13786, + "74": 0.13678, + "75": 0.13803, + "76": 0.13591, + "77": 0.13654, + "78": 0.13783, + "79": 0.13724, + "80": 0.13943, + "81": 0.13808, + "82": 0.13899, + "83": 0.13956, + "84": 0.14004, + "85": 0.14504, + "86": 0.14078, + "87": 0.14075, + "88": 0.14222, + "89": 0.14283, + "90": 0.14178, + "91": 0.14143, + "92": 0.14178, + "93": 0.14108, + "94": 0.14248, + "95": 0.14123, + "96": 0.14274, + "97": 0.14429, + "98": 0.14312, + "99": 0.14121, + "100": 0.14248 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_dev_dgx_gb200.json index 87eebe31670..34725e2965a 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_dev_dgx_gb200.json @@ -432,106 +432,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 8.41422, - "2": 0.18257, - "3": 0.66774, - "4": 0.24561, - "5": 0.26628, - "6": 0.28507, - "7": 0.15561, - "8": 0.31346, - "9": 0.1544, - "10": 0.23888, - "11": 0.24945, - "12": 0.15494, - "13": 0.20224, - "14": 0.15018, - "15": 0.21414, - "16": 0.15279, - "17": 0.15714, - "18": 0.16051, - "19": 0.23287, - "20": 0.17277, - "21": 0.18416, - "22": 0.18504, - "23": 0.22706, - "24": 0.17428, - "25": 0.15714, - "26": 0.24051, - "27": 0.16163, - "28": 0.15307, - "29": 0.15547, - "30": 0.15066, - "31": 0.18968, - "32": 0.20133, - "33": 0.15407, - "34": 0.15375, - "35": 0.22411, - "36": 0.1654, - 
"37": 0.23902, - "38": 0.15259, - "39": 0.15371, - "40": 0.15185, - "41": 0.21089, - "42": 0.15272, - "43": 0.21496, - "44": 0.15539, - "45": 0.15507, - "46": 0.1557, - "47": 0.15641, - "48": 0.15434, - "49": 0.15017, - "50": 0.23326, - "51": 0.17863, - "52": 0.15471, - "53": 0.1511, - "54": 0.1513, - "55": 0.14791, - "56": 0.23169, - "57": 0.15152, - "58": 0.27611, - "59": 0.15101, - "60": 0.15075, - "61": 0.15095, - "62": 0.15099, - "63": 0.40681, - "64": 0.15196, - "65": 0.4085, - "66": 0.15392, - "67": 0.15079, - "68": 0.18374, - "69": 0.16595, - "70": 0.17343, - "71": 0.2083, - "72": 0.23324, - "73": 0.17579, - "74": 0.2442, - "75": 0.15263, - "76": 0.15001, - "77": 0.14836, - "78": 0.22649, - "79": 0.15368, - "80": 0.15125, - "81": 0.15382, - "82": 0.15532, - "83": 0.1536, - "84": 0.15494, - "85": 0.1516, - "86": 0.2253, - "87": 0.1656, - "88": 0.16481, - "89": 0.16686, - "90": 0.19956, - "91": 0.15647, - "92": 0.15231, - "93": 0.15013, - "94": 0.22716, - "95": 0.15151, - "96": 0.15158, - "97": 0.21549, - "98": 0.15054, - "99": 0.16863, - "100": 0.15247 + "1": "nan", + "2": 2.33774, + "3": 0.14705, + "4": 0.13192, + "5": 0.13017, + "6": 0.1292, + "7": 0.13024, + "8": 0.13032, + "9": 0.12858, + "10": 0.12948, + "11": 0.12888, + "12": 0.12888, + "13": 0.12811, + "14": 0.12943, + "15": 0.12948, + "16": 0.1295, + "17": 0.13022, + "18": 0.12847, + "19": 0.12992, + "20": 0.1308, + "21": 0.12844, + "22": 0.13063, + "23": 0.13033, + "24": 0.13003, + "25": 0.12935, + "26": 0.13016, + "27": 0.12989, + "28": 0.12947, + "29": 0.12857, + "30": 0.12949, + "31": 0.12997, + "32": 0.12843, + "33": 0.1291, + "34": 0.12894, + "35": 0.13061, + "36": 0.12974, + "37": 0.12939, + "38": 0.13039, + "39": 0.13034, + "40": 0.13069, + "41": 0.13259, + "42": 0.13109, + "43": 0.13211, + "44": 0.1299, + "45": 0.1295, + "46": 0.13001, + "47": 0.13037, + "48": 0.13043, + "49": 0.13012, + "50": 0.12915, + "51": 0.14665, + "52": 0.12869, + "53": 0.12717, + "54": 0.12709, + "55": 0.12611, + 
"56": 0.12645, + "57": 0.12711, + "58": 0.12728, + "59": 0.1269, + "60": 0.12701, + "61": 0.1281, + "62": 0.12781, + "63": 0.12842, + "64": 0.12745, + "65": 0.12897, + "66": 0.12786, + "67": 0.12983, + "68": 0.13068, + "69": 0.1284, + "70": 0.12896, + "71": 0.1288, + "72": 0.13026, + "73": 0.13011, + "74": 0.12891, + "75": 0.12798, + "76": 0.12866, + "77": 0.12994, + "78": 0.12957, + "79": 0.12765, + "80": 0.12884, + "81": 0.12898, + "82": 0.12927, + "83": 0.12848, + "84": 0.12845, + "85": 0.12849, + "86": 0.12983, + "87": 0.1303, + "88": 0.12961, + "89": 0.13093, + "90": 0.12951, + "91": 0.12818, + "92": 0.12902, + "93": 0.12967, + "94": 0.13419, + "95": 0.14029, + "96": 0.1429, + "97": 0.14018, + "98": 0.13632, + "99": 0.14128, + "100": 0.14034 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_dev_dgx_gb200_2nd.json new file mode 100644 index 00000000000..dd354f801de --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_dev_dgx_gb200_2nd.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + 
"40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 9.90259, + "52": 9.79281, + "53": 10.11536, + "54": 9.99216, + "55": 9.91665, + "56": 9.66015, + "57": 9.52038, + "58": 9.87094, + "59": 9.6209, + "60": 9.54952, + "61": 9.70012, + "62": 10.00629, + "63": 9.42168, + "64": 9.79893, + "65": 8.97548, + "66": 9.73165, + "67": 9.38933, + "68": 9.80066, + "69": 9.81152, + "70": 9.76761, + "71": 9.63356, + "72": 9.59892, + "73": 9.51708, + "74": 8.96512, + "75": 9.43589, + "76": 9.11207, + "77": 10.06881, + "78": 9.72515, + "79": 9.39985, + "80": 9.41154, + "81": 9.50094, + "82": 9.69861, + "83": 9.33578, + "84": 9.4341, + "85": 9.63907, + "86": 9.06166, + "87": 9.60563, + "88": 9.77626, + "89": 9.6243, + "90": 9.82766, + "91": 9.35869, + "92": 9.38066, + "93": 9.09681, + "94": 8.83995, + "95": 9.52751, + "96": 9.53562, + "97": 9.32689, + "98": 9.69354, + "99": 8.88933, + "100": 9.42104 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 22791436.0, + "52": 22748292.0, + "53": 22924772.0, + "54": 22840284.0, + "55": 22517880.0, + "56": 22877730.0, + "57": 23113080.0, + 
"58": 22845568.0, + "59": 22716022.0, + "60": 22743056.0, + "61": 22724434.0, + "62": 22672316.0, + "63": 22846416.0, + "64": 22823178.0, + "65": 23061654.0, + "66": 22729712.0, + "67": 22908434.0, + "68": 22610444.0, + "69": 22584604.0, + "70": 22828526.0, + "71": 22748442.0, + "72": 22655052.0, + "73": 22740588.0, + "74": 23048316.0, + "75": 23054664.0, + "76": 22901072.0, + "77": 22272198.0, + "78": 22789244.0, + "79": 22743700.0, + "80": 22706576.0, + "81": 22890704.0, + "82": 22778282.0, + "83": 22840256.0, + "84": 23010368.0, + "85": 22711796.0, + "86": 23103236.0, + "87": 22735120.0, + "88": 22636998.0, + "89": 22498612.0, + "90": 22972652.0, + "91": 22767776.0, + "92": 22809424.0, + "93": 22658980.0, + "94": 22911920.0, + "95": 23047890.0, + "96": 22828804.0, + "97": 22608196.0, + "98": 22762820.0, + "99": 22906714.0, + "100": 23016048.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 746444288.0, + "52": 746444288.0, + "53": 746444288.0, + "54": 746444288.0, + "55": 746444288.0, + "56": 746444288.0, + "57": 746444288.0, + "58": 746444288.0, + "59": 746444288.0, + "60": 746444288.0, + "61": 746444288.0, + "62": 746444288.0, + "63": 746444288.0, + "64": 
746444288.0, + "65": 746444288.0, + "66": 746444288.0, + "67": 746444288.0, + "68": 746444288.0, + "69": 746444288.0, + "70": 746444288.0, + "71": 746444288.0, + "72": 746444288.0, + "73": 746444288.0, + "74": 746444288.0, + "75": 746444288.0, + "76": 746444288.0, + "77": 746444288.0, + "78": 746444288.0, + "79": 746444288.0, + "80": 746444288.0, + "81": 746444288.0, + "82": 746444288.0, + "83": 746444288.0, + "84": 746444288.0, + "85": 746444288.0, + "86": 746444288.0, + "87": 746444288.0, + "88": 746444288.0, + "89": 746444288.0, + "90": 746444288.0, + "91": 746444288.0, + "92": 746444288.0, + "93": 746444288.0, + "94": 746444288.0, + "95": 746444288.0, + "96": 746444288.0, + "97": 746444288.0, + "98": 746444288.0, + "99": 746444288.0, + "100": 746444288.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 2223731712.0, + "52": 2223732736.0, + "53": 2223732736.0, + "54": 2223732736.0, + "55": 2223732736.0, + "56": 2223732736.0, + "57": 2223732736.0, + "58": 2223732736.0, + "59": 2223732736.0, + "60": 2223732736.0, + "61": 2223732736.0, + "62": 2223732736.0, + "63": 2223732736.0, + "64": 2223732736.0, + "65": 2223732736.0, + "66": 2223732736.0, + "67": 2223732736.0, + 
"68": 2223732736.0, + "69": 2223732736.0, + "70": 2223732736.0, + "71": 2223732736.0, + "72": 2223732736.0, + "73": 2223732736.0, + "74": 2223732736.0, + "75": 2223732736.0, + "76": 2223732736.0, + "77": 2223732736.0, + "78": 2223732736.0, + "79": 2223732736.0, + "80": 2223732736.0, + "81": 2223732736.0, + "82": 2223732736.0, + "83": 2223732736.0, + "84": 2223732736.0, + "85": 2223732736.0, + "86": 2223732736.0, + "87": 2223732736.0, + "88": 2223732736.0, + "89": 2223732736.0, + "90": 2223732736.0, + "91": 2223732736.0, + "92": 2223732736.0, + "93": 2223732736.0, + "94": 2223732736.0, + "95": 2223732736.0, + "96": 2223732736.0, + "97": 2223732736.0, + "98": 2223732736.0, + "99": 2223732736.0, + "100": 2223732736.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": "nan", + "52": 2.28424, + "53": 0.15724, + "54": 0.14436, + "55": 0.14133, + "56": 0.14939, + "57": 0.15152, + "58": 0.16555, + "59": 0.19478, + "60": 0.13288, + "61": 0.13086, + "62": 0.13088, + "63": 0.13074, + "64": 0.1303, + "65": 0.13189, + "66": 0.13138, + "67": 0.12968, + "68": 0.13118, + "69": 0.13064, + "70": 0.12931, + "71": 0.12915, + "72": 0.12915, + "73": 0.13375, + "74": 0.13641, + "75": 0.13586, + "76": 
0.13551, + "77": 0.13604, + "78": 0.13931, + "79": 0.13798, + "80": 0.13724, + "81": 0.13702, + "82": 0.13663, + "83": 0.1357, + "84": 0.13618, + "85": 0.13577, + "86": 0.13569, + "87": 0.13635, + "88": 0.13659, + "89": 0.13724, + "90": 0.13599, + "91": 0.13637, + "92": 0.13565, + "93": 0.13693, + "94": 0.13576, + "95": 0.13566, + "96": 0.13579, + "97": 0.13592, + "98": 0.13631, + "99": 0.13476, + "100": 0.13606 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1/golden_values_dev_dgx_gb200.json index 06040458828..2610b7fe2f4 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1/golden_values_dev_dgx_gb200.json @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 9.05034, - "2": 0.14876, - "3": 0.14285, - "4": 0.13033, - "5": 0.24651, - "6": 0.19893, - "7": 0.15924, - "8": 0.11963, - "9": 0.12767, - "10": 0.24283, - "11": 0.12856, - "12": 0.13101, - "13": 0.5056, - "14": 0.1222, - "15": 0.23869, - "16": 0.13294, - "17": 0.13193, - "18": 0.14163, - "19": 0.13647, - "20": 0.2257, - "21": 0.13437, - "22": 0.24393, - "23": 0.13446, - "24": 0.23274, - "25": 0.14725, - "26": 0.13804, - "27": 0.14255, - "28": 0.14086, - "29": 0.23437, - "30": 0.25225, - "31": 0.13433, - "32": 0.25099, - "33": 0.14422, - "34": 0.20638, - "35": 0.13575, - "36": 0.13592, - "37": 0.14521, - "38": 0.9985, - "39": 0.14828, - "40": 0.13964, - "41": 0.13609, - "42": 0.33948, - "43": 0.13414, - "44": 0.27111, - "45": 0.14576, - "46": 0.13882, - "47": 0.13432, - "48": 0.14571, - "49": 0.14535, - "50": 0.4444 + "1": "nan", + "2": 2.94258, + "3": 0.12978, + "4": 0.11688, + "5": 0.11937, + "6": 0.12093, + "7": 0.12307, + "8": 0.13062, + "9": 0.12926, + "10": 0.1228, + "11": 0.12859, + "12": 
0.12404, + "13": 0.12912, + "14": 0.12318, + "15": 0.12609, + "16": 0.13327, + "17": 0.12859, + "18": 0.12957, + "19": 0.12658, + "20": 0.12929, + "21": 0.12937, + "22": 0.1298, + "23": 0.12888, + "24": 0.12917, + "25": 0.1285, + "26": 0.12864, + "27": 0.13061, + "28": 0.1272, + "29": 0.12953, + "30": 0.12693, + "31": 0.13141, + "32": 0.12786, + "33": 0.12815, + "34": 0.12937, + "35": 0.12957, + "36": 0.12737, + "37": 0.1313, + "38": 0.12977, + "39": 0.12805, + "40": 0.1298, + "41": 0.1296, + "42": 0.13074, + "43": 0.12955, + "44": 0.13171, + "45": 0.13055, + "46": 0.13271, + "47": 0.13004, + "48": 0.12873, + "49": 0.13129, + "50": 0.12858 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_dev_dgx_gb200.json index 110646cd819..10988c85257 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_dev_dgx_gb200.json @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 554160640.0, - "2": 555209216.0, - "3": 554160640.0, - "4": 554160640.0, - "5": 554160640.0, - "6": 554160640.0, - "7": 554160640.0, - "8": 554160640.0, - "9": 555209216.0, - "10": 554160640.0, - "11": 554160640.0, - "12": 554160640.0, - "13": 554160640.0, - "14": 554160640.0, - "15": 554160640.0, - "16": 554160640.0, - "17": 554160640.0, - "18": 554160640.0, - "19": 554160640.0, - "20": 554160640.0, - "21": 554160640.0, - "22": 554160640.0, - "23": 554160640.0, - "24": 554160640.0, - "25": 554160640.0, - "26": 
554160640.0, - "27": 554160640.0, - "28": 554160640.0, - "29": 554160640.0, - "30": 554160640.0, - "31": 554160640.0, - "32": 554160640.0, - "33": 554160640.0, - "34": 554160640.0, - "35": 554160640.0, - "36": 554160640.0, - "37": 554160640.0, - "38": 554160640.0, - "39": 554160640.0, - "40": 554160640.0, - "41": 554160640.0, - "42": 555209216.0, - "43": 554160640.0, - "44": 554160640.0, - "45": 554160640.0, - "46": 554160640.0, - "47": 554160640.0, - "48": 554160640.0, - "49": 554160640.0, - "50": 554160640.0 + "1": 552193536.0, + "2": 552193536.0, + "3": 553242112.0, + "4": 553242112.0, + "5": 552193536.0, + "6": 553242112.0, + "7": 553242112.0, + "8": 553242112.0, + "9": 553242112.0, + "10": 553242112.0, + "11": 553242112.0, + "12": 552193536.0, + "13": 552193536.0, + "14": 552193536.0, + "15": 552193536.0, + "16": 553242112.0, + "17": 553242112.0, + "18": 552193536.0, + "19": 553242112.0, + "20": 553242112.0, + "21": 553242112.0, + "22": 552193536.0, + "23": 553242112.0, + "24": 553242112.0, + "25": 553242112.0, + "26": 553242112.0, + "27": 553242112.0, + "28": 553242112.0, + "29": 553242112.0, + "30": 553242112.0, + "31": 552193536.0, + "32": 552193536.0, + "33": 553242112.0, + "34": 553242112.0, + "35": 552193536.0, + "36": 553242112.0, + "37": 552193536.0, + "38": 552193536.0, + "39": 552193536.0, + "40": 552193536.0, + "41": 552193536.0, + "42": 552193536.0, + "43": 552193536.0, + "44": 552193536.0, + "45": 552193536.0, + "46": 552193536.0, + "47": 552193536.0, + "48": 552193536.0, + "49": 552193536.0, + "50": 553242112.0 } }, "mem-max-allocated-bytes": { @@ -176,55 +176,55 @@ "step_interval": 1, "values": { "1": 3798208000.0, - "2": 3944053248.0, - "3": 3944053248.0, - "4": 3944053248.0, - "5": 3944053248.0, - "6": 3944053248.0, - "7": 3944053248.0, - "8": 3944053248.0, - "9": 3944053248.0, - "10": 3944053248.0, - "11": 3944053248.0, - "12": 3944053248.0, - "13": 3944053248.0, - "14": 3944053248.0, - "15": 3944053248.0, - "16": 3944053248.0, - "17": 
3944053248.0, - "18": 3944053248.0, - "19": 3944053248.0, - "20": 3944053248.0, - "21": 3944053248.0, - "22": 3944053248.0, - "23": 3944053248.0, - "24": 3944053248.0, - "25": 3944053248.0, - "26": 3944053248.0, - "27": 3944053248.0, - "28": 3944053248.0, - "29": 3944053248.0, - "30": 3944053248.0, - "31": 3944053248.0, - "32": 3944053248.0, - "33": 3944053248.0, - "34": 3944053248.0, - "35": 3944053248.0, - "36": 3944053248.0, - "37": 3944053248.0, - "38": 3944053248.0, - "39": 3944053248.0, - "40": 3944053248.0, - "41": 3944053248.0, - "42": 3944053248.0, - "43": 3944053248.0, - "44": 3944053248.0, - "45": 3944053248.0, - "46": 3944053248.0, - "47": 3944053248.0, - "48": 3944053248.0, - "49": 3944053248.0, - "50": 3944053248.0 + "2": 3942086144.0, + "3": 3942086144.0, + "4": 3942086144.0, + "5": 3942086144.0, + "6": 3942086144.0, + "7": 3942086144.0, + "8": 3942086144.0, + "9": 3942086144.0, + "10": 3942086144.0, + "11": 3942086144.0, + "12": 3942086144.0, + "13": 3942086144.0, + "14": 3942086144.0, + "15": 3942086144.0, + "16": 3942086144.0, + "17": 3942086144.0, + "18": 3942086144.0, + "19": 3942086144.0, + "20": 3942086144.0, + "21": 3942086144.0, + "22": 3942086144.0, + "23": 3942086144.0, + "24": 3942086144.0, + "25": 3942086144.0, + "26": 3942086144.0, + "27": 3942086144.0, + "28": 3942086144.0, + "29": 3942086144.0, + "30": 3942086144.0, + "31": 3942086144.0, + "32": 3942086144.0, + "33": 3942086144.0, + "34": 3942086144.0, + "35": 3942086144.0, + "36": 3942086144.0, + "37": 3942086144.0, + "38": 3942086144.0, + "39": 3942086144.0, + "40": 3942086144.0, + "41": 3942086144.0, + "42": 3942086144.0, + "43": 3942086144.0, + "44": 3942086144.0, + "45": 3942086144.0, + "46": 3942086144.0, + "47": 3942086144.0, + "48": 3942086144.0, + "49": 3942086144.0, + "50": 3942086144.0 } }, "iteration-time": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 7.76857, - "2": 0.14325, - "3": 0.13072, - "4": 0.11885, - "5": 0.11896, - "6": 0.1181, - 
"7": 0.11917, - "8": 0.11807, - "9": 0.11852, - "10": 0.11869, - "11": 0.21274, - "12": 0.11744, - "13": 0.11909, - "14": 0.12072, - "15": 0.11937, - "16": 0.11875, - "17": 0.11813, - "18": 0.117, - "19": 0.11808, - "20": 0.1185, - "21": 0.21315, - "22": 0.11941, - "23": 0.11829, - "24": 0.12018, - "25": 0.11873, - "26": 0.12277, - "27": 0.11624, - "28": 0.11801, - "29": 0.11768, - "30": 0.11811, - "31": 0.21259, - "32": 0.11823, - "33": 0.11857, - "34": 0.11893, - "35": 0.12121, - "36": 0.11984, - "37": 0.12002, - "38": 0.11889, - "39": 0.12151, - "40": 0.11884, - "41": 0.21346, - "42": 0.11706, - "43": 0.12099, - "44": 0.1203, - "45": 0.11997, - "46": 0.12288, - "47": 0.12077, - "48": 0.11925, - "49": 0.11743, - "50": 0.11695 + "1": "nan", + "2": 3.84171, + "3": 0.13294, + "4": 0.11994, + "5": 0.11682, + "6": 0.11799, + "7": 0.12021, + "8": 0.11949, + "9": 0.1195, + "10": 0.12086, + "11": 0.21563, + "12": 0.12013, + "13": 0.1204, + "14": 0.1188, + "15": 0.1192, + "16": 0.11917, + "17": 0.11999, + "18": 0.12006, + "19": 0.11965, + "20": 0.12016, + "21": 0.21525, + "22": 0.11978, + "23": 0.12009, + "24": 0.12004, + "25": 0.12129, + "26": 0.12041, + "27": 0.12075, + "28": 0.12015, + "29": 0.1204, + "30": 0.12048, + "31": 0.21709, + "32": 0.12108, + "33": 0.11972, + "34": 0.12, + "35": 0.11969, + "36": 0.11944, + "37": 0.11946, + "38": 0.12056, + "39": 0.12045, + "40": 0.12052, + "41": 0.21777, + "42": 0.12063, + "43": 0.12165, + "44": 0.1204, + "45": 0.12036, + "46": 0.12154, + "47": 0.12043, + "48": 0.12145, + "49": 0.12079, + "50": 0.12035 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/golden_values_dev_dgx_gb200.json index 641a00e237a..a34edb3389a 100644 --- 
a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/golden_values_dev_dgx_gb200.json @@ -432,106 +432,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 9.86945, - "2": 0.13101, - "3": 0.70357, - "4": 0.14163, - "5": 0.12855, - "6": 0.38046, - "7": 0.126, - "8": 0.14086, - "9": 0.23777, - "10": 0.1402, - "11": 0.41122, - "12": 0.27395, - "13": 0.10789, - "14": 0.24924, - "15": 0.33411, - "16": 0.24471, - "17": 0.10663, - "18": 0.24551, - "19": 0.10581, - "20": 0.18449, - "21": 0.22744, - "22": 0.10616, - "23": 0.34738, - "24": 0.11037, - "25": 0.11093, - "26": 0.43374, - "27": 0.1067, - "28": 0.10671, - "29": 0.1061, - "30": 0.22031, - "31": 0.11271, - "32": 0.10683, - "33": 0.10556, - "34": 0.25465, - "35": 0.22935, - "36": 0.1072, - "37": 0.10789, - "38": 0.1067, - "39": 0.21523, - "40": 0.1053, - "41": 0.11778, - "42": 0.22642, - "43": 0.10673, - "44": 0.23278, - "45": 0.1046, - "46": 0.22439, - "47": 0.22232, - "48": 0.10912, - "49": 0.10674, - "50": 0.1055, - "51": 0.11049, - "52": 0.1948, - "53": 0.1045, - "54": 0.24019, - "55": 0.10505, - "56": 0.23176, - "57": 0.10745, - "58": 0.10668, - "59": 0.10741, - "60": 0.37464, - "61": 0.10467, - "62": 0.10857, - "63": 0.10767, - "64": 0.10998, - "65": 0.10888, - "66": 0.17063, - "67": 0.36721, - "68": 0.10834, - "69": 0.10693, - "70": 0.24024, - "71": 0.10802, - "72": 0.10696, - "73": 0.10736, - "74": 0.10874, - "75": 0.15339, - "76": 0.18985, - "77": 0.32078, - "78": 0.1062, - "79": 0.29068, - "80": 0.10837, - "81": 0.17251, - "82": 0.10428, - "83": 0.21093, - "84": 0.13349, - "85": 0.23049, - "86": 0.10991, - "87": 0.10573, - "88": 0.10661, - "89": 0.10792, - "90": 0.22654, - "91": 0.31392, - "92": 0.10844, - "93": 0.24022, - "94": 0.111, - "95": 0.10539, - "96": 0.109, - "97": 0.11025, - "98": 0.11065, - "99": 0.44653, - "100": 
0.10883 + "1": "nan", + "2": 2.73603, + "3": 0.12344, + "4": 0.10783, + "5": 0.10595, + "6": 0.10649, + "7": 0.10691, + "8": 0.10679, + "9": 0.10607, + "10": 0.10675, + "11": 0.10687, + "12": 0.10636, + "13": 0.10663, + "14": 0.10668, + "15": 0.10696, + "16": 0.10672, + "17": 0.10678, + "18": 0.10603, + "19": 0.10659, + "20": 0.10684, + "21": 0.10766, + "22": 0.10849, + "23": 0.10853, + "24": 0.10805, + "25": 0.10776, + "26": 0.1069, + "27": 0.10818, + "28": 0.10669, + "29": 0.10643, + "30": 0.10634, + "31": 0.10766, + "32": 0.1076, + "33": 0.10583, + "34": 0.10631, + "35": 0.10587, + "36": 0.1054, + "37": 0.10589, + "38": 0.10633, + "39": 0.10593, + "40": 0.10674, + "41": 0.10812, + "42": 0.11127, + "43": 0.11494, + "44": 0.11409, + "45": 0.11538, + "46": 0.11702, + "47": 0.1155, + "48": 0.11481, + "49": 0.11507, + "50": 0.11401, + "51": 0.11655, + "52": 0.11513, + "53": 0.11379, + "54": 0.11378, + "55": 0.11658, + "56": 0.11792, + "57": 0.11792, + "58": 0.11715, + "59": 0.11915, + "60": 0.11642, + "61": 0.11578, + "62": 0.1171, + "63": 0.11758, + "64": 0.11517, + "65": 0.11624, + "66": 0.11434, + "67": 0.11609, + "68": 0.11506, + "69": 0.11568, + "70": 0.11661, + "71": 0.11647, + "72": 0.1166, + "73": 0.11795, + "74": 0.11661, + "75": 0.11785, + "76": 0.11659, + "77": 0.11531, + "78": 0.11705, + "79": 0.11662, + "80": 0.11765, + "81": 0.11829, + "82": 0.11742, + "83": 0.11529, + "84": 0.11678, + "85": 0.11581, + "86": 0.11703, + "87": 0.11699, + "88": 0.11641, + "89": 0.11638, + "90": 0.11586, + "91": 0.11853, + "92": 0.11725, + "93": 0.1178, + "94": 0.11647, + "95": 0.11672, + "96": 0.11702, + "97": 0.11754, + "98": 0.11614, + "99": 0.11757, + "100": 0.11708 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_dev_dgx_gb200.json 
b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_dev_dgx_gb200.json index 56bb24659d2..0758fd3a8cf 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_dev_dgx_gb200.json @@ -432,106 +432,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 9.98291, - "2": 0.12743, - "3": 0.38059, - "4": 0.12841, - "5": 0.14511, - "6": 0.10173, - "7": 0.26971, - "8": 0.10382, - "9": 0.3953, - "10": 0.1058, - "11": 0.10231, - "12": 0.509, - "13": 0.10327, - "14": 0.23202, - "15": 0.12684, - "16": 0.10412, - "17": 0.2441, - "18": 0.10687, - "19": 0.25963, - "20": 0.10498, - "21": 0.25469, - "22": 0.10453, - "23": 0.10222, - "24": 0.25281, - "25": 0.1172, - "26": 0.10348, - "27": 0.29437, - "28": 0.10187, - "29": 0.28228, - "30": 0.1021, - "31": 0.23112, - "32": 0.10338, - "33": 0.24896, - "34": 0.10339, - "35": 0.24587, - "36": 0.22187, - "37": 0.10494, - "38": 0.10356, - "39": 0.10387, - "40": 0.1047, - "41": 0.10726, - "42": 0.10304, - "43": 0.22521, - "44": 0.12908, - "45": 0.21396, - "46": 0.32037, - "47": 0.10321, - "48": 0.10612, - "49": 0.46303, - "50": 0.10477, - "51": 0.11648, - "52": 0.10312, - "53": 0.10274, - "54": 0.10625, - "55": 0.10219, - "56": 0.24603, - "57": 0.10299, - "58": 0.10437, - "59": 0.10386, - "60": 0.10294, - "61": 0.26442, - "62": 0.10245, - "63": 0.17569, - "64": 0.10337, - "65": 0.23811, - "66": 0.10233, - "67": 0.23691, - "68": 0.21983, - "69": 0.19586, - "70": 0.10467, - "71": 0.10454, - "72": 0.1059, - "73": 0.10652, - "74": 0.14966, - "75": 0.10278, - "76": 0.39764, - "77": 0.10176, - "78": 0.23756, - "79": 0.10342, - "80": 0.24469, - "81": 0.10295, - "82": 0.26649, - "83": 0.105, - "84": 0.47883, - "85": 0.10596, - "86": 0.10525, - "87": 
0.22714, - "88": 0.10536, - "89": 0.10595, - "90": 0.22588, - "91": 0.10237, - "92": 0.2621, - "93": 0.10543, - "94": 0.21938, - "95": 0.10276, - "96": 0.17373, - "97": 0.10501, - "98": 0.22197, - "99": 0.10635, - "100": 0.1032 + "1": "nan", + "2": 3.0922, + "3": 0.12788, + "4": 0.11451, + "5": 0.11407, + "6": 0.11304, + "7": 0.11565, + "8": 0.11482, + "9": 0.11841, + "10": 0.11916, + "11": 0.11884, + "12": 0.11911, + "13": 0.1155, + "14": 0.12253, + "15": 0.11369, + "16": 0.11887, + "17": 0.11433, + "18": 0.12243, + "19": 0.11544, + "20": 0.11344, + "21": 0.1254, + "22": 0.11712, + "23": 0.12494, + "24": 0.12239, + "25": 0.12344, + "26": 0.11952, + "27": 0.12117, + "28": 0.11916, + "29": 0.11974, + "30": 0.11517, + "31": 0.1219, + "32": 0.12112, + "33": 0.11997, + "34": 0.1133, + "35": 0.12245, + "36": 0.12118, + "37": 0.11239, + "38": 0.12174, + "39": 0.11964, + "40": 0.11993, + "41": 0.12013, + "42": 0.12614, + "43": 0.11697, + "44": 0.11669, + "45": 0.11781, + "46": 0.11776, + "47": 0.11182, + "48": 0.1196, + "49": 0.11814, + "50": 0.11736, + "51": 0.12093, + "52": 0.1107, + "53": 0.11502, + "54": 0.11571, + "55": 0.11493, + "56": 0.11712, + "57": 0.11663, + "58": 0.11203, + "59": 0.11604, + "60": 0.11649, + "61": 0.11616, + "62": 0.11641, + "63": 0.11603, + "64": 0.11613, + "65": 0.11708, + "66": 0.11292, + "67": 0.11356, + "68": 0.11416, + "69": 0.11305, + "70": 0.11582, + "71": 0.11552, + "72": 0.11318, + "73": 0.11798, + "74": 0.11632, + "75": 0.11624, + "76": 0.11602, + "77": 0.11547, + "78": 0.11457, + "79": 0.11402, + "80": 0.11415, + "81": 0.11627, + "82": 0.11295, + "83": 0.11397, + "84": 0.11221, + "85": 0.11326, + "86": 0.11792, + "87": 0.11391, + "88": 0.11365, + "89": 0.11478, + "90": 0.11346, + "91": 0.11213, + "92": 0.11712, + "93": 0.11574, + "94": 0.11724, + "95": 0.11254, + "96": 0.11871, + "97": 0.11957, + "98": 0.11759, + "99": 0.11864, + "100": 0.11833 } } } \ No newline at end of file diff --git 
a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_dev_dgx_gb200_2nd.json new file mode 100644 index 00000000000..f7efd011023 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_dev_dgx_gb200_2nd.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 9.8426, + "52": 9.72578, + "53": 10.05977, + "54": 9.95226, + "55": 9.88321, + "56": 9.61276, + "57": 9.46222, + "58": 9.82313, + "59": 9.57665, + "60": 9.48518, + "61": 9.6788, + "62": 9.97777, + "63": 9.36212, + "64": 9.75714, + "65": 8.93499, + "66": 9.69281, + "67": 9.36709, + "68": 9.78179, + "69": 9.79451, + "70": 9.72295, + "71": 9.62027, + "72": 9.56974, + "73": 9.481, + "74": 8.91241, + "75": 9.40906, + "76": 9.06623, + "77": 10.05808, + "78": 9.72188, + "79": 9.36927, + "80": 9.40027, + "81": 9.47702, + "82": 9.69788, + "83": 9.30742, + "84": 9.41496, + "85": 9.61115, + "86": 9.07104, + "87": 9.59609, + "88": 
9.74908, + "89": 9.5961, + "90": 9.82722, + "91": 9.3366, + "92": 9.3558, + "93": 9.08695, + "94": 8.82752, + "95": 9.53066, + "96": 9.52759, + "97": 9.30671, + "98": 9.66909, + "99": 8.89637, + "100": 9.4052 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 2587.0, + "52": 2574.0, + "53": 2831.0, + "54": 2602.0, + "55": 2403.0, + "56": 2822.0, + "57": 2223.0, + "58": 2954.0, + "59": 2871.0, + "60": 2518.0, + "61": 2922.0, + "62": 2677.0, + "63": 2533.0, + "64": 3023.0, + "65": 2609.0, + "66": 2960.0, + "67": 2867.0, + "68": 2652.0, + "69": 3053.0, + "70": 3011.0, + "71": 2870.0, + "72": 2460.0, + "73": 3114.0, + "74": 2017.0, + "75": 2527.0, + "76": 2954.0, + "77": 2955.0, + "78": 3055.0, + "79": 3098.0, + "80": 3047.0, + "81": 3362.0, + "82": 3296.0, + "83": 2825.0, + "84": 3113.0, + "85": 3196.0, + "86": 2666.0, + "87": 3583.0, + "88": 2985.0, + "89": 3259.0, + "90": 3220.0, + "91": 2781.0, + "92": 3090.0, + "93": 2686.0, + "94": 3474.0, + "95": 3147.0, + "96": 3418.0, + "97": 3036.0, + "98": 3411.0, + "99": 3152.0, + "100": 3098.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": 
"nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 763221504.0, + "52": 763221504.0, + "53": 763221504.0, + "54": 763221504.0, + "55": 763221504.0, + "56": 763221504.0, + "57": 763221504.0, + "58": 763221504.0, + "59": 763221504.0, + "60": 763221504.0, + "61": 763221504.0, + "62": 763221504.0, + "63": 763221504.0, + "64": 763221504.0, + "65": 763221504.0, + "66": 763221504.0, + "67": 763221504.0, + "68": 763221504.0, + "69": 763221504.0, + "70": 763221504.0, + "71": 763221504.0, + "72": 763221504.0, + "73": 763221504.0, + "74": 763221504.0, + "75": 763221504.0, + "76": 763221504.0, + "77": 763221504.0, + "78": 763221504.0, + "79": 763221504.0, + "80": 763221504.0, + "81": 763221504.0, + "82": 763221504.0, + "83": 763221504.0, + "84": 763221504.0, + "85": 763221504.0, + "86": 763221504.0, + "87": 763221504.0, + "88": 763221504.0, + "89": 763221504.0, + "90": 763221504.0, + "91": 763221504.0, + "92": 763221504.0, + "93": 763221504.0, + "94": 763221504.0, + "95": 763221504.0, + "96": 763221504.0, + "97": 763221504.0, + "98": 763221504.0, + "99": 763221504.0, + "100": 763221504.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + 
"10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 2682096640.0, + "52": 2682097664.0, + "53": 2682097664.0, + "54": 2682097664.0, + "55": 2682097664.0, + "56": 2682097664.0, + "57": 2682097664.0, + "58": 2682097664.0, + "59": 2682097664.0, + "60": 2682097664.0, + "61": 2682097664.0, + "62": 2682097664.0, + "63": 2682097664.0, + "64": 2682097664.0, + "65": 2682097664.0, + "66": 2682097664.0, + "67": 2682097664.0, + "68": 2682097664.0, + "69": 2682097664.0, + "70": 2682097664.0, + "71": 2682097664.0, + "72": 2682097664.0, + "73": 2682097664.0, + "74": 2682097664.0, + "75": 2682097664.0, + "76": 2682097664.0, + "77": 2682097664.0, + "78": 2682097664.0, + "79": 2682097664.0, + "80": 2682097664.0, + "81": 2682097664.0, + "82": 2682097664.0, + "83": 2682097664.0, + "84": 2682097664.0, + "85": 2682097664.0, + "86": 2682097664.0, + "87": 2682097664.0, + "88": 2682097664.0, + "89": 2682097664.0, + "90": 2682097664.0, + "91": 2682097664.0, + "92": 2682097664.0, + "93": 2682097664.0, + "94": 2682097664.0, + "95": 2682097664.0, + "96": 2682097664.0, + "97": 2682097664.0, + "98": 2682097664.0, + "99": 2682097664.0, + "100": 2682097664.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": 
"nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": "nan", + "52": 3.06236, + "53": 0.13884, + "54": 0.12077, + "55": 0.12587, + "56": 0.12115, + "57": 0.12166, + "58": 0.12353, + "59": 0.1247, + "60": 0.12221, + "61": 0.12159, + "62": 0.12136, + "63": 0.13043, + "64": 0.12973, + "65": 0.13067, + "66": 0.14918, + "67": 0.11954, + "68": 0.11631, + "69": 0.11511, + "70": 0.11621, + "71": 0.11553, + "72": 0.11537, + "73": 0.11691, + "74": 0.11875, + "75": 0.11769, + "76": 0.11586, + "77": 0.11847, + "78": 0.11896, + "79": 0.11697, + "80": 0.11854, + "81": 0.11758, + "82": 0.11531, + "83": 0.11776, + "84": 0.11613, + "85": 0.11822, + "86": 0.11858, + "87": 0.11763, + "88": 0.11691, + "89": 0.11931, + "90": 0.11678, + "91": 0.11601, + "92": 0.11377, + "93": 0.11692, + "94": 0.11741, + "95": 0.11634, + "96": 0.1145, + "97": 0.12011, + "98": 0.11722, + "99": 0.11609, + "100": 0.11641 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_gb200.json index 3d38faf23fc..28843c12217 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_gb200.json +++ 
b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_gb200.json @@ -262,10 +262,10 @@ "42": 552325632.0, "43": 552325632.0, "44": 552325632.0, - "45": 552325632.0, + "45": 553374208.0, "46": 552325632.0, "47": 552325632.0, - "48": 552325632.0, + "48": 553374208.0, "49": 552325632.0, "50": 552325632.0, "51": 552325632.0, @@ -432,106 +432,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 10.20216, - "2": 0.13277, - "3": 0.2975, - "4": 0.10754, - "5": 0.10418, - "6": 0.10612, - "7": 0.24272, - "8": 0.11347, - "9": 0.14845, - "10": 0.10733, - "11": 0.10387, - "12": 0.47615, - "13": 0.26992, - "14": 0.10483, - "15": 0.1204, - "16": 0.10696, - "17": 0.10552, - "18": 0.10521, - "19": 0.24005, - "20": 0.17139, - "21": 0.13276, - "22": 0.21348, - "23": 0.10526, - "24": 0.23652, - "25": 0.10695, - "26": 0.105, - "27": 0.1046, - "28": 0.108, - "29": 0.22645, - "30": 0.10764, - "31": 0.37801, - "32": 0.10822, - "33": 0.26043, - "34": 0.10725, - "35": 0.10759, - "36": 0.10627, - "37": 0.10521, - "38": 0.23173, - "39": 0.23132, - "40": 0.10561, - "41": 0.10865, - "42": 0.10488, - "43": 0.10774, - "44": 0.10716, - "45": 0.2275, - "46": 0.10501, - "47": 0.26542, - "48": 0.10561, - "49": 0.10565, - "50": 0.21987, - "51": 0.12154, - "52": 0.10569, - "53": 0.10443, - "54": 0.1047, - "55": 0.10628, - "56": 0.106, - "57": 0.21826, - "58": 0.29942, - "59": 0.10627, - "60": 0.10754, - "61": 0.10422, - "62": 0.10591, - "63": 0.22208, - "64": 0.10704, - "65": 0.10754, - "66": 0.11693, - "67": 0.10619, - "68": 0.10599, - "69": 0.1064, - "70": 0.10712, - "71": 0.20506, - "72": 0.12154, - "73": 0.10701, - "74": 0.10797, - "75": 0.10599, - "76": 0.11118, - "77": 0.22203, - "78": 0.11082, - "79": 0.10971, - "80": 0.10673, - "81": 0.23373, - "82": 0.25241, - "83": 0.10924, - "84": 0.23617, - "85": 0.10907, - "86": 0.10895, - "87": 0.21649, - "88": 0.1977, - "89": 0.1081, - "90": 0.10767, - "91": 
0.2306, - "92": 0.1072, - "93": 0.11204, - "94": 0.22079, - "95": 0.10723, - "96": 0.10789, - "97": 0.10605, - "98": 0.10621, - "99": 0.26274, - "100": 0.10674 + "1": "nan", + "2": 3.33855, + "3": 0.12562, + "4": 0.10973, + "5": 0.10864, + "6": 0.10778, + "7": 0.10885, + "8": 0.10884, + "9": 0.10877, + "10": 0.10868, + "11": 0.10997, + "12": 0.10853, + "13": 0.1086, + "14": 0.10927, + "15": 0.10879, + "16": 0.10908, + "17": 0.10873, + "18": 0.10883, + "19": 0.11028, + "20": 0.11031, + "21": 0.11086, + "22": 0.10971, + "23": 0.10987, + "24": 0.1089, + "25": 0.11118, + "26": 0.10952, + "27": 0.1165, + "28": 0.11961, + "29": 0.11977, + "30": 0.11657, + "31": 0.11728, + "32": 0.11689, + "33": 0.11642, + "34": 0.11739, + "35": 0.11665, + "36": 0.11537, + "37": 0.11552, + "38": 0.11544, + "39": 0.11538, + "40": 0.11584, + "41": 0.11597, + "42": 0.11635, + "43": 0.11593, + "44": 0.11678, + "45": 0.11608, + "46": 0.11637, + "47": 0.11572, + "48": 0.11577, + "49": 0.11481, + "50": 0.11561, + "51": 0.1213, + "52": 0.10892, + "53": 0.10742, + "54": 0.10842, + "55": 0.10806, + "56": 0.10869, + "57": 0.11057, + "58": 0.108, + "59": 0.10875, + "60": 0.10969, + "61": 0.1087, + "62": 0.10795, + "63": 0.1094, + "64": 0.10922, + "65": 0.11102, + "66": 0.11016, + "67": 0.10977, + "68": 0.10988, + "69": 0.11029, + "70": 0.11078, + "71": 0.11019, + "72": 0.11727, + "73": 0.11024, + "74": 0.11054, + "75": 0.10949, + "76": 0.11384, + "77": 0.11011, + "78": 0.1101, + "79": 0.10943, + "80": 0.11059, + "81": 0.11173, + "82": 0.10987, + "83": 0.1094, + "84": 0.10956, + "85": 0.11029, + "86": 0.11179, + "87": 0.10953, + "88": 0.11045, + "89": 0.1102, + "90": 0.10897, + "91": 0.11022, + "92": 0.10965, + "93": 0.11042, + "94": 0.11158, + "95": 0.11059, + "96": 0.11046, + "97": 0.11123, + "98": 0.11055, + "99": 0.11178, + "100": 0.11266 } } } \ No newline at end of file diff --git 
a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_gb200_2nd.json new file mode 100644 index 00000000000..43ec9ec960f --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_gb200_2nd.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 9.84256, + "52": 9.72573, + "53": 10.05974, + "54": 9.95226, + "55": 9.88318, + "56": 9.61275, + "57": 9.46219, + "58": 9.8231, + "59": 9.57666, + "60": 9.48516, + "61": 9.67876, + "62": 9.97782, + "63": 9.36212, + "64": 9.75714, + "65": 8.93494, + "66": 9.69283, + "67": 9.36708, + "68": 9.78178, + "69": 9.79452, + "70": 9.72296, + "71": 9.62031, + "72": 9.56974, + "73": 9.48101, + "74": 8.91241, + "75": 9.40905, + "76": 9.06617, + "77": 10.05809, + "78": 9.72194, + "79": 9.36927, + "80": 9.40029, + "81": 9.47702, + "82": 9.69787, + "83": 9.30742, + "84": 9.41492, + "85": 9.61113, + "86": 
9.07103, + "87": 9.5961, + "88": 9.74909, + "89": 9.59604, + "90": 9.82722, + "91": 9.33657, + "92": 9.35582, + "93": 9.08689, + "94": 8.82754, + "95": 9.53065, + "96": 9.5276, + "97": 9.30672, + "98": 9.66905, + "99": 8.89635, + "100": 9.40525 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 2552.0, + "52": 2565.0, + "53": 2883.0, + "54": 2710.0, + "55": 2301.0, + "56": 2798.0, + "57": 2334.0, + "58": 2979.0, + "59": 2960.0, + "60": 2451.0, + "61": 2841.0, + "62": 2577.0, + "63": 2516.0, + "64": 2907.0, + "65": 2567.0, + "66": 2862.0, + "67": 2809.0, + "68": 2609.0, + "69": 2965.0, + "70": 2985.0, + "71": 2864.0, + "72": 2613.0, + "73": 3108.0, + "74": 2048.0, + "75": 2563.0, + "76": 3046.0, + "77": 3127.0, + "78": 2959.0, + "79": 3082.0, + "80": 3025.0, + "81": 3400.0, + "82": 3223.0, + "83": 2786.0, + "84": 3180.0, + "85": 3233.0, + "86": 2611.0, + "87": 3542.0, + "88": 3084.0, + "89": 3210.0, + "90": 3271.0, + "91": 2770.0, + "92": 3220.0, + "93": 2662.0, + "94": 3405.0, + "95": 3085.0, + "96": 3336.0, + "97": 3050.0, + "98": 3421.0, + "99": 3271.0, + "100": 3079.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + 
"1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 552064512.0, + "52": 552064512.0, + "53": 552064512.0, + "54": 552064512.0, + "55": 552064512.0, + "56": 552064512.0, + "57": 552064512.0, + "58": 552064512.0, + "59": 552064512.0, + "60": 552064512.0, + "61": 552064512.0, + "62": 552064512.0, + "63": 552064512.0, + "64": 552064512.0, + "65": 552064512.0, + "66": 552064512.0, + "67": 552064512.0, + "68": 552064512.0, + "69": 552064512.0, + "70": 552064512.0, + "71": 552064512.0, + "72": 552064512.0, + "73": 552064512.0, + "74": 552064512.0, + "75": 552064512.0, + "76": 552064512.0, + "77": 552064512.0, + "78": 552064512.0, + "79": 552064512.0, + "80": 552064512.0, + "81": 552064512.0, + "82": 552064512.0, + "83": 552064512.0, + "84": 552064512.0, + "85": 552064512.0, + "86": 552064512.0, + "87": 552064512.0, + "88": 552064512.0, + "89": 552064512.0, + "90": 552064512.0, + "91": 552064512.0, + "92": 552064512.0, + "93": 552064512.0, + "94": 552064512.0, + "95": 552064512.0, + "96": 552064512.0, + "97": 552064512.0, + "98": 552064512.0, + "99": 552064512.0, + "100": 552064512.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": 
"nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 2470941696.0, + "52": 2470942208.0, + "53": 2470942208.0, + "54": 2470942208.0, + "55": 2470942208.0, + "56": 2470942208.0, + "57": 2470942208.0, + "58": 2470942208.0, + "59": 2470942208.0, + "60": 2470942208.0, + "61": 2470942720.0, + "62": 2470942720.0, + "63": 2470942720.0, + "64": 2470942720.0, + "65": 2470942720.0, + "66": 2470942720.0, + "67": 2470942720.0, + "68": 2470942720.0, + "69": 2470942720.0, + "70": 2470942720.0, + "71": 2470942720.0, + "72": 2470942720.0, + "73": 2470942720.0, + "74": 2470942720.0, + "75": 2470942720.0, + "76": 2470942720.0, + "77": 2470942720.0, + "78": 2470942720.0, + "79": 2470942720.0, + "80": 2470942720.0, + "81": 2470942720.0, + "82": 2470942720.0, + "83": 2470942720.0, + "84": 2470942720.0, + "85": 2470942720.0, + "86": 2470942720.0, + "87": 2470942720.0, + "88": 2470942720.0, + "89": 2470942720.0, + "90": 2470942720.0, + "91": 2470942720.0, + "92": 2470942720.0, + "93": 2470942720.0, + "94": 2470942720.0, + "95": 2470942720.0, + "96": 2470942720.0, + "97": 2470942720.0, + "98": 2470942720.0, + "99": 2470942720.0, + "100": 2470942720.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + 
"11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": "nan", + "52": 3.27475, + "53": 0.12363, + "54": 0.11177, + "55": 0.1112, + "56": 0.11272, + "57": 0.11284, + "58": 0.11161, + "59": 0.11167, + "60": 0.11262, + "61": 0.11171, + "62": 0.11092, + "63": 0.11143, + "64": 0.11171, + "65": 0.11299, + "66": 0.1124, + "67": 0.1119, + "68": 0.11174, + "69": 0.11252, + "70": 0.11217, + "71": 0.1112, + "72": 0.11653, + "73": 0.11887, + "74": 0.11966, + "75": 0.11921, + "76": 0.12192, + "77": 0.1219, + "78": 0.12342, + "79": 0.12312, + "80": 0.12263, + "81": 0.12762, + "82": 0.1234, + "83": 0.12364, + "84": 0.12458, + "85": 0.12385, + "86": 0.12395, + "87": 0.12307, + "88": 0.12362, + "89": 0.12421, + "90": 0.12452, + "91": 0.12623, + "92": 0.1253, + "93": 0.12482, + "94": 0.12453, + "95": 0.12892, + "96": 0.13902, + "97": 0.12489, + "98": 0.12331, + "99": 0.12522, + "100": 0.12499 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_dev_dgx_gb200.json index f4999e7c2dd..8a90b6fb7df 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_dev_dgx_gb200.json @@ 
-232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 9.62286, - "2": 0.48262, - "3": 0.20639, - "4": 0.31026, - "5": 0.31827, - "6": 0.29163, - "7": 0.29838, - "8": 0.17787, - "9": 0.27978, - "10": 0.17026, - "11": 0.27026, - "12": 0.1834, - "13": 0.19697, - "14": 0.43123, - "15": 0.18322, - "16": 0.18141, - "17": 0.19707, - "18": 0.4629, - "19": 0.1817, - "20": 0.25096, - "21": 0.18877, - "22": 0.24459, - "23": 0.17984, - "24": 0.20058, - "25": 0.1758, - "26": 0.17872, - "27": 0.17193, - "28": 0.17115, - "29": 0.36031, - "30": 0.2658, - "31": 0.16933, - "32": 0.20868, - "33": 0.17195, - "34": 0.17439, - "35": 0.2501, - "36": 0.17686, - "37": 0.20398, - "38": 0.32448, - "39": 0.1735, - "40": 0.17268, - "41": 0.33455, - "42": 0.23584, - "43": 0.23483, - "44": 0.16767, - "45": 0.17612, - "46": 0.30477, - "47": 0.37075, - "48": 0.18367, - "49": 0.25006, - "50": 0.56439 + "1": "nan", + "2": 3.1265, + "3": 0.15779, + "4": 0.14192, + "5": 0.14446, + "6": 0.14251, + "7": 0.14375, + "8": 0.1446, + "9": 0.14351, + "10": 0.14568, + "11": 0.14477, + "12": 0.14491, + "13": 0.1447, + "14": 0.14656, + "15": 0.14652, + "16": 0.14521, + "17": 0.14638, + "18": 0.14483, + "19": 0.14549, + "20": 0.14457, + "21": 0.14306, + "22": 0.14559, + "23": 0.14596, + "24": 0.14513, + "25": 0.14367, + "26": 0.14368, + "27": 0.14398, + "28": 0.14369, + "29": 0.14435, + "30": 0.14415, + "31": 0.1433, + "32": 0.14342, + "33": 0.1441, + "34": 0.14372, + "35": 0.14431, + "36": 0.1454, + "37": 0.14634, + "38": 0.14514, + "39": 0.14529, + "40": 0.14504, + "41": 0.14496, + "42": 0.14436, + "43": 0.14492, + "44": 0.14452, + "45": 0.14629, + "46": 0.14514, + "47": 0.14578, + "48": 0.1442, + "49": 0.14396, + "50": 0.14376 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/golden_values_dev_dgx_gb200.json 
b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/golden_values_dev_dgx_gb200.json index 605457b437c..da72109d85d 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/golden_values_dev_dgx_gb200.json @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 8.46203, - "2": 0.17159, - "3": 0.30409, - "4": 0.13684, - "5": 0.29184, - "6": 0.13641, - "7": 0.15548, - "8": 0.24827, - "9": 0.13458, - "10": 0.24758, - "11": 0.26919, - "12": 0.15859, - "13": 0.24263, - "14": 0.40638, - "15": 0.14802, - "16": 0.75916, - "17": 0.27027, - "18": 0.41589, - "19": 0.23222, - "20": 0.27356, - "21": 0.38604, - "22": 0.40542, - "23": 0.61332, - "24": 0.36261, - "25": 0.60934, - "26": 0.13901, - "27": 0.23646, - "28": 0.13727, - "29": 0.23988, - "30": 0.13874, - "31": 0.13771, - "32": 0.13771, - "33": 0.13803, - "34": 0.13667, - "35": 0.13906, - "36": 0.13535, - "37": 0.13539, - "38": 0.13547, - "39": 0.13555, - "40": 0.13617, - "41": 0.37768, - "42": 0.1374, - "43": 0.22178, - "44": 0.13712, - "45": 0.13831, - "46": 0.137, - "47": 0.13638, - "48": 0.13731, - "49": 0.21987, - "50": 0.13794 + "1": "nan", + "2": 2.44061, + "3": 0.15375, + "4": 0.14111, + "5": 0.14053, + "6": 0.14023, + "7": 0.14152, + "8": 0.14128, + "9": 0.1417, + "10": 0.14155, + "11": 0.14076, + "12": 0.1405, + "13": 0.14129, + "14": 0.14106, + "15": 0.14101, + "16": 0.14178, + "17": 0.14173, + "18": 0.14103, + "19": 0.14094, + "20": 0.14012, + "21": 0.14153, + "22": 0.14228, + "23": 0.14118, + "24": 0.14079, + "25": 0.14034, + "26": 0.14027, + "27": 0.13947, + "28": 0.13928, + "29": 0.1398, + "30": 0.14085, + "31": 0.14179, + "32": 0.13944, + "33": 0.14174, + "34": 0.1436, + "35": 0.13902, + "36": 0.13933, + "37": 0.13922, + "38": 0.13997, + "39": 0.13881, + "40": 0.13924, + "41": 0.1392, + "42": 0.14092, + 
"43": 0.14136, + "44": 0.14035, + "45": 0.13841, + "46": 0.1411, + "47": 0.13878, + "48": 0.14005, + "49": 0.13925, + "50": 0.13845 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/golden_values_dev_dgx_gb200.json index d3f4ebb9b68..28de0d56b1b 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/golden_values_dev_dgx_gb200.json @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 8.84804, - "2": 0.09127, - "3": 0.38568, - "4": 0.10516, - "5": 0.18187, - "6": 0.13288, - "7": 0.17979, - "8": 0.25055, - "9": 0.07376, - "10": 0.06396, - "11": 0.42421, - "12": 0.06524, - "13": 0.06447, - "14": 0.06499, - "15": 0.24593, - "16": 0.06277, - "17": 0.2443, - "18": 0.26141, - "19": 0.06388, + "1": "nan", + "2": 2.91983, + "3": 0.07964, + "4": 0.0755, + "5": 0.06521, + "6": 0.07949, + "7": 0.0691, + "8": 0.06527, + "9": 0.09221, + "10": 0.0948, + "11": 0.07486, + "12": 0.06312, + "13": 0.06422, + "14": 0.0656, + "15": 0.07274, + "16": 0.06384, + "17": 0.06441, + "18": 0.06446, + "19": 0.06349, "20": 0.06319, - "21": 0.44504, - "22": 0.06309, - "23": 0.24094, - "24": 0.06366, - "25": 0.12615, - "26": 0.45347, - "27": 0.06454, - "28": 0.06518, - "29": 0.23896, - "30": 0.06569, - "31": 0.23519, - "32": 0.06271, - "33": 0.06599, - "34": 0.45696, - "35": 0.06614, - "36": 0.24275, - "37": 0.0626, - "38": 0.18028, - "39": 0.07237, - "40": 0.24435, - "41": 0.09656, - "42": 0.258, - "43": 0.09133, - "44": 0.09694, - "45": 0.11452, - "46": 0.08793, - "47": 0.24321, - "48": 0.08548, - "49": 0.0909, - 
"50": 0.16493 + "21": 0.06302, + "22": 0.06467, + "23": 0.06428, + "24": 0.06395, + "25": 0.06411, + "26": 0.06398, + "27": 0.06336, + "28": 0.06345, + "29": 0.07201, + "30": 0.06458, + "31": 0.06379, + "32": 0.06337, + "33": 0.06262, + "34": 0.06257, + "35": 0.06407, + "36": 0.06308, + "37": 0.06324, + "38": 0.06353, + "39": 0.06346, + "40": 0.06294, + "41": 0.06471, + "42": 0.06426, + "43": 0.06446, + "44": 0.06426, + "45": 0.06337, + "46": 0.06427, + "47": 0.06421, + "48": 0.06315, + "49": 0.0639, + "50": 0.06324 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_gb200.json index 2de96fdc0a6..a5fecfacf8f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_gb200.json @@ -6,104 +6,104 @@ "values": { "1": 10.87037, "2": 10.87119, - "3": 10.84722, - "4": 10.83185, - "5": 10.86876, - "6": 10.88753, - "7": 10.86095, - "8": 10.86864, - "9": 10.85424, - "10": 10.82319, - "11": 10.86739, - "12": 10.8666, - "13": 10.88538, - "14": 10.88994, - "15": 10.81366, - "16": 10.80332, + "3": 10.84723, + "4": 10.83181, + "5": 10.86879, + "6": 10.8876, + "7": 10.86094, + "8": 10.86859, + "9": 10.85425, + "10": 10.82318, + "11": 10.86741, + "12": 10.86662, + "13": 10.88537, + "14": 10.88991, + "15": 10.81369, + "16": 10.80328, "17": 10.77723, - "18": 10.81063, - "19": 10.80524, - "20": 10.70339, - "21": 10.67012, - "22": 10.51209, - "23": 10.69985, + "18": 10.81064, + "19": 10.80526, + "20": 10.7034, + "21": 10.67013, + "22": 10.51206, + "23": 10.69987, "24": 10.56044, - "25": 10.49857, - "26": 10.57872, - "27": 10.56749, 
- "28": 10.53108, + "25": 10.49854, + "26": 10.57876, + "27": 10.56747, + "28": 10.53107, "29": 10.55838, - "30": 10.32727, - "31": 10.04391, - "32": 10.42571, - "33": 10.4193, - "34": 10.15675, + "30": 10.32726, + "31": 10.04382, + "32": 10.42576, + "33": 10.41931, + "34": 10.15673, "35": 10.21897, "36": 10.16206, - "37": 10.29722, - "38": 10.13231, - "39": 10.35956, - "40": 10.02296, + "37": 10.29717, + "38": 10.1323, + "39": 10.35955, + "40": 10.02292, "41": 10.06592, - "42": 10.15518, - "43": 9.75609, - "44": 9.86983, - "45": 9.75094, - "46": 9.73598, - "47": 10.0747, - "48": 9.77504, - "49": 9.43418, - "50": 9.84339, - "51": 9.78577, - "52": 9.6708, - "53": 10.00723, + "42": 10.15514, + "43": 9.7561, + "44": 9.86986, + "45": 9.75091, + "46": 9.73604, + "47": 10.07473, + "48": 9.77502, + "49": 9.43421, + "50": 9.84343, + "51": 9.78575, + "52": 9.67077, + "53": 10.00722, "54": 9.89701, - "55": 9.82612, + "55": 9.82613, "56": 9.54829, - "57": 9.40077, - "58": 9.77422, + "57": 9.40075, + "58": 9.77419, "59": 9.51686, - "60": 9.42721, - "61": 9.63408, - "62": 9.93879, + "60": 9.42722, + "61": 9.63404, + "62": 9.93883, "63": 9.30503, - "64": 9.71266, - "65": 8.86836, - "66": 9.64474, - "67": 9.31349, - "68": 9.73443, + "64": 9.71265, + "65": 8.86835, + "66": 9.64476, + "67": 9.31344, + "68": 9.73448, "69": 9.755, - "70": 9.68613, - "71": 9.57703, - "72": 9.53066, - "73": 9.43092, - "74": 8.8548, - "75": 9.35819, - "76": 9.01448, - "77": 10.0265, - "78": 9.68108, - "79": 9.33349, - "80": 9.35488, - "81": 9.44135, + "70": 9.68616, + "71": 9.57699, + "72": 9.53063, + "73": 9.43094, + "74": 8.85481, + "75": 9.35821, + "76": 9.01443, + "77": 10.02645, + "78": 9.6811, + "79": 9.33347, + "80": 9.35483, + "81": 9.44132, "82": 9.66188, - "83": 9.26313, - "84": 9.37185, + "83": 9.26309, + "84": 9.37181, "85": 9.57429, - "86": 9.03444, + "86": 9.0344, "87": 9.56188, - "88": 9.71281, - "89": 9.55802, + "88": 9.71279, + "89": 9.55801, "90": 9.79197, - "91": 9.29019, - "92": 
9.31615, - "93": 9.04052, - "94": 8.78281, + "91": 9.29017, + "92": 9.31612, + "93": 9.04053, + "94": 8.78283, "95": 9.49395, - "96": 9.48884, - "97": 9.26046, - "98": 9.63128, - "99": 8.85093, - "100": 9.36489 + "96": 9.48877, + "97": 9.26048, + "98": 9.63126, + "99": 8.85095, + "100": 9.36493 } }, "num-zeros": { @@ -111,106 +111,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 623.0, - "2": 605.0, - "3": 617.0, - "4": 657.0, - "5": 652.0, - "6": 662.0, - "7": 595.0, - "8": 626.0, - "9": 683.0, - "10": 550.0, - "11": 658.0, - "12": 657.0, - "13": 701.0, - "14": 668.0, - "15": 659.0, - "16": 692.0, - "17": 640.0, - "18": 627.0, - "19": 633.0, - "20": 601.0, - "21": 632.0, - "22": 637.0, - "23": 718.0, - "24": 623.0, - "25": 612.0, - "26": 689.0, - "27": 678.0, - "28": 717.0, - "29": 715.0, - "30": 670.0, - "31": 627.0, - "32": 718.0, - "33": 850.0, - "34": 658.0, - "35": 721.0, - "36": 764.0, - "37": 859.0, - "38": 733.0, - "39": 851.0, - "40": 766.0, - "41": 863.0, - "42": 839.0, - "43": 732.0, - "44": 870.0, - "45": 737.0, - "46": 913.0, - "47": 911.0, - "48": 832.0, - "49": 825.0, - "50": 827.0, - "51": 914.0, - "52": 900.0, - "53": 989.0, - "54": 1021.0, - "55": 874.0, - "56": 985.0, - "57": 841.0, - "58": 938.0, - "59": 1035.0, - "60": 876.0, - "61": 1044.0, - "62": 982.0, - "63": 976.0, - "64": 1071.0, - "65": 1026.0, - "66": 994.0, - "67": 961.0, - "68": 1084.0, - "69": 1108.0, - "70": 1081.0, - "71": 1069.0, - "72": 931.0, - "73": 984.0, - "74": 770.0, - "75": 914.0, - "76": 1050.0, - "77": 1196.0, - "78": 1128.0, - "79": 1048.0, - "80": 1147.0, - "81": 1175.0, - "82": 1112.0, - "83": 988.0, - "84": 1099.0, - "85": 1133.0, - "86": 875.0, - "87": 1189.0, - "88": 1114.0, - "89": 1101.0, - "90": 1124.0, - "91": 1079.0, - "92": 1114.0, - "93": 937.0, - "94": 1106.0, - "95": 1097.0, + "1": 617.0, + "2": 610.0, + "3": 593.0, + "4": 664.0, + "5": 648.0, + "6": 669.0, + "7": 605.0, + "8": 612.0, + "9": 596.0, + "10": 559.0, + "11": 659.0, + "12": 
590.0, + "13": 663.0, + "14": 675.0, + "15": 672.0, + "16": 715.0, + "17": 627.0, + "18": 614.0, + "19": 680.0, + "20": 570.0, + "21": 674.0, + "22": 593.0, + "23": 633.0, + "24": 612.0, + "25": 606.0, + "26": 669.0, + "27": 625.0, + "28": 753.0, + "29": 733.0, + "30": 661.0, + "31": 648.0, + "32": 688.0, + "33": 786.0, + "34": 689.0, + "35": 675.0, + "36": 734.0, + "37": 807.0, + "38": 799.0, + "39": 831.0, + "40": 745.0, + "41": 780.0, + "42": 868.0, + "43": 713.0, + "44": 751.0, + "45": 817.0, + "46": 856.0, + "47": 934.0, + "48": 906.0, + "49": 840.0, + "50": 799.0, + "51": 923.0, + "52": 897.0, + "53": 1019.0, + "54": 908.0, + "55": 839.0, + "56": 976.0, + "57": 853.0, + "58": 1024.0, + "59": 988.0, + "60": 870.0, + "61": 1041.0, + "62": 961.0, + "63": 847.0, + "64": 1053.0, + "65": 1004.0, + "66": 1005.0, + "67": 938.0, + "68": 1006.0, + "69": 1110.0, + "70": 985.0, + "71": 1002.0, + "72": 958.0, + "73": 997.0, + "74": 705.0, + "75": 870.0, + "76": 1088.0, + "77": 1153.0, + "78": 1083.0, + "79": 1035.0, + "80": 1122.0, + "81": 1209.0, + "82": 1132.0, + "83": 1024.0, + "84": 1122.0, + "85": 1141.0, + "86": 861.0, + "87": 1190.0, + "88": 1115.0, + "89": 1128.0, + "90": 1107.0, + "91": 1128.0, + "92": 1160.0, + "93": 973.0, + "94": 1117.0, + "95": 1022.0, "96": 1178.0, - "97": 1103.0, - "98": 1260.0, - "99": 1105.0, - "100": 1131.0 + "97": 1068.0, + "98": 1278.0, + "99": 1071.0, + "100": 1175.0 } }, "mem-allocated-bytes": { @@ -218,106 +218,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 637976064.0, - "2": 637976064.0, - "3": 637976064.0, - "4": 637976064.0, - "5": 637976064.0, - "6": 637976064.0, - "7": 637976064.0, - "8": 637976064.0, - "9": 637976064.0, - "10": 637976064.0, - "11": 637976064.0, - "12": 637976064.0, - "13": 637976064.0, - "14": 637976064.0, - "15": 637976064.0, - "16": 637976064.0, - "17": 637976064.0, - "18": 637976064.0, - "19": 637976064.0, - "20": 637976064.0, - "21": 637976064.0, - "22": 637976064.0, - "23": 637976064.0, - 
"24": 637976064.0, - "25": 637976064.0, - "26": 637976064.0, - "27": 637976064.0, - "28": 637976064.0, - "29": 637976064.0, - "30": 637976064.0, - "31": 637976064.0, - "32": 637976064.0, - "33": 637976064.0, - "34": 637976064.0, - "35": 637976064.0, - "36": 637976064.0, - "37": 637976064.0, - "38": 637976064.0, - "39": 637976064.0, - "40": 637976064.0, - "41": 637976064.0, - "42": 637976064.0, - "43": 637976064.0, - "44": 637976064.0, - "45": 637976064.0, - "46": 637976064.0, - "47": 637976064.0, - "48": 637976064.0, - "49": 637976064.0, - "50": 637976064.0, - "51": 637976064.0, - "52": 637976064.0, - "53": 637976064.0, - "54": 637976064.0, - "55": 637976064.0, - "56": 637976064.0, - "57": 637976064.0, - "58": 637976064.0, - "59": 637976064.0, - "60": 637976064.0, - "61": 637976064.0, - "62": 637976064.0, - "63": 637976064.0, - "64": 637976064.0, - "65": 637976064.0, - "66": 637976064.0, - "67": 637976064.0, - "68": 637976064.0, - "69": 637976064.0, - "70": 637976064.0, - "71": 637976064.0, - "72": 637976064.0, - "73": 637976064.0, - "74": 637976064.0, - "75": 637976064.0, - "76": 637976064.0, - "77": 637976064.0, - "78": 637976064.0, - "79": 637976064.0, - "80": 637976064.0, - "81": 637976064.0, - "82": 637976064.0, - "83": 637976064.0, - "84": 637976064.0, - "85": 637976064.0, - "86": 637976064.0, - "87": 637976064.0, - "88": 637976064.0, - "89": 637976064.0, - "90": 637976064.0, - "91": 637976064.0, - "92": 637976064.0, - "93": 637976064.0, - "94": 637976064.0, - "95": 637976064.0, - "96": 637976064.0, - "97": 637976064.0, - "98": 637976064.0, - "99": 637976064.0, - "100": 637976064.0 + "1": 638631424.0, + "2": 638631424.0, + "3": 638631424.0, + "4": 638631424.0, + "5": 638631424.0, + "6": 638631424.0, + "7": 638631424.0, + "8": 638631424.0, + "9": 638631424.0, + "10": 638631424.0, + "11": 638631424.0, + "12": 638631424.0, + "13": 638631424.0, + "14": 638631424.0, + "15": 638631424.0, + "16": 638631424.0, + "17": 638631424.0, + "18": 638631424.0, + "19": 
638631424.0, + "20": 638631424.0, + "21": 638631424.0, + "22": 638631424.0, + "23": 638631424.0, + "24": 638631424.0, + "25": 638631424.0, + "26": 638631424.0, + "27": 638631424.0, + "28": 638631424.0, + "29": 638631424.0, + "30": 638631424.0, + "31": 638631424.0, + "32": 638631424.0, + "33": 638631424.0, + "34": 638631424.0, + "35": 638631424.0, + "36": 638631424.0, + "37": 638631424.0, + "38": 638631424.0, + "39": 638631424.0, + "40": 638631424.0, + "41": 638631424.0, + "42": 638631424.0, + "43": 638631424.0, + "44": 638631424.0, + "45": 638631424.0, + "46": 638631424.0, + "47": 638631424.0, + "48": 638631424.0, + "49": 638631424.0, + "50": 638631424.0, + "51": 638631424.0, + "52": 638631424.0, + "53": 638631424.0, + "54": 638631424.0, + "55": 638631424.0, + "56": 638631424.0, + "57": 638631424.0, + "58": 638631424.0, + "59": 638631424.0, + "60": 638631424.0, + "61": 638631424.0, + "62": 638631424.0, + "63": 638631424.0, + "64": 638631424.0, + "65": 638631424.0, + "66": 638631424.0, + "67": 638631424.0, + "68": 638631424.0, + "69": 638631424.0, + "70": 638631424.0, + "71": 638631424.0, + "72": 638631424.0, + "73": 638631424.0, + "74": 638631424.0, + "75": 638631424.0, + "76": 638631424.0, + "77": 638631424.0, + "78": 638631424.0, + "79": 638631424.0, + "80": 638631424.0, + "81": 638631424.0, + "82": 638631424.0, + "83": 638631424.0, + "84": 638631424.0, + "85": 638631424.0, + "86": 638631424.0, + "87": 638631424.0, + "88": 638631424.0, + "89": 638631424.0, + "90": 638631424.0, + "91": 638631424.0, + "92": 638631424.0, + "93": 638631424.0, + "94": 638631424.0, + "95": 638631424.0, + "96": 638631424.0, + "97": 638631424.0, + "98": 638631424.0, + "99": 638631424.0, + "100": 638631424.0 } }, "mem-max-allocated-bytes": { @@ -325,106 +325,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 908535808.0, - "2": 1167747584.0, - "3": 1168796160.0, - "4": 1168796160.0, - "5": 1179675136.0, - "6": 1179675136.0, - "7": 1179675136.0, - "8": 1179675136.0, - "9": 
1179675136.0, - "10": 1179675136.0, - "11": 1179675136.0, - "12": 1179675136.0, - "13": 1179675136.0, - "14": 1179675136.0, - "15": 1179675136.0, - "16": 1179675136.0, - "17": 1180330496.0, - "18": 1180330496.0, - "19": 1180330496.0, - "20": 1180330496.0, - "21": 1180330496.0, - "22": 1180330496.0, - "23": 1180330496.0, - "24": 1180330496.0, - "25": 1180330496.0, - "26": 1180330496.0, - "27": 1180330496.0, - "28": 1180330496.0, - "29": 1180330496.0, - "30": 1180330496.0, - "31": 1180330496.0, - "32": 1180330496.0, - "33": 1180330496.0, - "34": 1180330496.0, - "35": 1180330496.0, - "36": 1180330496.0, - "37": 1180330496.0, - "38": 1180330496.0, - "39": 1180330496.0, - "40": 1180330496.0, - "41": 1180330496.0, - "42": 1180330496.0, - "43": 1180330496.0, - "44": 1180330496.0, - "45": 1180330496.0, - "46": 1180330496.0, - "47": 1180330496.0, - "48": 1180330496.0, - "49": 1180330496.0, - "50": 1180330496.0, - "51": 1180330496.0, - "52": 1180330496.0, - "53": 1180330496.0, - "54": 1180330496.0, - "55": 1180330496.0, - "56": 1180330496.0, - "57": 1180330496.0, - "58": 1180330496.0, - "59": 1180330496.0, - "60": 1180330496.0, - "61": 1180330496.0, - "62": 1180330496.0, - "63": 1180330496.0, - "64": 1180330496.0, - "65": 1180330496.0, - "66": 1180330496.0, - "67": 1180330496.0, - "68": 1180330496.0, - "69": 1180330496.0, - "70": 1180330496.0, - "71": 1180330496.0, - "72": 1180330496.0, - "73": 1180330496.0, - "74": 1180330496.0, - "75": 1180330496.0, - "76": 1180330496.0, - "77": 1180330496.0, - "78": 1180330496.0, - "79": 1180330496.0, - "80": 1180330496.0, - "81": 1180330496.0, - "82": 1180330496.0, - "83": 1180330496.0, - "84": 1180330496.0, - "85": 1180330496.0, - "86": 1180330496.0, - "87": 1180330496.0, - "88": 1180330496.0, - "89": 1180330496.0, - "90": 1180330496.0, - "91": 1180330496.0, - "92": 1180330496.0, - "93": 1180330496.0, - "94": 1180330496.0, - "95": 1180330496.0, - "96": 1180330496.0, - "97": 1180330496.0, - "98": 1180330496.0, - "99": 1180330496.0, - 
"100": 1180330496.0 + "1": 910633472.0, + "2": 1170499584.0, + "3": 1170499584.0, + "4": 1170499584.0, + "5": 1170499584.0, + "6": 1170499584.0, + "7": 1170500096.0, + "8": 1170500096.0, + "9": 1170500096.0, + "10": 1170500096.0, + "11": 1170761728.0, + "12": 1170761728.0, + "13": 1170761728.0, + "14": 1173644800.0, + "15": 1173644800.0, + "16": 1178888192.0, + "17": 1178888192.0, + "18": 1178888192.0, + "19": 1178888192.0, + "20": 1178888192.0, + "21": 1178888192.0, + "22": 1178888192.0, + "23": 1178888192.0, + "24": 1178888192.0, + "25": 1178888192.0, + "26": 1178888192.0, + "27": 1178888192.0, + "28": 1178888192.0, + "29": 1178888192.0, + "30": 1178888192.0, + "31": 1178888192.0, + "32": 1178888192.0, + "33": 1178888192.0, + "34": 1178888192.0, + "35": 1178888192.0, + "36": 1178888192.0, + "37": 1179936768.0, + "38": 1179936768.0, + "39": 1179936768.0, + "40": 1179936768.0, + "41": 1179936768.0, + "42": 1179936768.0, + "43": 1179936768.0, + "44": 1179936768.0, + "45": 1179936768.0, + "46": 1179936768.0, + "47": 1179936768.0, + "48": 1179936768.0, + "49": 1179936768.0, + "50": 1179936768.0, + "51": 1179936768.0, + "52": 1179936768.0, + "53": 1179936768.0, + "54": 1179936768.0, + "55": 1179936768.0, + "56": 1179936768.0, + "57": 1179936768.0, + "58": 1179936768.0, + "59": 1179936768.0, + "60": 1179936768.0, + "61": 1179936768.0, + "62": 1179936768.0, + "63": 1179936768.0, + "64": 1179936768.0, + "65": 1179936768.0, + "66": 1179936768.0, + "67": 1179936768.0, + "68": 1179936768.0, + "69": 1179936768.0, + "70": 1179936768.0, + "71": 1179936768.0, + "72": 1179936768.0, + "73": 1179936768.0, + "74": 1179936768.0, + "75": 1179936768.0, + "76": 1179936768.0, + "77": 1179936768.0, + "78": 1179936768.0, + "79": 1179936768.0, + "80": 1179936768.0, + "81": 1179936768.0, + "82": 1179936768.0, + "83": 1179936768.0, + "84": 1179936768.0, + "85": 1179936768.0, + "86": 1179936768.0, + "87": 1179936768.0, + "88": 1179936768.0, + "89": 1179936768.0, + "90": 1179936768.0, + "91": 
1179936768.0, + "92": 1179936768.0, + "93": 1179936768.0, + "94": 1179936768.0, + "95": 1179936768.0, + "96": 1179936768.0, + "97": 1179936768.0, + "98": 1179936768.0, + "99": 1180984832.0, + "100": 1180984832.0 } }, "iteration-time": { @@ -432,106 +432,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 9.2723, - "2": 0.3877, - "3": 0.37645, - "4": 0.36551, - "5": 0.37045, - "6": 0.36893, - "7": 0.36938, - "8": 0.36753, - "9": 0.36888, - "10": 0.43135, - "11": 0.36252, - "12": 0.37084, - "13": 0.365, - "14": 0.36033, - "15": 0.35887, - "16": 0.36201, - "17": 0.53142, - "18": 0.36699, - "19": 0.36318, - "20": 0.36321, - "21": 0.36209, - "22": 0.72283, - "23": 0.3641, - "24": 0.36359, - "25": 0.36227, - "26": 0.36731, - "27": 0.36879, - "28": 0.36963, - "29": 0.37051, - "30": 0.36794, - "31": 0.37079, - "32": 0.368, - "33": 0.44096, - "34": 0.52072, - "35": 0.48704, - "36": 0.4152, - "37": 0.37792, - "38": 0.37304, - "39": 0.37505, - "40": 0.37438, - "41": 0.3737, - "42": 0.37569, - "43": 0.37181, - "44": 0.37336, - "45": 0.3731, - "46": 0.37229, - "47": 0.37374, - "48": 0.37375, - "49": 0.3719, - "50": 0.37298, - "51": 0.3797, - "52": 0.36304, - "53": 0.36729, - "54": 0.36756, - "55": 0.37134, - "56": 0.37139, - "57": 0.37112, - "58": 0.38383, - "59": 0.3916, - "60": 0.37403, - "61": 0.37341, - "62": 0.37078, - "63": 0.37095, - "64": 0.37149, - "65": 0.37269, - "66": 0.3736, - "67": 0.37255, - "68": 0.36695, - "69": 0.37351, - "70": 0.37443, - "71": 0.3726, - "72": 0.3731, - "73": 0.37353, - "74": 0.3737, - "75": 0.373, - "76": 0.36094, - "77": 0.36374, - "78": 0.36366, - "79": 0.36446, - "80": 0.36414, - "81": 0.36245, - "82": 0.3641, - "83": 0.3627, - "84": 0.36487, - "85": 0.36027, - "86": 0.3602, - "87": 0.3611, - "88": 0.36555, - "89": 0.36571, - "90": 0.36479, - "91": 0.36175, - "92": 0.36215, - "93": 0.36421, - "94": 0.36147, - "95": 0.36348, - "96": 0.36311, - "97": 0.36282, - "98": 0.38328, - "99": 0.40994, - "100": 0.36791 + "1": "nan", + "2": 
2.94981, + "3": 0.44844, + "4": 0.45032, + "5": 0.45007, + "6": 0.45413, + "7": 0.45595, + "8": 0.46147, + "9": 0.45703, + "10": 0.45713, + "11": 0.45843, + "12": 0.46481, + "13": 0.46367, + "14": 0.46145, + "15": 0.46144, + "16": 0.45944, + "17": 0.46053, + "18": 0.46234, + "19": 0.46318, + "20": 0.46597, + "21": 0.46872, + "22": 0.46167, + "23": 0.46295, + "24": 0.46293, + "25": 0.4609, + "26": 0.46534, + "27": 0.46202, + "28": 0.46538, + "29": 0.45857, + "30": 0.45499, + "31": 0.45555, + "32": 0.45778, + "33": 0.45371, + "34": 0.4591, + "35": 0.45853, + "36": 0.46033, + "37": 0.46964, + "38": 0.46372, + "39": 0.47429, + "40": 0.4563, + "41": 0.45921, + "42": 0.46933, + "43": 0.45644, + "44": 0.46471, + "45": 0.4574, + "46": 0.46247, + "47": 0.45727, + "48": 0.45962, + "49": 0.45179, + "50": 0.45444, + "51": 0.4599, + "52": 0.44679, + "53": 0.45022, + "54": 0.45041, + "55": 0.45771, + "56": 0.45328, + "57": 0.45098, + "58": 0.44748, + "59": 0.44807, + "60": 0.4538, + "61": 0.45222, + "62": 0.44954, + "63": 0.44907, + "64": 0.45008, + "65": 0.44883, + "66": 0.4485, + "67": 0.44967, + "68": 0.45395, + "69": 0.45369, + "70": 0.45227, + "71": 0.45433, + "72": 0.45362, + "73": 0.45783, + "74": 0.45269, + "75": 0.45513, + "76": 0.45076, + "77": 0.4512, + "78": 0.4499, + "79": 0.45799, + "80": 0.45507, + "81": 0.45882, + "82": 0.46542, + "83": 0.45653, + "84": 0.46726, + "85": 0.47932, + "86": 0.45972, + "87": 0.46195, + "88": 0.46285, + "89": 0.46098, + "90": 0.46499, + "91": 0.46284, + "92": 0.46284, + "93": 0.45889, + "94": 0.45485, + "95": 0.45165, + "96": 0.45389, + "97": 0.45854, + "98": 0.45665, + "99": 0.46287, + "100": 0.47613 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_gb200_2nd.json new file mode 
100644 index 00000000000..21e65ea8685 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_gb200_2nd.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 9.78575, + "52": 9.67079, + "53": 10.00718, + "54": 9.897, + "55": 9.82613, + "56": 9.54826, + "57": 9.40078, + "58": 9.77416, + "59": 9.51683, + "60": 9.42721, + "61": 9.63407, + "62": 9.93885, + "63": 9.30502, + "64": 9.71263, + "65": 8.86836, + "66": 9.64475, + "67": 9.31349, + "68": 9.73448, + "69": 9.75501, + "70": 9.68613, + "71": 9.57698, + "72": 9.53067, + "73": 9.43091, + "74": 8.85477, + "75": 9.35819, + "76": 9.01446, + "77": 10.02647, + "78": 9.68112, + "79": 9.33348, + "80": 9.35484, + "81": 9.44135, + "82": 9.66189, + "83": 9.2631, + "84": 9.37182, + "85": 9.57428, + "86": 9.03438, + "87": 9.56188, + "88": 9.7128, + "89": 9.55803, + "90": 9.79197, + "91": 9.2902, + "92": 9.31613, + "93": 9.04053, + "94": 8.78282, + "95": 9.49399, + "96": 9.48876, + "97": 9.2605, + "98": 9.6313, + "99": 8.85096, + "100": 9.36491 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 
"nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 930.0, + "52": 935.0, + "53": 1000.0, + "54": 919.0, + "55": 868.0, + "56": 1000.0, + "57": 827.0, + "58": 1023.0, + "59": 1019.0, + "60": 876.0, + "61": 1017.0, + "62": 936.0, + "63": 963.0, + "64": 1082.0, + "65": 982.0, + "66": 1037.0, + "67": 986.0, + "68": 1083.0, + "69": 1055.0, + "70": 1040.0, + "71": 999.0, + "72": 883.0, + "73": 1019.0, + "74": 728.0, + "75": 847.0, + "76": 1083.0, + "77": 1150.0, + "78": 1105.0, + "79": 1071.0, + "80": 1139.0, + "81": 1195.0, + "82": 1064.0, + "83": 1012.0, + "84": 1105.0, + "85": 1121.0, + "86": 836.0, + "87": 1193.0, + "88": 1096.0, + "89": 1116.0, + "90": 1162.0, + "91": 1098.0, + "92": 1160.0, + "93": 906.0, + "94": 1177.0, + "95": 1117.0, + "96": 1232.0, + "97": 1115.0, + "98": 1241.0, + "99": 1032.0, + "100": 1132.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + 
"26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 640336384.0, + "52": 640336384.0, + "53": 640336384.0, + "54": 640336384.0, + "55": 640336384.0, + "56": 640336384.0, + "57": 640336384.0, + "58": 640336384.0, + "59": 640336384.0, + "60": 640336384.0, + "61": 640336384.0, + "62": 640336384.0, + "63": 640336384.0, + "64": 640336384.0, + "65": 640336384.0, + "66": 640336384.0, + "67": 640336384.0, + "68": 640336384.0, + "69": 640336384.0, + "70": 640336384.0, + "71": 640336384.0, + "72": 640336384.0, + "73": 640336384.0, + "74": 640336384.0, + "75": 640336384.0, + "76": 640336384.0, + "77": 640336384.0, + "78": 640336384.0, + "79": 640336384.0, + "80": 640336384.0, + "81": 640336384.0, + "82": 640336384.0, + "83": 640336384.0, + "84": 640336384.0, + "85": 640336384.0, + "86": 640336384.0, + "87": 640336384.0, + "88": 640336384.0, + "89": 640336384.0, + "90": 640336384.0, + "91": 640336384.0, + "92": 640336384.0, + "93": 640336384.0, + "94": 640336384.0, + "95": 640336384.0, + "96": 640336384.0, + "97": 640336384.0, + "98": 640336384.0, + "99": 640336384.0, + "100": 640336384.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + 
"32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 1171155456.0, + "52": 1174302208.0, + "53": 1176398848.0, + "54": 1176398848.0, + "55": 1176398848.0, + "56": 1176398848.0, + "57": 1176398848.0, + "58": 1176398848.0, + "59": 1178496512.0, + "60": 1178496512.0, + "61": 1178496512.0, + "62": 1178496512.0, + "63": 1178496512.0, + "64": 1178496512.0, + "65": 1178496512.0, + "66": 1178496512.0, + "67": 1178496512.0, + "68": 1178496512.0, + "69": 1178496512.0, + "70": 1178496512.0, + "71": 1178496512.0, + "72": 1178496512.0, + "73": 1178496512.0, + "74": 1178496512.0, + "75": 1178496512.0, + "76": 1178496512.0, + "77": 1178496512.0, + "78": 1178496512.0, + "79": 1178496512.0, + "80": 1178496512.0, + "81": 1178496512.0, + "82": 1178496512.0, + "83": 1178496512.0, + "84": 1178496512.0, + "85": 1178496512.0, + "86": 1178496512.0, + "87": 1178496512.0, + "88": 1178496512.0, + "89": 1178496512.0, + "90": 1178496512.0, + "91": 1178496512.0, + "92": 1178496512.0, + "93": 1178496512.0, + "94": 1178496512.0, + "95": 1178496512.0, + "96": 1178496512.0, + "97": 1178496512.0, + "98": 1178496512.0, + "99": 1178496512.0, + "100": 1178496512.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": 
"nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": "nan", + "52": 2.54283, + "53": 0.43318, + "54": 0.42496, + "55": 0.42557, + "56": 0.42913, + "57": 0.44479, + "58": 0.45904, + "59": 0.46949, + "60": 0.46125, + "61": 0.46841, + "62": 0.4736, + "63": 0.46017, + "64": 0.45223, + "65": 0.45568, + "66": 0.44984, + "67": 0.44794, + "68": 0.45062, + "69": 0.45415, + "70": 0.46315, + "71": 0.45069, + "72": 0.45122, + "73": 0.45026, + "74": 0.44997, + "75": 0.44929, + "76": 0.45314, + "77": 0.45848, + "78": 0.4566, + "79": 0.45909, + "80": 0.46265, + "81": 0.4592, + "82": 0.47898, + "83": 0.47817, + "84": 0.46757, + "85": 0.46663, + "86": 0.46924, + "87": 0.48331, + "88": 0.46217, + "89": 0.4596, + "90": 0.45471, + "91": 0.45598, + "92": 0.45849, + "93": 0.4626, + "94": 0.46398, + "95": 0.45663, + "96": 0.45814, + "97": 0.45394, + "98": 0.45984, + "99": 0.47284, + "100": 0.46707 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_dev_dgx_gb200.json index 8b51d66847b..52d95069cff 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_dev_dgx_gb200.json @@ -432,106 +432,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 6.76973, - "2": 0.17585, - "3": 0.17368, - "4": 0.16152, - "5": 0.19039, - "6": 0.22444, - "7": 0.19405, - "8": 0.19945, - "9": 0.19849, - "10": 0.19715, - "11": 0.26257, - 
"12": 0.20383, - "13": 0.20656, - "14": 0.16788, - "15": 0.16036, - "16": 0.16063, - "17": 0.28798, - "18": 0.16008, - "19": 0.15785, - "20": 0.15974, - "21": 0.15889, - "22": 0.15943, - "23": 0.15886, - "24": 0.16021, - "25": 0.15915, - "26": 0.16121, - "27": 0.15965, - "28": 0.15981, - "29": 0.16011, - "30": 0.15997, - "31": 0.16048, - "32": 0.15884, - "33": 0.16058, - "34": 0.15945, - "35": 0.15917, - "36": 0.16205, - "37": 0.15947, - "38": 0.16161, - "39": 0.15927, - "40": 0.15876, - "41": 0.159, - "42": 0.47609, - "43": 0.17027, - "44": 0.1644, - "45": 0.16303, - "46": 0.16036, - "47": 0.16029, - "48": 0.16095, - "49": 0.16015, - "50": 0.1603, - "51": 0.21916, - "52": 0.20178, - "53": 0.20344, - "54": 0.22444, - "55": 0.25106, - "56": 0.19763, - "57": 0.21076, - "58": 0.24116, - "59": 0.19345, - "60": 0.1603, - "61": 0.15954, - "62": 0.16062, - "63": 0.20422, - "64": 0.1605, - "65": 0.16211, - "66": 0.16077, - "67": 0.16024, - "68": 0.16099, - "69": 0.16333, - "70": 0.16439, - "71": 0.16108, - "72": 0.16247, - "73": 0.1611, - "74": 0.16235, - "75": 0.16292, - "76": 0.16349, - "77": 0.1636, - "78": 0.16363, - "79": 0.34343, - "80": 0.15998, - "81": 0.15954, - "82": 0.15941, - "83": 0.15965, - "84": 0.16027, - "85": 0.16164, - "86": 0.16113, - "87": 0.16126, - "88": 0.16032, - "89": 0.26526, - "90": 0.15925, - "91": 0.1601, - "92": 0.15972, - "93": 0.15947, - "94": 0.15955, - "95": 0.15981, - "96": 0.15971, - "97": 0.15989, - "98": 0.15959, - "99": 0.15994, - "100": 0.16111 + "1": "nan", + "2": 3.23604, + "3": 0.17137, + "4": 0.15759, + "5": 0.15886, + "6": 0.15986, + "7": 0.16006, + "8": 0.16032, + "9": 0.15956, + "10": 0.15866, + "11": 0.16034, + "12": 0.161, + "13": 0.16092, + "14": 0.16138, + "15": 0.16079, + "16": 0.16106, + "17": 0.16054, + "18": 0.16039, + "19": 0.15987, + "20": 0.1604, + "21": 0.1606, + "22": 0.1605, + "23": 0.16063, + "24": 0.16081, + "25": 0.16081, + "26": 0.16009, + "27": 0.16063, + "28": 0.16056, + "29": 0.16163, + "30": 0.16078, + 
"31": 0.16052, + "32": 0.16157, + "33": 0.16141, + "34": 0.1609, + "35": 0.16067, + "36": 0.164, + "37": 0.16064, + "38": 0.16086, + "39": 0.16108, + "40": 0.1619, + "41": 0.15987, + "42": 0.16141, + "43": 0.16345, + "44": 0.15987, + "45": 0.16151, + "46": 0.16073, + "47": 0.16034, + "48": 0.15782, + "49": 0.15892, + "50": 0.15976, + "51": 0.17905, + "52": 0.16268, + "53": 0.15809, + "54": 0.15783, + "55": 0.1601, + "56": 0.16197, + "57": 0.16434, + "58": 0.16544, + "59": 0.16658, + "60": 0.16487, + "61": 0.16473, + "62": 0.1655, + "63": 0.16592, + "64": 0.1663, + "65": 0.16721, + "66": 0.16644, + "67": 0.16736, + "68": 0.16682, + "69": 0.16612, + "70": 0.1673, + "71": 0.1652, + "72": 0.16908, + "73": 0.16732, + "74": 0.16542, + "75": 0.16546, + "76": 0.16719, + "77": 0.16547, + "78": 0.16719, + "79": 0.16664, + "80": 0.16167, + "81": 0.163, + "82": 0.16309, + "83": 0.16502, + "84": 0.16364, + "85": 0.1648, + "86": 0.16491, + "87": 0.16514, + "88": 0.16442, + "89": 0.16591, + "90": 0.16301, + "91": 0.16462, + "92": 0.16639, + "93": 0.16358, + "94": 0.16489, + "95": 0.16504, + "96": 0.16457, + "97": 0.163, + "98": 0.16359, + "99": 0.16433, + "100": 0.16527 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_dev_dgx_gb200_2nd.json new file mode 100644 index 00000000000..a3bcbb68249 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_dev_dgx_gb200_2nd.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", 
+ "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 9.85898, + "52": 9.75133, + "53": 10.06617, + "54": 9.95613, + "55": 9.89104, + "56": 9.62508, + "57": 9.47981, + "58": 9.83478, + "59": 9.58498, + "60": 9.49806, + "61": 9.69192, + "62": 9.98825, + "63": 9.37824, + "64": 9.76808, + "65": 8.94514, + "66": 9.70125, + "67": 9.37149, + "68": 9.78313, + "69": 9.79923, + "70": 9.7312, + "71": 9.62753, + "72": 9.58452, + "73": 9.48417, + "74": 8.92523, + "75": 9.4118, + "76": 9.0796, + "77": 10.06083, + "78": 9.7215, + "79": 9.38109, + "80": 9.40161, + "81": 9.48468, + "82": 9.70219, + "83": 9.31549, + "84": 9.41786, + "85": 9.61785, + "86": 9.077, + "87": 9.59967, + "88": 9.75356, + "89": 9.60341, + "90": 9.82789, + "91": 9.33668, + "92": 9.36036, + "93": 9.08765, + "94": 8.83052, + "95": 9.5296, + "96": 9.53024, + "97": 9.30627, + "98": 9.67298, + "99": 8.89917, + "100": 9.40828 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": 
"nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 2554.0, + "52": 2466.0, + "53": 2923.0, + "54": 2612.0, + "55": 2351.0, + "56": 2757.0, + "57": 2313.0, + "58": 2798.0, + "59": 2750.0, + "60": 2376.0, + "61": 2848.0, + "62": 2668.0, + "63": 2468.0, + "64": 2818.0, + "65": 2630.0, + "66": 2992.0, + "67": 2802.0, + "68": 2794.0, + "69": 2851.0, + "70": 3059.0, + "71": 2869.0, + "72": 2424.0, + "73": 3035.0, + "74": 2113.0, + "75": 2485.0, + "76": 2782.0, + "77": 3252.0, + "78": 3149.0, + "79": 3192.0, + "80": 3229.0, + "81": 3397.0, + "82": 3297.0, + "83": 2766.0, + "84": 3192.0, + "85": 3206.0, + "86": 2648.0, + "87": 3709.0, + "88": 2962.0, + "89": 3273.0, + "90": 3149.0, + "91": 2825.0, + "92": 3047.0, + "93": 2918.0, + "94": 3432.0, + "95": 3266.0, + "96": 3574.0, + "97": 3190.0, + "98": 3564.0, + "99": 2977.0, + "100": 3249.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 492274176.0, + "52": 492274176.0, + "53": 492274176.0, + "54": 
492274176.0, + "55": 492274176.0, + "56": 492274176.0, + "57": 492274176.0, + "58": 492274176.0, + "59": 492274176.0, + "60": 492274176.0, + "61": 492274176.0, + "62": 492274176.0, + "63": 492274176.0, + "64": 492274176.0, + "65": 492274176.0, + "66": 492274176.0, + "67": 492274176.0, + "68": 492274176.0, + "69": 492274176.0, + "70": 492274176.0, + "71": 492274176.0, + "72": 492274176.0, + "73": 492274176.0, + "74": 492274176.0, + "75": 492274176.0, + "76": 492274176.0, + "77": 492274176.0, + "78": 492274176.0, + "79": 492274176.0, + "80": 492274176.0, + "81": 492274176.0, + "82": 492274176.0, + "83": 492274176.0, + "84": 492274176.0, + "85": 492274176.0, + "86": 492274176.0, + "87": 492274176.0, + "88": 492274176.0, + "89": 492274176.0, + "90": 492274176.0, + "91": 492274176.0, + "92": 492274176.0, + "93": 492274176.0, + "94": 492274176.0, + "95": 492274176.0, + "96": 492274176.0, + "97": 492274176.0, + "98": 492274176.0, + "99": 492274176.0, + "100": 492274176.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 1709489664.0, + "52": 1709490688.0, + "53": 1709490688.0, + "54": 1709490688.0, + "55": 1709490688.0, + "56": 1709490688.0, + "57": 1709490688.0, + "58": 
1709490688.0, + "59": 1709490688.0, + "60": 1709490688.0, + "61": 1709490688.0, + "62": 1709490688.0, + "63": 1709490688.0, + "64": 1709490688.0, + "65": 1709490688.0, + "66": 1709490688.0, + "67": 1709490688.0, + "68": 1709490688.0, + "69": 1709490688.0, + "70": 1709490688.0, + "71": 1709490688.0, + "72": 1709490688.0, + "73": 1709490688.0, + "74": 1709490688.0, + "75": 1709490688.0, + "76": 1709490688.0, + "77": 1709490688.0, + "78": 1709490688.0, + "79": 1709490688.0, + "80": 1709490688.0, + "81": 1709490688.0, + "82": 1709490688.0, + "83": 1709490688.0, + "84": 1709490688.0, + "85": 1709490688.0, + "86": 1709490688.0, + "87": 1709490688.0, + "88": 1709490688.0, + "89": 1709490688.0, + "90": 1709490688.0, + "91": 1709490688.0, + "92": 1709490688.0, + "93": 1709490688.0, + "94": 1709490688.0, + "95": 1709490688.0, + "96": 1709490688.0, + "97": 1709490688.0, + "98": 1709490688.0, + "99": 1709490688.0, + "100": 1709490688.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": "nan", + "52": 3.02213, + "53": 0.17862, + "54": 0.16745, + "55": 0.16648, + "56": 0.16673, + "57": 0.16292, + "58": 0.19638, + "59": 0.16818, + "60": 0.16539, + "61": 0.16364, + "62": 0.16301, + "63": 
0.1601, + "64": 0.16122, + "65": 0.16293, + "66": 0.16244, + "67": 0.16253, + "68": 0.16237, + "69": 0.16026, + "70": 0.17045, + "71": 0.15999, + "72": 0.1709, + "73": 0.16315, + "74": 0.1602, + "75": 0.15985, + "76": 0.15963, + "77": 0.15943, + "78": 0.15987, + "79": 0.16, + "80": 0.16033, + "81": 0.16099, + "82": 0.16037, + "83": 0.16139, + "84": 0.16563, + "85": 0.16553, + "86": 0.16519, + "87": 0.16488, + "88": 0.16176, + "89": 0.16185, + "90": 0.16148, + "91": 0.16234, + "92": 0.1601, + "93": 0.16147, + "94": 0.16081, + "95": 0.16284, + "96": 0.16144, + "97": 0.16225, + "98": 0.16162, + "99": 0.16044, + "100": 0.16202 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2/golden_values_dev_dgx_gb200.json index 0c5b41565c8..9066ef0c241 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2/golden_values_dev_dgx_gb200.json @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 459571712.0, - "2": 459571712.0, - "3": 459571712.0, - "4": 459571712.0, - "5": 459571712.0, - "6": 459571712.0, - "7": 459571712.0, - "8": 459571712.0, - "9": 459571712.0, - "10": 459571712.0, - "11": 459571712.0, - "12": 459571712.0, - "13": 459571712.0, - "14": 459571712.0, - "15": 459571712.0, - "16": 459571712.0, - "17": 459571712.0, - "18": 459571712.0, - "19": 459571712.0, - "20": 459571712.0, - "21": 459571712.0, - "22": 459571712.0, - "23": 459571712.0, - "24": 459571712.0, - "25": 459571712.0, - "26": 459571712.0, - "27": 459571712.0, - "28": 459571712.0, - "29": 459571712.0, - "30": 459571712.0, - "31": 459571712.0, - "32": 459571712.0, - "33": 459571712.0, - "34": 459571712.0, - "35": 459571712.0, - "36": 459571712.0, - "37": 459571712.0, - "38": 459571712.0, - "39": 
459571712.0, - "40": 459571712.0, - "41": 459571712.0, - "42": 459571712.0, - "43": 459571712.0, - "44": 459571712.0, - "45": 459571712.0, - "46": 459571712.0, - "47": 459571712.0, - "48": 459571712.0, - "49": 459571712.0, - "50": 459571712.0 + "1": 463110656.0, + "2": 463110656.0, + "3": 463110656.0, + "4": 463110656.0, + "5": 463110656.0, + "6": 463110656.0, + "7": 463110656.0, + "8": 463110656.0, + "9": 463110656.0, + "10": 463110656.0, + "11": 463110656.0, + "12": 463110656.0, + "13": 463110656.0, + "14": 463110656.0, + "15": 463110656.0, + "16": 463110656.0, + "17": 463110656.0, + "18": 463110656.0, + "19": 463110656.0, + "20": 463110656.0, + "21": 463110656.0, + "22": 463110656.0, + "23": 463110656.0, + "24": 463110656.0, + "25": 463110656.0, + "26": 463110656.0, + "27": 463110656.0, + "28": 463110656.0, + "29": 463110656.0, + "30": 463110656.0, + "31": 463110656.0, + "32": 463110656.0, + "33": 463110656.0, + "34": 463110656.0, + "35": 463110656.0, + "36": 463110656.0, + "37": 463110656.0, + "38": 463110656.0, + "39": 463110656.0, + "40": 463110656.0, + "41": 463110656.0, + "42": 463110656.0, + "43": 463110656.0, + "44": 463110656.0, + "45": 463110656.0, + "46": 463110656.0, + "47": 463110656.0, + "48": 463110656.0, + "49": 463110656.0, + "50": 463110656.0 } }, "mem-max-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 708779008.0, - "2": 882038272.0, - "3": 882562560.0, - "4": 882562560.0, - "5": 882562560.0, - "6": 882562560.0, - "7": 882562560.0, - "8": 882562560.0, - "9": 882562560.0, - "10": 882562560.0, - "11": 882562560.0, - "12": 882562560.0, - "13": 882562560.0, - "14": 882562560.0, - "15": 882562560.0, - "16": 882562560.0, - "17": 882562560.0, - "18": 882562560.0, - "19": 882562560.0, - "20": 882562560.0, - "21": 882562560.0, - "22": 882562560.0, - "23": 882562560.0, - "24": 882562560.0, - "25": 882562560.0, - "26": 882562560.0, - "27": 882562560.0, - "28": 883608576.0, - "29": 883608576.0, - "30": 
883608576.0, - "31": 883608576.0, - "32": 883608576.0, - "33": 883608576.0, - "34": 883608576.0, - "35": 883608576.0, - "36": 883608576.0, - "37": 883608576.0, - "38": 883608576.0, - "39": 883608576.0, - "40": 883608576.0, - "41": 883608576.0, - "42": 883608576.0, - "43": 883608576.0, - "44": 883608576.0, - "45": 883608576.0, - "46": 883608576.0, - "47": 883608576.0, - "48": 883608576.0, - "49": 883608576.0, - "50": 883608576.0 + "1": 704587264.0, + "2": 887671296.0, + "3": 887671296.0, + "4": 887671296.0, + "5": 887671296.0, + "6": 887671296.0, + "7": 887671296.0, + "8": 887671296.0, + "9": 887671296.0, + "10": 887671296.0, + "11": 887674368.0, + "12": 887674368.0, + "13": 887674368.0, + "14": 887674368.0, + "15": 887674368.0, + "16": 887674368.0, + "17": 887674368.0, + "18": 887674368.0, + "19": 887674368.0, + "20": 887674368.0, + "21": 887674368.0, + "22": 887674368.0, + "23": 887674368.0, + "24": 887674368.0, + "25": 887674368.0, + "26": 887674368.0, + "27": 887674368.0, + "28": 887674368.0, + "29": 887674368.0, + "30": 887674368.0, + "31": 887674368.0, + "32": 887674368.0, + "33": 887674368.0, + "34": 887674368.0, + "35": 887674368.0, + "36": 887674368.0, + "37": 887674368.0, + "38": 887674368.0, + "39": 887674368.0, + "40": 887674368.0, + "41": 887674368.0, + "42": 887674368.0, + "43": 887674368.0, + "44": 887674368.0, + "45": 887674368.0, + "46": 887674368.0, + "47": 887674368.0, + "48": 887674368.0, + "49": 887674368.0, + "50": 887674368.0 } }, "iteration-time": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 14.32961, - "2": 0.54797, - "3": 0.51657, - "4": 0.52599, - "5": 0.61023, - "6": 0.69053, - "7": 0.5446, - "8": 0.51966, - "9": 0.52377, - "10": 0.52901, - "11": 0.52742, - "12": 0.53394, - "13": 0.52346, - "14": 0.52257, - "15": 0.51751, - "16": 0.48338, - "17": 0.48757, - "18": 0.52092, - "19": 0.49857, - "20": 0.49815, - "21": 0.49063, - "22": 0.49632, - "23": 0.4849, - "24": 0.49986, - "25": 0.48483, - "26": 0.49826, 
- "27": 0.48315, - "28": 0.4875, - "29": 0.498, - "30": 0.49611, - "31": 0.4984, - "32": 0.5284, - "33": 0.50276, - "34": 0.49132, - "35": 0.49787, - "36": 0.4947, - "37": 0.48747, - "38": 0.4952, - "39": 0.49214, - "40": 0.49151, - "41": 0.49593, - "42": 0.49285, - "43": 0.49745, - "44": 0.48784, - "45": 0.51195, - "46": 0.53565, - "47": 0.53921, - "48": 0.53697, - "49": 0.5397, - "50": 0.55869 + "1": "nan", + "2": 3.90343, + "3": 0.46196, + "4": 0.44553, + "5": 0.44562, + "6": 0.4419, + "7": 0.44421, + "8": 0.44706, + "9": 0.44217, + "10": 0.44007, + "11": 0.42363, + "12": 0.42376, + "13": 0.41997, + "14": 0.42079, + "15": 0.49345, + "16": 0.51617, + "17": 0.52444, + "18": 0.52822, + "19": 0.53053, + "20": 0.52743, + "21": 0.5289, + "22": 0.52655, + "23": 0.52934, + "24": 0.52619, + "25": 0.52343, + "26": 0.52071, + "27": 0.50241, + "28": 0.48165, + "29": 0.47009, + "30": 0.46549, + "31": 0.46432, + "32": 0.47167, + "33": 0.53326, + "34": 0.49042, + "35": 0.46143, + "36": 0.45859, + "37": 0.45093, + "38": 0.45152, + "39": 0.4443, + "40": 0.44393, + "41": 0.43638, + "42": 0.44204, + "43": 0.43923, + "44": 0.44115, + "45": 0.44017, + "46": 0.47447, + "47": 0.49352, + "48": 0.4943, + "49": 0.49375, + "50": 0.49263 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss/golden_values_dev_dgx_gb200.json index b9bbabe5437..fea64911d52 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss/golden_values_dev_dgx_gb200.json @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 460096000.0, - "2": 460096000.0, - "3": 460096000.0, - "4": 460096000.0, - "5": 460096000.0, - "6": 
460096000.0, - "7": 460096000.0, - "8": 460096000.0, - "9": 460096000.0, - "10": 460096000.0, - "11": 460096000.0, - "12": 460096000.0, - "13": 460096000.0, - "14": 460096000.0, - "15": 460096000.0, - "16": 460096000.0, - "17": 460096000.0, - "18": 460096000.0, - "19": 460096000.0, - "20": 460096000.0, - "21": 460096000.0, - "22": 460096000.0, - "23": 460096000.0, - "24": 460096000.0, - "25": 460096000.0, - "26": 460096000.0, - "27": 460096000.0, - "28": 460096000.0, - "29": 460096000.0, - "30": 460096000.0, - "31": 460096000.0, - "32": 460096000.0, - "33": 460096000.0, - "34": 460096000.0, - "35": 460096000.0, - "36": 460096000.0, - "37": 460096000.0, - "38": 460096000.0, - "39": 460096000.0, - "40": 460096000.0, - "41": 460096000.0, - "42": 460096000.0, - "43": 460096000.0, - "44": 460096000.0, - "45": 460096000.0, - "46": 460096000.0, - "47": 460096000.0, - "48": 460096000.0, - "49": 460096000.0, - "50": 460096000.0 + "1": 463110656.0, + "2": 463110656.0, + "3": 463110656.0, + "4": 463110656.0, + "5": 463110656.0, + "6": 463110656.0, + "7": 463110656.0, + "8": 463110656.0, + "9": 463110656.0, + "10": 463110656.0, + "11": 463110656.0, + "12": 463110656.0, + "13": 463110656.0, + "14": 463110656.0, + "15": 463110656.0, + "16": 463110656.0, + "17": 463110656.0, + "18": 463110656.0, + "19": 463110656.0, + "20": 463110656.0, + "21": 463110656.0, + "22": 463110656.0, + "23": 463110656.0, + "24": 463110656.0, + "25": 463110656.0, + "26": 463110656.0, + "27": 463110656.0, + "28": 463110656.0, + "29": 463110656.0, + "30": 463110656.0, + "31": 463110656.0, + "32": 463110656.0, + "33": 463110656.0, + "34": 463110656.0, + "35": 463110656.0, + "36": 463110656.0, + "37": 463110656.0, + "38": 463110656.0, + "39": 463110656.0, + "40": 463110656.0, + "41": 463110656.0, + "42": 463110656.0, + "43": 463110656.0, + "44": 463110656.0, + "45": 463110656.0, + "46": 463110656.0, + "47": 463110656.0, + "48": 463110656.0, + "49": 463110656.0, + "50": 463110656.0 } }, 
"mem-max-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 704587264.0, - "2": 885184000.0, - "3": 885184000.0, - "4": 885184000.0, - "5": 885184000.0, - "6": 885184000.0, - "7": 886231040.0, - "8": 886231552.0, - "9": 886231552.0, - "10": 886231552.0, - "11": 886231552.0, - "12": 886231552.0, - "13": 886231552.0, - "14": 886231552.0, - "15": 886231552.0, - "16": 886231552.0, - "17": 886231552.0, - "18": 886231552.0, - "19": 886231552.0, - "20": 886231552.0, - "21": 886231552.0, - "22": 886231552.0, - "23": 886231552.0, - "24": 886231552.0, - "25": 886231552.0, - "26": 886231552.0, - "27": 886232064.0, - "28": 886232064.0, - "29": 886232064.0, - "30": 886232064.0, - "31": 886232064.0, - "32": 886232064.0, - "33": 886232064.0, - "34": 886232064.0, - "35": 886232064.0, - "36": 886232064.0, - "37": 886232064.0, - "38": 886232064.0, - "39": 886232064.0, - "40": 886232064.0, - "41": 886232064.0, - "42": 886232064.0, - "43": 886232064.0, - "44": 886232064.0, - "45": 886232064.0, - "46": 886232064.0, - "47": 886232064.0, - "48": 886232064.0, - "49": 886232064.0, - "50": 886232064.0 + "1": 707730944.0, + "2": 887671296.0, + "3": 887671296.0, + "4": 887671296.0, + "5": 887671296.0, + "6": 887671296.0, + "7": 887671296.0, + "8": 887671296.0, + "9": 887671296.0, + "10": 887671296.0, + "11": 887671296.0, + "12": 887671296.0, + "13": 887671296.0, + "14": 887671296.0, + "15": 887671296.0, + "16": 887671296.0, + "17": 887671296.0, + "18": 887671296.0, + "19": 887671296.0, + "20": 887671296.0, + "21": 887671296.0, + "22": 887671296.0, + "23": 887671296.0, + "24": 887671296.0, + "25": 887671296.0, + "26": 887671296.0, + "27": 887671296.0, + "28": 887671296.0, + "29": 887671296.0, + "30": 887671296.0, + "31": 887671296.0, + "32": 887671296.0, + "33": 887671296.0, + "34": 887671296.0, + "35": 887671296.0, + "36": 887671296.0, + "37": 887671296.0, + "38": 887671296.0, + "39": 887671296.0, + "40": 887671296.0, + "41": 887671296.0, + "42": 
887671296.0, + "43": 887671296.0, + "44": 887671296.0, + "45": 887671296.0, + "46": 887671296.0, + "47": 887671296.0, + "48": 887671296.0, + "49": 887671296.0, + "50": 887671296.0 } }, "iteration-time": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 13.9895, - "2": 0.51807, - "3": 0.49599, - "4": 0.47064, - "5": 0.48452, - "6": 0.41822, - "7": 0.41485, - "8": 0.4156, - "9": 0.43484, - "10": 0.40847, - "11": 0.5122, - "12": 0.40698, - "13": 0.40749, - "14": 0.49304, - "15": 0.49799, - "16": 0.40895, - "17": 0.41708, - "18": 0.44007, - "19": 0.47716, - "20": 0.47638, - "21": 0.41659, - "22": 0.4125, - "23": 0.41163, - "24": 0.46826, - "25": 0.46402, - "26": 0.42136, - "27": 0.4113, - "28": 0.40612, - "29": 0.61576, - "30": 0.74613, - "31": 0.47263, - "32": 0.48955, - "33": 0.72478, - "34": 0.5927, - "35": 0.6127, - "36": 0.44041, - "37": 0.42799, - "38": 0.46386, - "39": 0.42311, - "40": 0.42142, - "41": 0.42074, - "42": 0.42015, - "43": 0.43664, - "44": 0.41727, - "45": 0.41517, - "46": 0.42041, - "47": 0.58839, - "48": 0.4946, - "49": 0.5046, - "50": 0.50846 + "1": "nan", + "2": 3.59451, + "3": 0.48132, + "4": 0.46344, + "5": 0.46723, + "6": 0.47347, + "7": 0.48756, + "8": 0.49394, + "9": 0.49358, + "10": 0.495, + "11": 0.49567, + "12": 0.49577, + "13": 0.49608, + "14": 0.49247, + "15": 0.49553, + "16": 0.49581, + "17": 0.49335, + "18": 0.5003, + "19": 0.49904, + "20": 0.50095, + "21": 0.49831, + "22": 0.49726, + "23": 0.49738, + "24": 0.50198, + "25": 0.49901, + "26": 0.50161, + "27": 0.50183, + "28": 0.49371, + "29": 0.49579, + "30": 0.49585, + "31": 0.49614, + "32": 0.49424, + "33": 0.49565, + "34": 0.49645, + "35": 0.50022, + "36": 0.50076, + "37": 0.49676, + "38": 0.4972, + "39": 0.49438, + "40": 0.49751, + "41": 0.49485, + "42": 0.49564, + "43": 0.4958, + "44": 0.49763, + "45": 0.49766, + "46": 0.50005, + "47": 0.49885, + "48": 0.50156, + "49": 0.50235, + "50": 0.49766 } } } \ No newline at end of file diff --git 
a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/golden_values_dev_dgx_gb200.json index 8175fe3e6be..bbc822686c6 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/golden_values_dev_dgx_gb200.json @@ -6,54 +6,54 @@ "values": { "1": 10.86244, "2": 10.88582, - "3": 10.84736, - "4": 10.85573, - "5": 10.86003, + "3": 10.84735, + "4": 10.85571, + "5": 10.86002, "6": 10.87733, - "7": 10.8656, - "8": 10.84911, + "7": 10.86562, + "8": 10.84914, "9": 10.86609, - "10": 10.82475, - "11": 10.8562, - "12": 10.85373, - "13": 10.86788, - "14": 10.87111, - "15": 10.8223, - "16": 10.79994, - "17": 10.77431, - "18": 10.78343, - "19": 10.79309, - "20": 10.68225, - "21": 10.64708, - "22": 10.50918, - "23": 10.66826, - "24": 10.54194, - "25": 10.49281, - "26": 10.55932, - "27": 10.54239, - "28": 10.51128, + "10": 10.82473, + "11": 10.85616, + "12": 10.85369, + "13": 10.86791, + "14": 10.87114, + "15": 10.82234, + "16": 10.79991, + "17": 10.7743, + "18": 10.78346, + "19": 10.79307, + "20": 10.68222, + "21": 10.64709, + "22": 10.50919, + "23": 10.6683, + "24": 10.54196, + "25": 10.49283, + "26": 10.55931, + "27": 10.5424, + "28": 10.51132, "29": 10.53257, - "30": 10.28989, - "31": 10.02853, - "32": 10.3888, - "33": 10.39592, + "30": 10.28992, + "31": 10.02852, + "32": 10.38881, + "33": 10.39597, "34": 10.13449, - "35": 10.18931, - "36": 10.13352, - "37": 10.27378, - "38": 10.1075, + "35": 10.18929, + "36": 10.13354, + "37": 10.27384, + "38": 10.10753, "39": 10.34011, - "40": 9.98542, - "41": 10.06415, - "42": 10.1375, - "43": 9.73383, - "44": 9.86311, - "45": 9.73726, - "46": 
9.71341, - "47": 10.07757, - "48": 9.76762, - "49": 9.4199, - "50": 9.81687 + "40": 9.98544, + "41": 10.06413, + "42": 10.13747, + "43": 9.73382, + "44": 9.86306, + "45": 9.73725, + "46": 9.71343, + "47": 10.07755, + "48": 9.76765, + "49": 9.41988, + "50": 9.81692 } }, "num-zeros": { @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 567.0, - "2": 609.0, - "3": 638.0, - "4": 657.0, - "5": 654.0, - "6": 637.0, - "7": 614.0, - "8": 599.0, - "9": 637.0, - "10": 517.0, - "11": 673.0, - "12": 640.0, - "13": 685.0, - "14": 609.0, - "15": 596.0, - "16": 653.0, - "17": 590.0, - "18": 559.0, - "19": 675.0, - "20": 598.0, - "21": 699.0, - "22": 631.0, - "23": 650.0, - "24": 625.0, - "25": 591.0, - "26": 627.0, - "27": 684.0, - "28": 679.0, - "29": 748.0, - "30": 703.0, - "31": 626.0, - "32": 724.0, - "33": 753.0, - "34": 658.0, - "35": 727.0, - "36": 730.0, - "37": 861.0, - "38": 778.0, - "39": 899.0, - "40": 845.0, - "41": 770.0, - "42": 819.0, - "43": 716.0, - "44": 793.0, - "45": 770.0, - "46": 849.0, - "47": 900.0, - "48": 873.0, - "49": 852.0, - "50": 888.0 + "1": 593.0, + "2": 604.0, + "3": 621.0, + "4": 634.0, + "5": 658.0, + "6": 640.0, + "7": 624.0, + "8": 576.0, + "9": 622.0, + "10": 481.0, + "11": 703.0, + "12": 606.0, + "13": 667.0, + "14": 652.0, + "15": 654.0, + "16": 625.0, + "17": 598.0, + "18": 534.0, + "19": 627.0, + "20": 616.0, + "21": 720.0, + "22": 601.0, + "23": 647.0, + "24": 615.0, + "25": 577.0, + "26": 654.0, + "27": 661.0, + "28": 705.0, + "29": 681.0, + "30": 725.0, + "31": 613.0, + "32": 766.0, + "33": 801.0, + "34": 690.0, + "35": 697.0, + "36": 733.0, + "37": 839.0, + "38": 806.0, + "39": 841.0, + "40": 858.0, + "41": 837.0, + "42": 812.0, + "43": 696.0, + "44": 819.0, + "45": 753.0, + "46": 840.0, + "47": 921.0, + "48": 863.0, + "49": 850.0, + "50": 830.0 } }, "mem-allocated-bytes": { @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 461144576.0, - "2": 461144576.0, - "3": 461144576.0, - "4": 
461144576.0, - "5": 461144576.0, - "6": 461144576.0, - "7": 461144576.0, - "8": 461144576.0, - "9": 461144576.0, - "10": 461144576.0, - "11": 461144576.0, - "12": 461144576.0, - "13": 461144576.0, - "14": 461144576.0, - "15": 461144576.0, - "16": 461144576.0, - "17": 461144576.0, - "18": 461144576.0, - "19": 461144576.0, - "20": 461144576.0, - "21": 461144576.0, - "22": 461144576.0, - "23": 461144576.0, - "24": 461144576.0, - "25": 461144576.0, - "26": 461144576.0, - "27": 461144576.0, - "28": 461144576.0, - "29": 461144576.0, - "30": 461144576.0, - "31": 461144576.0, - "32": 461144576.0, - "33": 461144576.0, - "34": 461144576.0, - "35": 461144576.0, - "36": 461144576.0, - "37": 461144576.0, - "38": 461144576.0, - "39": 461144576.0, - "40": 461144576.0, - "41": 461144576.0, - "42": 461144576.0, - "43": 461144576.0, - "44": 461144576.0, - "45": 461144576.0, - "46": 461144576.0, - "47": 461144576.0, - "48": 461144576.0, - "49": 461144576.0, - "50": 461144576.0 + "1": 460882432.0, + "2": 460882432.0, + "3": 460882432.0, + "4": 460882432.0, + "5": 460882432.0, + "6": 460882432.0, + "7": 460882432.0, + "8": 460882432.0, + "9": 460882432.0, + "10": 460882432.0, + "11": 460882432.0, + "12": 460882432.0, + "13": 460882432.0, + "14": 460882432.0, + "15": 460882432.0, + "16": 460882432.0, + "17": 460882432.0, + "18": 460882432.0, + "19": 460882432.0, + "20": 460882432.0, + "21": 460882432.0, + "22": 460882432.0, + "23": 460882432.0, + "24": 460882432.0, + "25": 460882432.0, + "26": 460882432.0, + "27": 460882432.0, + "28": 460882432.0, + "29": 460882432.0, + "30": 460882432.0, + "31": 460882432.0, + "32": 460882432.0, + "33": 460882432.0, + "34": 460882432.0, + "35": 460882432.0, + "36": 460882432.0, + "37": 460882432.0, + "38": 460882432.0, + "39": 460882432.0, + "40": 460882432.0, + "41": 460882432.0, + "42": 460882432.0, + "43": 460882432.0, + "44": 460882432.0, + "45": 460882432.0, + "46": 460882432.0, + "47": 460882432.0, + "48": 460882432.0, + "49": 460882432.0, + 
"50": 460882432.0 } }, "mem-max-allocated-bytes": { @@ -176,55 +176,55 @@ "step_interval": 1, "values": { "1": 705635840.0, - "2": 884659712.0, - "3": 885183488.0, - "4": 885183488.0, - "5": 885707264.0, - "6": 885707264.0, - "7": 885707264.0, - "8": 885707264.0, - "9": 885707264.0, - "10": 885707264.0, - "11": 885707264.0, - "12": 885707264.0, - "13": 885707264.0, - "14": 885707264.0, - "15": 885707264.0, - "16": 885707264.0, - "17": 885707264.0, - "18": 885707264.0, - "19": 885707264.0, - "20": 885707264.0, - "21": 885707264.0, - "22": 885707264.0, - "23": 885707264.0, - "24": 885707264.0, - "25": 885707264.0, - "26": 885707264.0, - "27": 885707264.0, - "28": 885707264.0, - "29": 885707264.0, - "30": 885708288.0, - "31": 885708288.0, - "32": 885708288.0, - "33": 885708288.0, - "34": 885708288.0, - "35": 885708288.0, - "36": 885708288.0, - "37": 885708288.0, - "38": 885708288.0, - "39": 885708288.0, - "40": 885708288.0, - "41": 885708288.0, - "42": 885708288.0, - "43": 885708288.0, - "44": 885708288.0, - "45": 885708288.0, - "46": 885708288.0, - "47": 885708288.0, - "48": 885708288.0, - "49": 885708288.0, - "50": 885708288.0 + "2": 883348992.0, + "3": 883348992.0, + "4": 883348992.0, + "5": 883348992.0, + "6": 883348992.0, + "7": 883348992.0, + "8": 883348992.0, + "9": 883348992.0, + "10": 883348992.0, + "11": 883348992.0, + "12": 883348992.0, + "13": 883348992.0, + "14": 883348992.0, + "15": 883348992.0, + "16": 883348992.0, + "17": 883348992.0, + "18": 883348992.0, + "19": 883348992.0, + "20": 885443584.0, + "21": 885445120.0, + "22": 885445120.0, + "23": 885445120.0, + "24": 885445120.0, + "25": 885445120.0, + "26": 885445120.0, + "27": 885445120.0, + "28": 885445120.0, + "29": 885445120.0, + "30": 885445120.0, + "31": 885446144.0, + "32": 885446144.0, + "33": 885446144.0, + "34": 885446144.0, + "35": 885446144.0, + "36": 885446144.0, + "37": 885446144.0, + "38": 885446144.0, + "39": 885446144.0, + "40": 885446144.0, + "41": 885446144.0, + "42": 885446144.0, + 
"43": 885446144.0, + "44": 885446144.0, + "45": 885446144.0, + "46": 885446144.0, + "47": 885446144.0, + "48": 885446144.0, + "49": 885446144.0, + "50": 885446144.0 } }, "iteration-time": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 14.21093, - "2": 0.56501, - "3": 0.71491, - "4": 0.53313, - "5": 0.43082, - "6": 0.4637, - "7": 0.40802, - "8": 0.46193, - "9": 0.40155, - "10": 0.40252, - "11": 0.52711, - "12": 0.4035, - "13": 0.40765, - "14": 0.40187, - "15": 0.40322, - "16": 0.40497, - "17": 0.40698, - "18": 0.40153, - "19": 0.46487, - "20": 0.40131, - "21": 0.4044, - "22": 0.40166, - "23": 0.40177, - "24": 0.40507, - "25": 0.405, - "26": 0.40144, - "27": 0.40453, - "28": 0.40108, - "29": 0.4052, - "30": 0.40603, - "31": 0.40719, - "32": 0.40638, - "33": 0.40514, - "34": 0.44714, - "35": 0.40534, - "36": 0.40221, - "37": 0.3984, - "38": 0.40367, - "39": 0.40221, - "40": 0.43747, - "41": 0.40384, - "42": 0.40404, - "43": 0.40132, - "44": 0.40047, - "45": 0.40017, - "46": 0.40235, - "47": 0.39964, - "48": 0.39919, - "49": 0.40337, - "50": 0.48503 + "1": "nan", + "2": 3.8465, + "3": 0.43585, + "4": 0.41053, + "5": 0.40923, + "6": 0.40815, + "7": 0.41221, + "8": 0.41419, + "9": 0.41058, + "10": 0.40643, + "11": 0.40558, + "12": 0.40602, + "13": 0.4063, + "14": 0.40517, + "15": 0.40811, + "16": 0.40555, + "17": 0.40549, + "18": 0.40655, + "19": 0.40455, + "20": 0.40661, + "21": 0.40594, + "22": 0.406, + "23": 0.40409, + "24": 0.40593, + "25": 0.40476, + "26": 0.40466, + "27": 0.40486, + "28": 0.40491, + "29": 0.41169, + "30": 0.40768, + "31": 0.40772, + "32": 0.40874, + "33": 0.40861, + "34": 0.40706, + "35": 0.40837, + "36": 0.40765, + "37": 0.40963, + "38": 0.40873, + "39": 0.40653, + "40": 0.4068, + "41": 0.40742, + "42": 0.40739, + "43": 0.43116, + "44": 0.40318, + "45": 0.41555, + "46": 0.40362, + "47": 0.40203, + "48": 0.40164, + "49": 0.40396, + "50": 0.40607 } } } \ No newline at end of file diff --git 
a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last/golden_values_dev_dgx_gb200.json index aa1e18f88cb..724f3a7c2b8 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last/golden_values_dev_dgx_gb200.json @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 459571712.0, - "2": 459571712.0, - "3": 459571712.0, - "4": 459571712.0, - "5": 459571712.0, - "6": 459571712.0, - "7": 459571712.0, - "8": 459571712.0, - "9": 459571712.0, - "10": 459571712.0, - "11": 459571712.0, - "12": 459571712.0, - "13": 459571712.0, - "14": 459571712.0, - "15": 459571712.0, - "16": 459571712.0, - "17": 459571712.0, - "18": 459571712.0, - "19": 459571712.0, - "20": 459571712.0, - "21": 459571712.0, - "22": 459571712.0, - "23": 459571712.0, - "24": 459571712.0, - "25": 459571712.0, - "26": 459571712.0, - "27": 459571712.0, - "28": 459571712.0, - "29": 459571712.0, - "30": 459571712.0, - "31": 459571712.0, - "32": 459571712.0, - "33": 459571712.0, - "34": 459571712.0, - "35": 459571712.0, - "36": 459571712.0, - "37": 459571712.0, - "38": 459571712.0, - "39": 459571712.0, - "40": 459571712.0, - "41": 459571712.0, - "42": 459571712.0, - "43": 459571712.0, - "44": 459571712.0, - "45": 459571712.0, - "46": 459571712.0, - "47": 459571712.0, - "48": 459571712.0, - "49": 459571712.0, - "50": 459571712.0 + "1": 460882432.0, + "2": 460882432.0, + "3": 460882432.0, + "4": 460882432.0, + "5": 460882432.0, + "6": 460882432.0, + "7": 460882432.0, + "8": 460882432.0, + "9": 460882432.0, + "10": 460882432.0, + "11": 460882432.0, + "12": 460882432.0, + "13": 460882432.0, + "14": 460882432.0, + "15": 
460882432.0, + "16": 460882432.0, + "17": 460882432.0, + "18": 460882432.0, + "19": 460882432.0, + "20": 460882432.0, + "21": 460882432.0, + "22": 460882432.0, + "23": 460882432.0, + "24": 460882432.0, + "25": 460882432.0, + "26": 460882432.0, + "27": 460882432.0, + "28": 460882432.0, + "29": 460882432.0, + "30": 460882432.0, + "31": 460882432.0, + "32": 460882432.0, + "33": 460882432.0, + "34": 460882432.0, + "35": 460882432.0, + "36": 460882432.0, + "37": 460882432.0, + "38": 460882432.0, + "39": 460882432.0, + "40": 460882432.0, + "41": 460882432.0, + "42": 460882432.0, + "43": 460882432.0, + "44": 460882432.0, + "45": 460882432.0, + "46": 460882432.0, + "47": 460882432.0, + "48": 460882432.0, + "49": 460882432.0, + "50": 460882432.0 } }, "mem-max-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 708781568.0, - "2": 885184000.0, - "3": 885184000.0, - "4": 885184000.0, - "5": 885184000.0, - "6": 885184000.0, - "7": 885184000.0, - "8": 885184000.0, - "9": 885184000.0, - "10": 885184000.0, - "11": 885184000.0, - "12": 885184000.0, - "13": 885184000.0, - "14": 885184000.0, - "15": 885184000.0, - "16": 885184000.0, - "17": 885184000.0, - "18": 885184000.0, - "19": 885184000.0, - "20": 885184000.0, - "21": 886231552.0, - "22": 886231552.0, - "23": 886231552.0, - "24": 886231552.0, - "25": 886231552.0, - "26": 886231552.0, - "27": 886231552.0, - "28": 886231552.0, - "29": 886231552.0, - "30": 886231552.0, - "31": 886231552.0, - "32": 886231552.0, - "33": 886231552.0, - "34": 886231552.0, - "35": 886231552.0, - "36": 886231552.0, - "37": 886231552.0, - "38": 886231552.0, - "39": 886231552.0, - "40": 886231552.0, - "41": 886231552.0, - "42": 886231552.0, - "43": 886231552.0, - "44": 886231552.0, - "45": 886231552.0, - "46": 886231552.0, - "47": 886231552.0, - "48": 886231552.0, - "49": 886231552.0, - "50": 886231552.0 + "1": 706684416.0, + "2": 885445632.0, + "3": 885445632.0, + "4": 885445632.0, + "5": 885445632.0, + "6": 
885445632.0, + "7": 885445632.0, + "8": 885445632.0, + "9": 885445632.0, + "10": 885445632.0, + "11": 885445632.0, + "12": 885445632.0, + "13": 885445632.0, + "14": 885445632.0, + "15": 885445632.0, + "16": 885445632.0, + "17": 885445632.0, + "18": 885445632.0, + "19": 885445632.0, + "20": 885445632.0, + "21": 885445632.0, + "22": 885445632.0, + "23": 885445632.0, + "24": 885445632.0, + "25": 885445632.0, + "26": 885445632.0, + "27": 885445632.0, + "28": 885445632.0, + "29": 885445632.0, + "30": 885446144.0, + "31": 885446144.0, + "32": 885446144.0, + "33": 885446144.0, + "34": 885446144.0, + "35": 885446144.0, + "36": 885446144.0, + "37": 885446144.0, + "38": 885446144.0, + "39": 885446144.0, + "40": 885446144.0, + "41": 885446144.0, + "42": 885446144.0, + "43": 885446144.0, + "44": 885446144.0, + "45": 885446144.0, + "46": 885446144.0, + "47": 885446144.0, + "48": 885446144.0, + "49": 885446144.0, + "50": 885446144.0 } }, "iteration-time": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 14.83536, - "2": 0.50436, - "3": 0.49153, - "4": 0.51839, - "5": 0.41963, - "6": 0.42593, - "7": 0.50539, - "8": 0.43728, - "9": 0.43214, - "10": 0.43276, - "11": 0.43243, - "12": 0.64271, - "13": 0.48613, - "14": 0.47822, - "15": 0.4732, - "16": 0.43011, - "17": 0.49091, - "18": 0.4264, - "19": 0.46987, - "20": 0.48787, - "21": 0.48533, - "22": 0.49433, - "23": 0.42402, - "24": 0.45662, - "25": 0.48851, - "26": 0.55798, - "27": 0.49442, - "28": 0.46841, - "29": 0.45193, - "30": 0.42664, - "31": 0.47172, - "32": 0.42125, - "33": 0.42866, - "34": 0.47761, - "35": 0.42624, - "36": 0.45512, - "37": 0.42405, - "38": 0.45455, - "39": 0.42258, - "40": 0.42354, - "41": 0.42486, - "42": 0.42783, - "43": 0.47508, - "44": 0.42611, - "45": 0.4236, - "46": 0.42862, - "47": 0.42603, - "48": 0.6007, - "49": 0.42833, - "50": 0.42517 + "1": "nan", + "2": 4.33644, + "3": 0.43367, + "4": 0.41796, + "5": 0.42437, + "6": 0.41803, + "7": 0.42777, + "8": 0.49261, + "9": 
0.5259, + "10": 0.5165, + "11": 0.50964, + "12": 0.51307, + "13": 0.5209, + "14": 0.52184, + "15": 0.51919, + "16": 0.52047, + "17": 0.52096, + "18": 0.51364, + "19": 0.52433, + "20": 0.5561, + "21": 0.6836, + "22": 0.64032, + "23": 0.47706, + "24": 0.46456, + "25": 0.46618, + "26": 0.46113, + "27": 0.46089, + "28": 0.45645, + "29": 0.44502, + "30": 0.44476, + "31": 0.44646, + "32": 0.44355, + "33": 0.44206, + "34": 0.4428, + "35": 0.43615, + "36": 0.43343, + "37": 0.4428, + "38": 0.43948, + "39": 0.42992, + "40": 0.44781, + "41": 0.44585, + "42": 0.43409, + "43": 0.42263, + "44": 0.41737, + "45": 0.41789, + "46": 0.41449, + "47": 0.41442, + "48": 0.41697, + "49": 0.41486, + "50": 0.41305 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/golden_values_dev_dgx_gb200.json index 8858c8ab59e..bb38c983224 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/golden_values_dev_dgx_gb200.json @@ -6,54 +6,54 @@ "values": { "1": 10.86244, "2": 10.88582, - "3": 10.84735, - "4": 10.85571, - "5": 10.86001, - "6": 10.87728, + "3": 10.84732, + "4": 10.85572, + "5": 10.86003, + "6": 10.87729, "7": 10.86557, "8": 10.84912, - "9": 10.86609, - "10": 10.82474, - "11": 10.8562, - "12": 10.85373, - "13": 10.86791, - "14": 10.87118, - "15": 10.82233, - "16": 10.79992, - "17": 10.77429, - "18": 10.78345, - "19": 10.79312, - "20": 10.68225, - "21": 10.64714, - "22": 10.50918, + "9": 10.86604, + "10": 10.82473, + "11": 10.85617, + "12": 10.85371, + "13": 10.86788, + "14": 10.87113, + "15": 
10.82235, + "16": 10.79993, + "17": 10.77433, + "18": 10.78348, + "19": 10.79308, + "20": 10.68227, + "21": 10.6471, + "22": 10.50922, "23": 10.66831, - "24": 10.54193, + "24": 10.54194, "25": 10.49281, - "26": 10.5593, - "27": 10.54238, - "28": 10.51129, - "29": 10.53257, - "30": 10.28987, - "31": 10.02852, - "32": 10.38878, - "33": 10.39598, - "34": 10.13455, - "35": 10.18928, - "36": 10.13354, - "37": 10.2738, - "38": 10.1075, - "39": 10.34012, - "40": 9.9854, - "41": 10.06415, + "26": 10.55932, + "27": 10.54243, + "28": 10.51131, + "29": 10.53254, + "30": 10.28988, + "31": 10.02851, + "32": 10.3888, + "33": 10.39597, + "34": 10.13451, + "35": 10.18926, + "36": 10.13351, + "37": 10.27379, + "38": 10.10746, + "39": 10.34007, + "40": 9.98541, + "41": 10.06416, "42": 10.13748, - "43": 9.73384, - "44": 9.86308, - "45": 9.73722, + "43": 9.73386, + "44": 9.86309, + "45": 9.73718, "46": 9.71345, - "47": 10.07752, - "48": 9.76768, - "49": 9.4199, - "50": 9.81691 + "47": 10.07751, + "48": 9.76767, + "49": 9.41988, + "50": 9.81692 } }, "num-zeros": { @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 575.0, - "2": 587.0, - "3": 615.0, - "4": 627.0, - "5": 639.0, - "6": 629.0, - "7": 625.0, - "8": 589.0, - "9": 645.0, - "10": 515.0, - "11": 616.0, - "12": 569.0, - "13": 701.0, - "14": 633.0, - "15": 589.0, - "16": 615.0, - "17": 612.0, - "18": 575.0, - "19": 549.0, - "20": 615.0, - "21": 693.0, - "22": 611.0, - "23": 737.0, - "24": 689.0, - "25": 579.0, - "26": 557.0, - "27": 692.0, - "28": 719.0, - "29": 762.0, - "30": 730.0, - "31": 579.0, + "1": 568.0, + "2": 566.0, + "3": 625.0, + "4": 604.0, + "5": 668.0, + "6": 650.0, + "7": 602.0, + "8": 647.0, + "9": 632.0, + "10": 527.0, + "11": 641.0, + "12": 661.0, + "13": 666.0, + "14": 656.0, + "15": 642.0, + "16": 603.0, + "17": 653.0, + "18": 570.0, + "19": 674.0, + "20": 571.0, + "21": 709.0, + "22": 663.0, + "23": 704.0, + "24": 628.0, + "25": 568.0, + "26": 632.0, + "27": 668.0, + "28": 736.0, + 
"29": 760.0, + "30": 687.0, + "31": 589.0, "32": 740.0, - "33": 766.0, - "34": 683.0, - "35": 705.0, - "36": 709.0, - "37": 810.0, - "38": 771.0, - "39": 872.0, - "40": 846.0, - "41": 757.0, - "42": 789.0, - "43": 766.0, - "44": 833.0, - "45": 738.0, - "46": 870.0, - "47": 891.0, - "48": 874.0, - "49": 857.0, - "50": 875.0 + "33": 772.0, + "34": 713.0, + "35": 753.0, + "36": 731.0, + "37": 873.0, + "38": 762.0, + "39": 836.0, + "40": 864.0, + "41": 780.0, + "42": 847.0, + "43": 740.0, + "44": 822.0, + "45": 718.0, + "46": 826.0, + "47": 890.0, + "48": 852.0, + "49": 872.0, + "50": 869.0 } }, "mem-allocated-bytes": { @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 459571712.0, - "2": 459571712.0, - "3": 459571712.0, - "4": 459571712.0, - "5": 459571712.0, - "6": 459571712.0, - "7": 459571712.0, - "8": 459571712.0, - "9": 459571712.0, - "10": 459571712.0, - "11": 459571712.0, - "12": 459571712.0, - "13": 459571712.0, - "14": 459571712.0, - "15": 459571712.0, - "16": 459571712.0, - "17": 459571712.0, - "18": 459571712.0, - "19": 459571712.0, - "20": 459571712.0, - "21": 459571712.0, - "22": 459571712.0, - "23": 459571712.0, - "24": 459571712.0, - "25": 459571712.0, - "26": 459571712.0, - "27": 459571712.0, - "28": 459571712.0, - "29": 459571712.0, - "30": 459571712.0, - "31": 459571712.0, - "32": 459571712.0, - "33": 459571712.0, - "34": 459571712.0, - "35": 459571712.0, - "36": 459571712.0, - "37": 459571712.0, - "38": 459571712.0, - "39": 459571712.0, - "40": 459571712.0, - "41": 459571712.0, - "42": 459571712.0, - "43": 459571712.0, - "44": 459571712.0, - "45": 459571712.0, - "46": 459571712.0, - "47": 459571712.0, - "48": 459571712.0, - "49": 459571712.0, - "50": 459571712.0 + "1": 460882432.0, + "2": 460882432.0, + "3": 460882432.0, + "4": 460882432.0, + "5": 460882432.0, + "6": 460882432.0, + "7": 460882432.0, + "8": 460882432.0, + "9": 460882432.0, + "10": 460882432.0, + "11": 460882432.0, + "12": 460882432.0, + "13": 460882432.0, + 
"14": 460882432.0, + "15": 460882432.0, + "16": 460882432.0, + "17": 460882432.0, + "18": 460882432.0, + "19": 460882432.0, + "20": 460882432.0, + "21": 460882432.0, + "22": 460882432.0, + "23": 460882432.0, + "24": 460882432.0, + "25": 460882432.0, + "26": 460882432.0, + "27": 460882432.0, + "28": 460882432.0, + "29": 460882432.0, + "30": 460882432.0, + "31": 460882432.0, + "32": 460882432.0, + "33": 460882432.0, + "34": 460882432.0, + "35": 460882432.0, + "36": 460882432.0, + "37": 460882432.0, + "38": 460882432.0, + "39": 460882432.0, + "40": 460882432.0, + "41": 460882432.0, + "42": 460882432.0, + "43": 460882432.0, + "44": 460882432.0, + "45": 460882432.0, + "46": 460882432.0, + "47": 460882432.0, + "48": 460882432.0, + "49": 460882432.0, + "50": 460882432.0 } }, "mem-max-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 704587264.0, - "2": 883611136.0, - "3": 884135424.0, - "4": 884658176.0, - "5": 884658176.0, - "6": 884658176.0, - "7": 884658176.0, - "8": 884658176.0, - "9": 884658688.0, - "10": 884659200.0, - "11": 884659200.0, - "12": 884659200.0, - "13": 884659200.0, - "14": 884659200.0, - "15": 884659200.0, - "16": 884659712.0, - "17": 884659712.0, - "18": 884659712.0, - "19": 884659712.0, - "20": 884659712.0, - "21": 884659712.0, - "22": 884659712.0, - "23": 884659712.0, - "24": 884659712.0, - "25": 884659712.0, - "26": 884659712.0, - "27": 884659712.0, - "28": 884659712.0, - "29": 884659712.0, - "30": 884659712.0, - "31": 884659712.0, - "32": 884659712.0, - "33": 884659712.0, - "34": 884659712.0, - "35": 884659712.0, - "36": 884659712.0, - "37": 884659712.0, - "38": 884659712.0, - "39": 884659712.0, - "40": 884659712.0, - "41": 884659712.0, - "42": 884659712.0, - "43": 884659712.0, - "44": 884659712.0, - "45": 884659712.0, - "46": 884659712.0, - "47": 884659712.0, - "48": 884659712.0, - "49": 884659712.0, - "50": 884659712.0 + "1": 705634816.0, + "2": 884397568.0, + "3": 885443584.0, + "4": 885445632.0, + 
"5": 885445632.0, + "6": 885445632.0, + "7": 885445632.0, + "8": 885445632.0, + "9": 885445632.0, + "10": 885445632.0, + "11": 885445632.0, + "12": 885445632.0, + "13": 885445632.0, + "14": 885445632.0, + "15": 885446144.0, + "16": 885446144.0, + "17": 885446144.0, + "18": 885446144.0, + "19": 885446144.0, + "20": 885446144.0, + "21": 885446144.0, + "22": 885446144.0, + "23": 885446144.0, + "24": 885446144.0, + "25": 885446144.0, + "26": 885446144.0, + "27": 885446144.0, + "28": 885446144.0, + "29": 885446144.0, + "30": 885446144.0, + "31": 885446144.0, + "32": 885446144.0, + "33": 885446144.0, + "34": 885446144.0, + "35": 885446144.0, + "36": 885446144.0, + "37": 885446144.0, + "38": 885446144.0, + "39": 885446144.0, + "40": 885446144.0, + "41": 885446144.0, + "42": 885446144.0, + "43": 885446144.0, + "44": 885446144.0, + "45": 885446144.0, + "46": 885446144.0, + "47": 885446144.0, + "48": 885446144.0, + "49": 885446144.0, + "50": 885446144.0 } }, "iteration-time": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 14.6955, - "2": 0.4755, - "3": 0.50907, - "4": 0.43067, - "5": 0.73714, - "6": 0.4269, - "7": 0.42684, - "8": 0.64221, - "9": 0.48428, - "10": 0.42395, - "11": 0.42943, - "12": 0.49106, - "13": 0.49108, - "14": 0.67522, - "15": 0.42547, - "16": 0.41999, - "17": 0.46662, - "18": 0.45683, - "19": 0.41987, - "20": 0.41746, - "21": 0.41909, - "22": 0.4703, - "23": 0.42675, - "24": 0.62571, - "25": 0.47889, - "26": 0.53722, - "27": 0.49475, - "28": 0.48715, - "29": 0.59996, - "30": 0.4396, - "31": 0.42052, - "32": 0.4463, - "33": 0.45305, - "34": 0.45764, - "35": 0.42178, - "36": 0.4257, - "37": 0.43568, - "38": 0.42736, - "39": 0.42942, - "40": 0.43094, - "41": 0.42609, - "42": 0.42743, - "43": 0.43464, - "44": 0.43647, - "45": 0.46437, - "46": 0.46709, - "47": 0.64826, - "48": 0.44677, - "49": 0.64353, - "50": 0.4369 + "1": "nan", + "2": 4.20529, + "3": 0.43211, + "4": 0.41974, + "5": 0.41788, + "6": 0.41713, + "7": 0.41839, + 
"8": 0.41778, + "9": 0.41756, + "10": 0.4154, + "11": 0.41682, + "12": 0.41539, + "13": 0.41618, + "14": 0.41668, + "15": 0.41894, + "16": 0.41891, + "17": 0.46526, + "18": 0.46951, + "19": 0.48697, + "20": 0.51157, + "21": 0.44025, + "22": 0.41388, + "23": 0.41164, + "24": 0.41655, + "25": 0.41424, + "26": 0.41687, + "27": 0.41162, + "28": 0.41035, + "29": 0.41184, + "30": 0.40989, + "31": 0.41153, + "32": 0.41143, + "33": 0.41324, + "34": 0.41271, + "35": 0.41107, + "36": 0.41053, + "37": 0.41372, + "38": 0.4128, + "39": 0.41377, + "40": 0.41093, + "41": 0.41375, + "42": 0.40814, + "43": 0.40704, + "44": 0.40632, + "45": 0.41014, + "46": 0.41007, + "47": 0.41057, + "48": 0.41002, + "49": 0.4095, + "50": 0.41018 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last/golden_values_dev_dgx_gb200.json index 746c6b2ba10..4e8e9932015 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last/golden_values_dev_dgx_gb200.json @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 459571712.0, - "2": 459571712.0, - "3": 459571712.0, - "4": 459571712.0, - "5": 459571712.0, - "6": 459571712.0, - "7": 459571712.0, - "8": 459571712.0, - "9": 459571712.0, - "10": 459571712.0, - "11": 459571712.0, - "12": 459571712.0, - "13": 459571712.0, - "14": 459571712.0, - "15": 459571712.0, - "16": 459571712.0, - "17": 459571712.0, - "18": 459571712.0, - "19": 459571712.0, - "20": 459571712.0, - "21": 459571712.0, - "22": 459571712.0, - "23": 459571712.0, - "24": 459571712.0, - "25": 459571712.0, - "26": 459571712.0, - "27": 459571712.0, - "28": 459571712.0, - "29": 459571712.0, - "30": 459571712.0, - "31": 459571712.0, - "32": 459571712.0, - 
"33": 459571712.0, - "34": 459571712.0, - "35": 459571712.0, - "36": 459571712.0, - "37": 459571712.0, - "38": 459571712.0, - "39": 459571712.0, - "40": 459571712.0, - "41": 459571712.0, - "42": 459571712.0, - "43": 459571712.0, - "44": 459571712.0, - "45": 459571712.0, - "46": 459571712.0, - "47": 459571712.0, - "48": 459571712.0, - "49": 459571712.0, - "50": 459571712.0 + "1": 460882432.0, + "2": 460882432.0, + "3": 460882432.0, + "4": 460882432.0, + "5": 460882432.0, + "6": 460882432.0, + "7": 460882432.0, + "8": 460882432.0, + "9": 460882432.0, + "10": 460882432.0, + "11": 460882432.0, + "12": 460882432.0, + "13": 460882432.0, + "14": 460882432.0, + "15": 460882432.0, + "16": 460882432.0, + "17": 460882432.0, + "18": 460882432.0, + "19": 460882432.0, + "20": 460882432.0, + "21": 460882432.0, + "22": 460882432.0, + "23": 460882432.0, + "24": 460882432.0, + "25": 460882432.0, + "26": 460882432.0, + "27": 460882432.0, + "28": 460882432.0, + "29": 460882432.0, + "30": 460882432.0, + "31": 460882432.0, + "32": 460882432.0, + "33": 460882432.0, + "34": 460882432.0, + "35": 460882432.0, + "36": 460882432.0, + "37": 460882432.0, + "38": 460882432.0, + "39": 460882432.0, + "40": 460882432.0, + "41": 460882432.0, + "42": 460882432.0, + "43": 460882432.0, + "44": 460882432.0, + "45": 460882432.0, + "46": 460882432.0, + "47": 460882432.0, + "48": 460882432.0, + "49": 460882432.0, + "50": 460882432.0 } }, "mem-max-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 708781568.0, - "2": 885184000.0, - "3": 885184000.0, - "4": 885184000.0, - "5": 885184000.0, - "6": 885184000.0, - "7": 885184000.0, - "8": 885184000.0, - "9": 885184000.0, - "10": 885184000.0, - "11": 885184000.0, - "12": 885184000.0, - "13": 885184000.0, - "14": 885184000.0, - "15": 885184000.0, - "16": 885184000.0, - "17": 885184000.0, - "18": 885184000.0, - "19": 885184000.0, - "20": 885184000.0, - "21": 885184000.0, - "22": 885184000.0, - "23": 885184000.0, - 
"24": 885184000.0, - "25": 885184000.0, - "26": 885184000.0, - "27": 885184000.0, - "28": 885184000.0, - "29": 885184000.0, - "30": 885184000.0, - "31": 885184000.0, - "32": 885184000.0, - "33": 885184000.0, - "34": 885184000.0, - "35": 885184000.0, - "36": 885184000.0, - "37": 885184000.0, - "38": 885184000.0, - "39": 885184000.0, - "40": 885184000.0, - "41": 885184000.0, - "42": 885184000.0, - "43": 885184000.0, - "44": 885184000.0, - "45": 885184000.0, - "46": 885184000.0, - "47": 885184000.0, - "48": 885184000.0, - "49": 885184000.0, - "50": 885706752.0 + "1": 705635840.0, + "2": 883348992.0, + "3": 883348992.0, + "4": 883348992.0, + "5": 883348992.0, + "6": 883348992.0, + "7": 883348992.0, + "8": 883348992.0, + "9": 883348992.0, + "10": 883348992.0, + "11": 883348992.0, + "12": 883348992.0, + "13": 883348992.0, + "14": 883348992.0, + "15": 883348992.0, + "16": 883348992.0, + "17": 883348992.0, + "18": 883348992.0, + "19": 883348992.0, + "20": 883348992.0, + "21": 883348992.0, + "22": 883348992.0, + "23": 883348992.0, + "24": 883348992.0, + "25": 883348992.0, + "26": 883348992.0, + "27": 883348992.0, + "28": 883348992.0, + "29": 883348992.0, + "30": 883348992.0, + "31": 883348992.0, + "32": 883348992.0, + "33": 883348992.0, + "34": 883348992.0, + "35": 883348992.0, + "36": 883348992.0, + "37": 883348992.0, + "38": 883348992.0, + "39": 883348992.0, + "40": 883348992.0, + "41": 883348992.0, + "42": 883348992.0, + "43": 883348992.0, + "44": 883348992.0, + "45": 883348992.0, + "46": 883348992.0, + "47": 883348992.0, + "48": 883348992.0, + "49": 883348992.0, + "50": 883348992.0 } }, "iteration-time": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 14.83222, - "2": 0.46295, - "3": 0.52097, - "4": 0.42074, - "5": 0.72217, - "6": 0.70851, - "7": 0.41812, - "8": 0.41893, - "9": 0.47564, - "10": 0.48012, - "11": 0.41406, - "12": 0.43392, - "13": 0.67246, - "14": 0.41498, - "15": 0.47203, - "16": 0.46, - "17": 0.40996, - "18": 0.4104, - 
"19": 0.66865, - "20": 0.40782, - "21": 0.40774, - "22": 0.49273, - "23": 0.49254, - "24": 0.47511, - "25": 0.64062, - "26": 0.43231, - "27": 0.50003, - "28": 0.46605, - "29": 0.64224, - "30": 0.42576, - "31": 0.40898, - "32": 0.49354, - "33": 0.47014, - "34": 0.4075, - "35": 0.40863, - "36": 0.40508, - "37": 0.42937, - "38": 0.41009, - "39": 0.4116, - "40": 0.40987, - "41": 0.41014, - "42": 0.45949, - "43": 0.40849, - "44": 0.48462, - "45": 0.4567, - "46": 0.40779, - "47": 0.466, - "48": 0.41678, - "49": 0.40871, - "50": 0.41039 + "1": "nan", + "2": 4.21004, + "3": 0.48981, + "4": 0.47344, + "5": 0.47824, + "6": 0.47946, + "7": 0.48311, + "8": 0.4801, + "9": 0.48448, + "10": 0.48375, + "11": 0.48291, + "12": 0.48722, + "13": 0.48237, + "14": 0.48101, + "15": 0.48357, + "16": 0.48502, + "17": 0.48354, + "18": 0.48307, + "19": 0.48204, + "20": 0.48295, + "21": 0.48064, + "22": 0.48504, + "23": 0.48487, + "24": 0.48367, + "25": 0.48061, + "26": 0.48279, + "27": 0.48417, + "28": 0.48173, + "29": 0.48221, + "30": 0.48351, + "31": 0.48309, + "32": 0.48067, + "33": 0.48269, + "34": 0.48404, + "35": 0.48325, + "36": 0.48418, + "37": 0.48111, + "38": 0.4835, + "39": 0.48361, + "40": 0.48287, + "41": 0.4882, + "42": 0.48161, + "43": 0.48229, + "44": 0.48219, + "45": 0.48623, + "46": 0.48196, + "47": 0.48211, + "48": 0.48322, + "49": 0.4833, + "50": 0.48355 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/golden_values_dev_dgx_gb200.json index cef90be5674..35348e75b0f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/golden_values_dev_dgx_gb200.json @@ -6,54 +6,54 @@ 
"values": { "1": 10.86244, "2": 10.88582, - "3": 10.84731, - "4": 10.85576, - "5": 10.86004, - "6": 10.87726, - "7": 10.86557, - "8": 10.84915, + "3": 10.84734, + "4": 10.85574, + "5": 10.86001, + "6": 10.8773, + "7": 10.86559, + "8": 10.84909, "9": 10.86608, - "10": 10.82477, - "11": 10.85617, - "12": 10.85377, + "10": 10.82475, + "11": 10.85619, + "12": 10.85375, "13": 10.86788, - "14": 10.87113, - "15": 10.82238, - "16": 10.79992, - "17": 10.77432, - "18": 10.78346, - "19": 10.79308, - "20": 10.68227, - "21": 10.64715, - "22": 10.50914, - "23": 10.66831, - "24": 10.54198, - "25": 10.49277, - "26": 10.55935, - "27": 10.54235, + "14": 10.87116, + "15": 10.82232, + "16": 10.79995, + "17": 10.77433, + "18": 10.78345, + "19": 10.79307, + "20": 10.68225, + "21": 10.64714, + "22": 10.50916, + "23": 10.66829, + "24": 10.54192, + "25": 10.49279, + "26": 10.55934, + "27": 10.54241, "28": 10.51128, - "29": 10.53255, + "29": 10.53257, "30": 10.28988, - "31": 10.02851, - "32": 10.38874, - "33": 10.39594, - "34": 10.13449, - "35": 10.18929, - "36": 10.13352, - "37": 10.2738, + "31": 10.02847, + "32": 10.38882, + "33": 10.39596, + "34": 10.13452, + "35": 10.18931, + "36": 10.13354, + "37": 10.27379, "38": 10.10752, - "39": 10.3401, - "40": 9.98541, - "41": 10.06413, - "42": 10.13748, - "43": 9.73382, + "39": 10.34011, + "40": 9.98539, + "41": 10.06415, + "42": 10.13747, + "43": 9.73381, "44": 9.86306, "45": 9.73727, - "46": 9.7134, - "47": 10.07755, - "48": 9.76767, - "49": 9.4199, - "50": 9.81686 + "46": 9.71341, + "47": 10.07754, + "48": 9.76766, + "49": 9.41987, + "50": 9.81689 } }, "num-zeros": { @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 572.0, - "2": 631.0, - "3": 629.0, - "4": 629.0, - "5": 576.0, - "6": 654.0, - "7": 633.0, - "8": 620.0, - "9": 630.0, - "10": 541.0, - "11": 632.0, - "12": 603.0, - "13": 675.0, - "14": 617.0, - "15": 651.0, - "16": 622.0, - "17": 619.0, - "18": 628.0, - "19": 641.0, - "20": 610.0, - "21": 677.0, - "22": 
572.0, - "23": 703.0, - "24": 624.0, - "25": 555.0, - "26": 601.0, - "27": 666.0, - "28": 749.0, - "29": 699.0, - "30": 756.0, - "31": 582.0, - "32": 733.0, + "1": 588.0, + "2": 591.0, + "3": 656.0, + "4": 602.0, + "5": 609.0, + "6": 600.0, + "7": 596.0, + "8": 640.0, + "9": 653.0, + "10": 535.0, + "11": 657.0, + "12": 620.0, + "13": 700.0, + "14": 630.0, + "15": 628.0, + "16": 590.0, + "17": 604.0, + "18": 566.0, + "19": 580.0, + "20": 561.0, + "21": 643.0, + "22": 622.0, + "23": 679.0, + "24": 611.0, + "25": 575.0, + "26": 628.0, + "27": 640.0, + "28": 727.0, + "29": 736.0, + "30": 729.0, + "31": 575.0, + "32": 726.0, "33": 773.0, - "34": 655.0, - "35": 710.0, - "36": 762.0, - "37": 863.0, - "38": 786.0, - "39": 846.0, - "40": 789.0, - "41": 795.0, - "42": 902.0, - "43": 758.0, - "44": 804.0, - "45": 751.0, - "46": 895.0, - "47": 815.0, - "48": 842.0, - "49": 851.0, - "50": 835.0 + "34": 634.0, + "35": 720.0, + "36": 690.0, + "37": 818.0, + "38": 730.0, + "39": 754.0, + "40": 809.0, + "41": 787.0, + "42": 849.0, + "43": 757.0, + "44": 861.0, + "45": 825.0, + "46": 881.0, + "47": 915.0, + "48": 846.0, + "49": 853.0, + "50": 816.0 } }, "mem-allocated-bytes": { @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 459571712.0, - "2": 459571712.0, - "3": 459571712.0, - "4": 459571712.0, - "5": 459571712.0, - "6": 459571712.0, - "7": 459571712.0, - "8": 459571712.0, - "9": 459571712.0, - "10": 459571712.0, - "11": 459571712.0, - "12": 459571712.0, - "13": 459571712.0, - "14": 459571712.0, - "15": 459571712.0, - "16": 459571712.0, - "17": 459571712.0, - "18": 459571712.0, - "19": 459571712.0, - "20": 459571712.0, - "21": 459571712.0, - "22": 459571712.0, - "23": 459571712.0, - "24": 459571712.0, - "25": 459571712.0, - "26": 459571712.0, - "27": 459571712.0, - "28": 459571712.0, - "29": 459571712.0, - "30": 459571712.0, - "31": 459571712.0, - "32": 459571712.0, - "33": 459571712.0, - "34": 459571712.0, - "35": 459571712.0, - "36": 459571712.0, - 
"37": 459571712.0, - "38": 459571712.0, - "39": 459571712.0, - "40": 459571712.0, - "41": 459571712.0, - "42": 459571712.0, - "43": 459571712.0, - "44": 459571712.0, - "45": 459571712.0, - "46": 459571712.0, - "47": 459571712.0, - "48": 459571712.0, - "49": 459571712.0, - "50": 459571712.0 + "1": 463110656.0, + "2": 463110656.0, + "3": 463110656.0, + "4": 463110656.0, + "5": 463110656.0, + "6": 463110656.0, + "7": 463110656.0, + "8": 463110656.0, + "9": 463110656.0, + "10": 463110656.0, + "11": 463110656.0, + "12": 463110656.0, + "13": 463110656.0, + "14": 463110656.0, + "15": 463110656.0, + "16": 463110656.0, + "17": 463110656.0, + "18": 463110656.0, + "19": 463110656.0, + "20": 463110656.0, + "21": 463110656.0, + "22": 463110656.0, + "23": 463110656.0, + "24": 463110656.0, + "25": 463110656.0, + "26": 463110656.0, + "27": 463110656.0, + "28": 463110656.0, + "29": 463110656.0, + "30": 463110656.0, + "31": 463110656.0, + "32": 463110656.0, + "33": 463110656.0, + "34": 463110656.0, + "35": 463110656.0, + "36": 463110656.0, + "37": 463110656.0, + "38": 463110656.0, + "39": 463110656.0, + "40": 463110656.0, + "41": 463110656.0, + "42": 463110656.0, + "43": 463110656.0, + "44": 463110656.0, + "45": 463110656.0, + "46": 463110656.0, + "47": 463110656.0, + "48": 463110656.0, + "49": 463110656.0, + "50": 463110656.0 } }, "mem-max-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 708781568.0, - "2": 885183488.0, - "3": 885184000.0, - "4": 885184000.0, - "5": 885184000.0, - "6": 885184000.0, - "7": 885184000.0, - "8": 885184000.0, - "9": 885184000.0, - "10": 885184000.0, - "11": 885184000.0, - "12": 885184000.0, - "13": 885184000.0, - "14": 885184000.0, - "15": 885184000.0, - "16": 885184000.0, - "17": 885184000.0, - "18": 885184000.0, - "19": 885184000.0, - "20": 885184000.0, - "21": 885184000.0, - "22": 885184000.0, - "23": 885184000.0, - "24": 885184000.0, - "25": 885184000.0, - "26": 886232576.0, - "27": 886232576.0, - 
"28": 886232576.0, - "29": 886232576.0, - "30": 886232576.0, - "31": 886232576.0, - "32": 886232576.0, - "33": 886232576.0, - "34": 886232576.0, - "35": 886232576.0, - "36": 886232576.0, - "37": 886232576.0, - "38": 886232576.0, - "39": 886232576.0, - "40": 886232576.0, - "41": 886232576.0, - "42": 886232576.0, - "43": 886232576.0, - "44": 886232576.0, - "45": 886232576.0, - "46": 886232576.0, - "47": 886232576.0, - "48": 886232576.0, - "49": 886232576.0, - "50": 886232576.0 + "1": 704587264.0, + "2": 887671808.0, + "3": 887671808.0, + "4": 887671808.0, + "5": 887672320.0, + "6": 887672320.0, + "7": 887672320.0, + "8": 887674368.0, + "9": 887674368.0, + "10": 887674368.0, + "11": 887674368.0, + "12": 887674368.0, + "13": 887674368.0, + "14": 887674368.0, + "15": 887674368.0, + "16": 887674368.0, + "17": 887674368.0, + "18": 887674368.0, + "19": 887674368.0, + "20": 887674368.0, + "21": 887674368.0, + "22": 887674368.0, + "23": 887674368.0, + "24": 887674368.0, + "25": 887674368.0, + "26": 887674368.0, + "27": 887674368.0, + "28": 887674368.0, + "29": 887674368.0, + "30": 887674368.0, + "31": 887674368.0, + "32": 887674368.0, + "33": 887674368.0, + "34": 887674368.0, + "35": 887674368.0, + "36": 887674368.0, + "37": 887674368.0, + "38": 887674368.0, + "39": 887674368.0, + "40": 887674368.0, + "41": 887674368.0, + "42": 887674368.0, + "43": 887674368.0, + "44": 887674368.0, + "45": 887674368.0, + "46": 887674368.0, + "47": 887674368.0, + "48": 887674368.0, + "49": 887674368.0, + "50": 887674368.0 } }, "iteration-time": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 14.90548, - "2": 0.59116, - "3": 0.51351, - "4": 0.5889, - "5": 0.44588, - "6": 0.48318, - "7": 0.40946, - "8": 0.41291, - "9": 0.4711, - "10": 0.46604, - "11": 0.41089, - "12": 0.48863, - "13": 0.50268, - "14": 0.46761, - "15": 0.4075, - "16": 0.43179, - "17": 0.40649, - "18": 0.46497, - "19": 0.40807, - "20": 0.40657, - "21": 0.4151, - "22": 0.47269, - "23": 0.61429, - 
"24": 0.46129, - "25": 0.40977, - "26": 0.40692, - "27": 0.40603, - "28": 0.77632, - "29": 0.40782, - "30": 0.40901, - "31": 0.40545, - "32": 0.47343, - "33": 0.40648, - "34": 0.40452, - "35": 0.40862, - "36": 0.40878, - "37": 0.40927, - "38": 0.4062, - "39": 0.40929, - "40": 0.40755, - "41": 0.4034, - "42": 0.40739, - "43": 0.5793, - "44": 0.42611, - "45": 0.46136, - "46": 0.40554, - "47": 0.45264, - "48": 0.45209, - "49": 0.40299, - "50": 0.40119 + "1": "nan", + "2": 4.32392, + "3": 0.4432, + "4": 0.41997, + "5": 0.42011, + "6": 0.41602, + "7": 0.41695, + "8": 0.42042, + "9": 0.41532, + "10": 0.41013, + "11": 0.40983, + "12": 0.41104, + "13": 0.41182, + "14": 0.41252, + "15": 0.42002, + "16": 0.47232, + "17": 0.49274, + "18": 0.49507, + "19": 0.49112, + "20": 0.48715, + "21": 0.48361, + "22": 0.48476, + "23": 0.4789, + "24": 0.47778, + "25": 0.4792, + "26": 0.48432, + "27": 0.48617, + "28": 0.48159, + "29": 0.48042, + "30": 0.47772, + "31": 0.47956, + "32": 0.47326, + "33": 0.4727, + "34": 0.47303, + "35": 0.46857, + "36": 0.46923, + "37": 0.46968, + "38": 0.4682, + "39": 0.45815, + "40": 0.45997, + "41": 0.45486, + "42": 0.45349, + "43": 0.44331, + "44": 0.44252, + "45": 0.44141, + "46": 0.44016, + "47": 0.43955, + "48": 0.43852, + "49": 0.43914, + "50": 0.43791 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/golden_values_dev_dgx_gb200.json index 4b1a17aa98b..08b446921f5 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/golden_values_dev_dgx_gb200.json @@ -6,54 +6,54 @@ "values": { "1": 10.86244, "2": 10.88582, - "3": 10.84732, + "3": 10.84734, "4": 10.85571, - "5": 10.86004, + "5": 10.86005, "6": 10.87729, 
- "7": 10.8656, - "8": 10.84913, + "7": 10.86561, + "8": 10.84911, "9": 10.86607, - "10": 10.82475, - "11": 10.85616, - "12": 10.85374, - "13": 10.86787, - "14": 10.87114, - "15": 10.82231, + "10": 10.82477, + "11": 10.85618, + "12": 10.85372, + "13": 10.8679, + "14": 10.87115, + "15": 10.82234, "16": 10.79992, - "17": 10.77434, - "18": 10.7835, - "19": 10.79308, - "20": 10.68228, - "21": 10.64713, - "22": 10.50916, - "23": 10.66826, - "24": 10.54197, - "25": 10.49279, - "26": 10.55934, - "27": 10.54238, - "28": 10.51131, - "29": 10.53257, - "30": 10.28989, - "31": 10.0285, - "32": 10.38879, - "33": 10.39594, - "34": 10.13454, - "35": 10.18927, - "36": 10.13356, - "37": 10.27378, - "38": 10.10748, - "39": 10.34013, - "40": 9.98543, - "41": 10.06417, - "42": 10.1375, + "17": 10.7743, + "18": 10.78345, + "19": 10.79307, + "20": 10.68226, + "21": 10.6471, + "22": 10.50919, + "23": 10.66829, + "24": 10.54194, + "25": 10.49284, + "26": 10.55935, + "27": 10.54236, + "28": 10.51129, + "29": 10.53259, + "30": 10.28991, + "31": 10.02854, + "32": 10.38881, + "33": 10.39595, + "34": 10.13448, + "35": 10.18931, + "36": 10.13349, + "37": 10.2738, + "38": 10.10751, + "39": 10.34007, + "40": 9.98538, + "41": 10.06415, + "42": 10.13746, "43": 9.73384, - "44": 9.86307, - "45": 9.7372, - "46": 9.71343, - "47": 10.07757, - "48": 9.76764, - "49": 9.41992, - "50": 9.81691 + "44": 9.86303, + "45": 9.73723, + "46": 9.71344, + "47": 10.07753, + "48": 9.76766, + "49": 9.4199, + "50": 9.8169 } }, "num-zeros": { @@ -61,55 +61,55 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 550.0, - "2": 584.0, - "3": 581.0, - "4": 611.0, - "5": 630.0, - "6": 629.0, - "7": 619.0, - "8": 582.0, - "9": 634.0, - "10": 525.0, - "11": 701.0, - "12": 622.0, - "13": 670.0, - "14": 615.0, - "15": 638.0, - "16": 596.0, - "17": 645.0, - "18": 555.0, - "19": 607.0, - "20": 560.0, - "21": 667.0, - "22": 599.0, - "23": 676.0, - "24": 660.0, - "25": 619.0, - "26": 595.0, - "27": 638.0, - "28": 707.0, - "29": 
680.0, - "30": 693.0, - "31": 607.0, - "32": 698.0, - "33": 774.0, - "34": 696.0, - "35": 699.0, - "36": 674.0, - "37": 897.0, - "38": 818.0, - "39": 882.0, - "40": 873.0, - "41": 746.0, - "42": 836.0, - "43": 808.0, - "44": 829.0, - "45": 757.0, - "46": 877.0, - "47": 932.0, - "48": 892.0, - "49": 861.0, + "1": 561.0, + "2": 574.0, + "3": 615.0, + "4": 612.0, + "5": 664.0, + "6": 648.0, + "7": 593.0, + "8": 587.0, + "9": 622.0, + "10": 528.0, + "11": 652.0, + "12": 592.0, + "13": 640.0, + "14": 634.0, + "15": 646.0, + "16": 666.0, + "17": 596.0, + "18": 617.0, + "19": 635.0, + "20": 582.0, + "21": 698.0, + "22": 608.0, + "23": 643.0, + "24": 645.0, + "25": 584.0, + "26": 619.0, + "27": 669.0, + "28": 702.0, + "29": 714.0, + "30": 683.0, + "31": 604.0, + "32": 722.0, + "33": 758.0, + "34": 674.0, + "35": 705.0, + "36": 782.0, + "37": 828.0, + "38": 796.0, + "39": 884.0, + "40": 832.0, + "41": 821.0, + "42": 813.0, + "43": 749.0, + "44": 856.0, + "45": 792.0, + "46": 774.0, + "47": 914.0, + "48": 832.0, + "49": 821.0, "50": 871.0 } }, @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 459571712.0, - "2": 459571712.0, - "3": 459571712.0, - "4": 459571712.0, - "5": 459571712.0, - "6": 459571712.0, - "7": 459571712.0, - "8": 459571712.0, - "9": 459571712.0, - "10": 459571712.0, - "11": 459571712.0, - "12": 459571712.0, - "13": 459571712.0, - "14": 459571712.0, - "15": 459571712.0, - "16": 459571712.0, - "17": 459571712.0, - "18": 459571712.0, - "19": 459571712.0, - "20": 459571712.0, - "21": 459571712.0, - "22": 459571712.0, - "23": 459571712.0, - "24": 459571712.0, - "25": 459571712.0, - "26": 459571712.0, - "27": 459571712.0, - "28": 459571712.0, - "29": 459571712.0, - "30": 459571712.0, - "31": 459571712.0, - "32": 459571712.0, - "33": 459571712.0, - "34": 459571712.0, - "35": 459571712.0, - "36": 459571712.0, - "37": 459571712.0, - "38": 459571712.0, - "39": 459571712.0, - "40": 459571712.0, - "41": 459571712.0, - "42": 459571712.0, - 
"43": 459571712.0, - "44": 459571712.0, - "45": 459571712.0, - "46": 459571712.0, - "47": 459571712.0, - "48": 459571712.0, - "49": 459571712.0, - "50": 459571712.0 + "1": 460882432.0, + "2": 460882432.0, + "3": 460882432.0, + "4": 460882432.0, + "5": 460882432.0, + "6": 460882432.0, + "7": 460882432.0, + "8": 460882432.0, + "9": 460882432.0, + "10": 460882432.0, + "11": 460882432.0, + "12": 460882432.0, + "13": 460882432.0, + "14": 460882432.0, + "15": 460882432.0, + "16": 460882432.0, + "17": 460882432.0, + "18": 460882432.0, + "19": 460882432.0, + "20": 460882432.0, + "21": 460882432.0, + "22": 460882432.0, + "23": 460882432.0, + "24": 460882432.0, + "25": 460882432.0, + "26": 460882432.0, + "27": 460882432.0, + "28": 460882432.0, + "29": 460882432.0, + "30": 460882432.0, + "31": 460882432.0, + "32": 460882432.0, + "33": 460882432.0, + "34": 460882432.0, + "35": 460882432.0, + "36": 460882432.0, + "37": 460882432.0, + "38": 460882432.0, + "39": 460882432.0, + "40": 460882432.0, + "41": 460882432.0, + "42": 460882432.0, + "43": 460882432.0, + "44": 460882432.0, + "45": 460882432.0, + "46": 460882432.0, + "47": 460882432.0, + "48": 460882432.0, + "49": 460882432.0, + "50": 460882432.0 } }, "mem-max-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 708781568.0, - "2": 885184000.0, - "3": 885184000.0, - "4": 885184000.0, - "5": 885184000.0, - "6": 885184000.0, - "7": 885184000.0, - "8": 885184000.0, - "9": 885184000.0, - "10": 885184000.0, - "11": 885184000.0, - "12": 885184000.0, - "13": 885184000.0, - "14": 885184000.0, - "15": 885184000.0, - "16": 885184000.0, - "17": 885184000.0, - "18": 885184000.0, - "19": 885184000.0, - "20": 885184000.0, - "21": 885184000.0, - "22": 885184000.0, - "23": 886232064.0, - "24": 886232064.0, - "25": 886232064.0, - "26": 886232064.0, - "27": 886232064.0, - "28": 886232064.0, - "29": 886232064.0, - "30": 886232064.0, - "31": 886232064.0, - "32": 886232064.0, - "33": 886232064.0, - 
"34": 886232064.0, - "35": 886232064.0, - "36": 886232064.0, - "37": 886232064.0, - "38": 886232064.0, - "39": 886232064.0, - "40": 886232064.0, - "41": 886232064.0, - "42": 886232064.0, - "43": 886232064.0, - "44": 886232064.0, - "45": 886232064.0, - "46": 886232064.0, - "47": 886232064.0, - "48": 886232064.0, - "49": 886232064.0, - "50": 886232064.0 + "1": 705635328.0, + "2": 883348992.0, + "3": 884397568.0, + "4": 885445632.0, + "5": 885445632.0, + "6": 885445632.0, + "7": 885445632.0, + "8": 885445632.0, + "9": 885445632.0, + "10": 885445632.0, + "11": 885445632.0, + "12": 885445632.0, + "13": 885445632.0, + "14": 885445632.0, + "15": 885445632.0, + "16": 885445632.0, + "17": 885445632.0, + "18": 885445632.0, + "19": 885445632.0, + "20": 885445632.0, + "21": 885445632.0, + "22": 885445632.0, + "23": 885446144.0, + "24": 885446144.0, + "25": 885446144.0, + "26": 885446144.0, + "27": 885446144.0, + "28": 885446144.0, + "29": 885446144.0, + "30": 885446144.0, + "31": 885446144.0, + "32": 885446144.0, + "33": 885446144.0, + "34": 885446144.0, + "35": 885446144.0, + "36": 886492672.0, + "37": 886492672.0, + "38": 886492672.0, + "39": 886492672.0, + "40": 886492672.0, + "41": 886492672.0, + "42": 886492672.0, + "43": 886492672.0, + "44": 886492672.0, + "45": 886492672.0, + "46": 886492672.0, + "47": 886492672.0, + "48": 886492672.0, + "49": 886493696.0, + "50": 886493696.0 } }, "iteration-time": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 13.80388, - "2": 0.45981, - "3": 0.47688, - "4": 0.46506, - "5": 0.40776, - "6": 0.40391, - "7": 0.40648, - "8": 0.40522, - "9": 0.54467, - "10": 0.40469, - "11": 0.76012, - "12": 0.40772, - "13": 0.40474, - "14": 0.40399, - "15": 0.40126, - "16": 0.40258, - "17": 0.40163, - "18": 0.40308, - "19": 0.40205, - "20": 0.45775, - "21": 0.45253, - "22": 0.40222, - "23": 0.47993, - "24": 0.74746, - "25": 0.54096, - "26": 0.595, - "27": 0.42244, - "28": 0.45559, - "29": 0.40939, - "30": 0.40941, - "31": 
0.40631, - "32": 0.40777, - "33": 0.40662, - "34": 0.45082, - "35": 0.40861, - "36": 0.40683, - "37": 0.40916, - "38": 0.40762, - "39": 0.40423, - "40": 0.41411, - "41": 0.40792, - "42": 0.40703, - "43": 0.40488, - "44": 0.40689, - "45": 0.40884, - "46": 0.40591, - "47": 0.40461, - "48": 0.50976, - "49": 0.4042, - "50": 0.40707 + "1": "nan", + "2": 3.84499, + "3": 0.50052, + "4": 0.45861, + "5": 0.44336, + "6": 0.44062, + "7": 0.43954, + "8": 0.44061, + "9": 0.44129, + "10": 0.44028, + "11": 0.44106, + "12": 0.4893, + "13": 0.49424, + "14": 0.49729, + "15": 0.4969, + "16": 0.49673, + "17": 0.49876, + "18": 0.4992, + "19": 0.49565, + "20": 0.48635, + "21": 0.46659, + "22": 0.45563, + "23": 0.44842, + "24": 0.4425, + "25": 0.44486, + "26": 0.43654, + "27": 0.43626, + "28": 0.43493, + "29": 0.43571, + "30": 0.43296, + "31": 0.4336, + "32": 0.43346, + "33": 0.45798, + "34": 0.47046, + "35": 0.47986, + "36": 0.48443, + "37": 0.48862, + "38": 0.48621, + "39": 0.48674, + "40": 0.48663, + "41": 0.48915, + "42": 0.48901, + "43": 0.4567, + "44": 0.46536, + "45": 0.47673, + "46": 0.48141, + "47": 0.48283, + "48": 0.4896, + "49": 0.48736, + "50": 0.50085 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/golden_values_dev_dgx_gb200.json index 529bad10ded..7fa492bd7eb 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/golden_values_dev_dgx_gb200.json @@ -6,54 +6,54 @@ "values": { "1": 10.86836, "2": 10.88595, - "3": 10.8656, + "3": 10.86559, "4": 10.86891, "5": 10.87418, - "6": 10.89058, - "7": 10.87677, - "8": 10.86475, - "9": 10.88236, - "10": 10.84579, - "11": 10.87162, - "12": 10.87422, - "13": 10.88161, 
- "14": 10.88886, - "15": 10.83932, + "6": 10.89057, + "7": 10.87676, + "8": 10.86476, + "9": 10.88235, + "10": 10.84582, + "11": 10.87163, + "12": 10.87421, + "13": 10.8816, + "14": 10.88885, + "15": 10.83934, "16": 10.82496, - "17": 10.80144, - "18": 10.81234, - "19": 10.82153, - "20": 10.71934, - "21": 10.69091, - "22": 10.57427, - "23": 10.71091, + "17": 10.80146, + "18": 10.81236, + "19": 10.82152, + "20": 10.71935, + "21": 10.69086, + "22": 10.57422, + "23": 10.71096, "24": 10.59783, - "25": 10.55561, - "26": 10.61523, - "27": 10.60449, - "28": 10.56482, - "29": 10.58475, - "30": 10.3595, - "31": 10.12152, - "32": 10.45239, - "33": 10.45725, - "34": 10.21986, - "35": 10.26447, - "36": 10.21035, + "25": 10.55559, + "26": 10.61516, + "27": 10.60451, + "28": 10.56481, + "29": 10.58476, + "30": 10.35947, + "31": 10.12155, + "32": 10.45234, + "33": 10.45724, + "34": 10.21987, + "35": 10.26441, + "36": 10.21037, "37": 10.33955, - "38": 10.18013, - "39": 10.39593, - "40": 10.06628, - "41": 10.14163, - "42": 10.2085, - "43": 9.83126, - "44": 9.9486, - "45": 9.82846, - "46": 9.80461, - "47": 10.14231, - "48": 9.84461, - "49": 9.52191, - "50": 9.88605 + "38": 10.18012, + "39": 10.39592, + "40": 10.06635, + "41": 10.14165, + "42": 10.20849, + "43": 9.83127, + "44": 9.94857, + "45": 9.82845, + "46": 9.80455, + "47": 10.14227, + "48": 9.84463, + "49": 9.52192, + "50": 9.88604 } }, "num-zeros": { @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 1554.0, - "2": 1619.0, - "3": 1663.0, + "1": 1673.0, + "2": 1596.0, + "3": 1676.0, "4": 1672.0, - "5": 1771.0, - "6": 1739.0, - "7": 1866.0, - "8": 1590.0, - "9": 1819.0, - "10": 1394.0, - "11": 1786.0, - "12": 1643.0, - "13": 1829.0, - "14": 1672.0, - "15": 1827.0, - "16": 1771.0, - "17": 1797.0, - "18": 1632.0, - "19": 1667.0, - "20": 1670.0, - "21": 1843.0, - "22": 1620.0, - "23": 1889.0, - "24": 1513.0, - "25": 1473.0, - "26": 1619.0, - "27": 1768.0, - "28": 1976.0, - "29": 1898.0, - "30": 1858.0, - 
"31": 1565.0, - "32": 1890.0, - "33": 2166.0, - "34": 1679.0, - "35": 1825.0, - "36": 1909.0, - "37": 2341.0, - "38": 2029.0, - "39": 2294.0, - "40": 2015.0, - "41": 2181.0, - "42": 2211.0, - "43": 1907.0, - "44": 2140.0, - "45": 1936.0, - "46": 2341.0, - "47": 2472.0, - "48": 2272.0, - "49": 2234.0, - "50": 2457.0 + "5": 1818.0, + "6": 1740.0, + "7": 1845.0, + "8": 1651.0, + "9": 1820.0, + "10": 1351.0, + "11": 1811.0, + "12": 1655.0, + "13": 1748.0, + "14": 1719.0, + "15": 1801.0, + "16": 1829.0, + "17": 1828.0, + "18": 1545.0, + "19": 1727.0, + "20": 1654.0, + "21": 1874.0, + "22": 1567.0, + "23": 1955.0, + "24": 1609.0, + "25": 1474.0, + "26": 1750.0, + "27": 1682.0, + "28": 1927.0, + "29": 1949.0, + "30": 1837.0, + "31": 1606.0, + "32": 1849.0, + "33": 2085.0, + "34": 1799.0, + "35": 1933.0, + "36": 1928.0, + "37": 2325.0, + "38": 2099.0, + "39": 2424.0, + "40": 2112.0, + "41": 2240.0, + "42": 2181.0, + "43": 1934.0, + "44": 2042.0, + "45": 2041.0, + "46": 2183.0, + "47": 2424.0, + "48": 2250.0, + "49": 2208.0, + "50": 2425.0 } }, "mem-allocated-bytes": { @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 465601024.0, - "2": 465601024.0, - "3": 465601024.0, - "4": 465601024.0, - "5": 465601024.0, - "6": 465601024.0, - "7": 465601024.0, - "8": 465601024.0, - "9": 465601024.0, - "10": 465601024.0, - "11": 465601024.0, - "12": 465601024.0, - "13": 465601024.0, - "14": 465601024.0, - "15": 465601024.0, - "16": 465601024.0, - "17": 465601024.0, - "18": 465601024.0, - "19": 465601024.0, - "20": 465601024.0, - "21": 465601024.0, - "22": 465601024.0, - "23": 465601024.0, - "24": 465601024.0, - "25": 465601024.0, - "26": 465601024.0, - "27": 465601024.0, - "28": 465601024.0, - "29": 465601024.0, - "30": 465601024.0, - "31": 465601024.0, - "32": 465601024.0, - "33": 465601024.0, - "34": 465601024.0, - "35": 465601024.0, - "36": 465601024.0, - "37": 465601024.0, - "38": 465601024.0, - "39": 465601024.0, - "40": 465601024.0, - "41": 465601024.0, 
- "42": 465601024.0, - "43": 465601024.0, - "44": 465601024.0, - "45": 465601024.0, - "46": 465601024.0, - "47": 465601024.0, - "48": 465601024.0, - "49": 465601024.0, - "50": 465601024.0 + "1": 465207808.0, + "2": 466256384.0, + "3": 466256384.0, + "4": 466256384.0, + "5": 466256384.0, + "6": 466256384.0, + "7": 466256384.0, + "8": 466256384.0, + "9": 466256384.0, + "10": 466256384.0, + "11": 466256384.0, + "12": 466256384.0, + "13": 466256384.0, + "14": 466256384.0, + "15": 466256384.0, + "16": 466256384.0, + "17": 466256384.0, + "18": 466256384.0, + "19": 466256384.0, + "20": 466256384.0, + "21": 466256384.0, + "22": 466256384.0, + "23": 466256384.0, + "24": 466256384.0, + "25": 466256384.0, + "26": 466256384.0, + "27": 466256384.0, + "28": 466256384.0, + "29": 466256384.0, + "30": 466256384.0, + "31": 466256384.0, + "32": 466256384.0, + "33": 466256384.0, + "34": 466256384.0, + "35": 466256384.0, + "36": 466256384.0, + "37": 466256384.0, + "38": 466256384.0, + "39": 466256384.0, + "40": 466256384.0, + "41": 466256384.0, + "42": 466256384.0, + "43": 466256384.0, + "44": 466256384.0, + "45": 466256384.0, + "46": 466256384.0, + "47": 466256384.0, + "48": 466256384.0, + "49": 466256384.0, + "50": 466256384.0 } }, "mem-max-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 1728999424.0, - "2": 1789405696.0, - "3": 1789405696.0, - "4": 1789405696.0, - "5": 1789405696.0, - "6": 1789405696.0, - "7": 1789405696.0, - "8": 1789405696.0, - "9": 1789405696.0, - "10": 1789405696.0, - "11": 1789405696.0, - "12": 1789405696.0, - "13": 1789405696.0, - "14": 1789405696.0, - "15": 1789405696.0, - "16": 1789405696.0, - "17": 1789405696.0, - "18": 1789405696.0, - "19": 1789405696.0, - "20": 1789405696.0, - "21": 1789405696.0, - "22": 1789405696.0, - "23": 1789405696.0, - "24": 1789405696.0, - "25": 1789405696.0, - "26": 1789405696.0, - "27": 1789405696.0, - "28": 1789405696.0, - "29": 1789405696.0, - "30": 1789405696.0, - "31": 
1789405696.0, - "32": 1789405696.0, - "33": 1789405696.0, - "34": 1789405696.0, - "35": 1789405696.0, - "36": 1789405696.0, - "37": 1789405696.0, - "38": 1789405696.0, - "39": 1789405696.0, - "40": 1789405696.0, - "41": 1789405696.0, - "42": 1789405696.0, - "43": 1789405696.0, - "44": 1789405696.0, - "45": 1789405696.0, - "46": 1789405696.0, - "47": 1789405696.0, - "48": 1789405696.0, - "49": 1789405696.0, - "50": 1789405696.0 + "1": 1608546816.0, + "2": 1789536768.0, + "3": 1789536768.0, + "4": 1789536768.0, + "5": 1789536768.0, + "6": 1789536768.0, + "7": 1789536768.0, + "8": 1789536768.0, + "9": 1789536768.0, + "10": 1789536768.0, + "11": 1789536768.0, + "12": 1789536768.0, + "13": 1789536768.0, + "14": 1789536768.0, + "15": 1789536768.0, + "16": 1789536768.0, + "17": 1789536768.0, + "18": 1789536768.0, + "19": 1789536768.0, + "20": 1789536768.0, + "21": 1789536768.0, + "22": 1789536768.0, + "23": 1789536768.0, + "24": 1789536768.0, + "25": 1789536768.0, + "26": 1789536768.0, + "27": 1789536768.0, + "28": 1789536768.0, + "29": 1789536768.0, + "30": 1789536768.0, + "31": 1789536768.0, + "32": 1789536768.0, + "33": 1789536768.0, + "34": 1789536768.0, + "35": 1789536768.0, + "36": 1789536768.0, + "37": 1789536768.0, + "38": 1789536768.0, + "39": 1789536768.0, + "40": 1789536768.0, + "41": 1789536768.0, + "42": 1789536768.0, + "43": 1789536768.0, + "44": 1789536768.0, + "45": 1789536768.0, + "46": 1789536768.0, + "47": 1789536768.0, + "48": 1789536768.0, + "49": 1789536768.0, + "50": 1789536768.0 } }, "iteration-time": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11.87728, - "2": 0.19403, - "3": 0.17442, - "4": 0.16292, - "5": 0.28152, - "6": 0.1602, - "7": 0.20711, - "8": 0.4188, - "9": 0.1573, - "10": 0.25278, - "11": 0.44625, - "12": 0.23028, - "13": 0.16929, - "14": 0.15589, - "15": 0.24336, - "16": 0.19322, - "17": 0.19037, - "18": 0.15335, - "19": 0.25153, - "20": 0.20655, - "21": 0.15398, - "22": 0.15177, - "23": 0.25777, - 
"24": 0.15477, - "25": 0.15108, - "26": 0.25255, - "27": 0.23256, - "28": 0.16156, - "29": 0.33982, - "30": 0.15402, - "31": 0.15482, - "32": 0.15494, - "33": 0.15494, - "34": 0.15448, - "35": 0.15383, - "36": 0.15383, - "37": 0.15343, - "38": 0.15387, - "39": 0.15805, - "40": 0.15334, - "41": 0.24971, - "42": 0.15713, - "43": 0.22532, - "44": 0.15365, - "45": 0.41087, - "46": 0.15392, - "47": 0.15221, - "48": 0.23644, - "49": 0.1534, - "50": 0.15283 + "1": "nan", + "2": 2.74426, + "3": 0.17387, + "4": 0.16297, + "5": 0.16176, + "6": 0.16395, + "7": 0.16386, + "8": 0.16289, + "9": 0.16327, + "10": 0.16443, + "11": 0.16326, + "12": 0.16343, + "13": 0.16293, + "14": 0.16535, + "15": 0.16415, + "16": 0.16437, + "17": 0.16472, + "18": 0.16441, + "19": 0.1632, + "20": 0.16239, + "21": 0.1634, + "22": 0.16387, + "23": 0.16433, + "24": 0.1624, + "25": 0.16133, + "26": 0.16119, + "27": 0.16136, + "28": 0.16141, + "29": 0.1614, + "30": 0.16073, + "31": 0.16146, + "32": 0.16038, + "33": 0.16019, + "34": 0.16109, + "35": 0.16035, + "36": 0.15933, + "37": 0.15978, + "38": 0.17485, + "39": 0.15932, + "40": 0.15877, + "41": 0.15919, + "42": 0.15903, + "43": 0.1594, + "44": 0.15734, + "45": 0.15857, + "46": 0.15791, + "47": 0.15837, + "48": 0.15781, + "49": 0.15813, + "50": 0.15862 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mla/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mla/golden_values_dev_dgx_gb200.json index dac3e5ef607..31729dd5fe5 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mla/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mla/golden_values_dev_dgx_gb200.json @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 6.33065, - "2": 0.20464, - "3": 0.17836, - "4": 0.16429, - "5": 0.1621, - "6": 0.16051, - "7": 0.15983, - "8": 0.16067, - "9": 0.15721, - "10": 
0.16774, - "11": 0.16215, - "12": 0.21737, - "13": 0.16028, - "14": 0.16036, - "15": 0.15885, - "16": 0.22707, - "17": 0.16509, - "18": 0.1691, - "19": 0.16736, - "20": 0.23508, - "21": 0.16682, - "22": 0.16204, - "23": 0.16527, - "24": 0.1694, - "25": 0.16972, - "26": 0.17668, - "27": 0.15612, - "28": 0.22357, - "29": 0.15777, - "30": 0.16518, - "31": 0.17111, - "32": 0.17188, - "33": 0.16413, - "34": 0.16509, - "35": 0.16886, - "36": 0.16871, - "37": 0.17188, - "38": 0.16901, - "39": 0.1672, - "40": 0.22409, - "41": 0.16827, - "42": 0.16744, - "43": 0.1668, - "44": 0.16817, - "45": 0.16681, - "46": 0.17004, - "47": 0.1702, - "48": 0.17085, - "49": 0.17174, - "50": 0.16979 + "1": "nan", + "2": 2.75152, + "3": 0.1678, + "4": 0.1543, + "5": 0.15772, + "6": 0.15798, + "7": 0.15886, + "8": 0.16038, + "9": 0.15983, + "10": 0.16009, + "11": 0.15881, + "12": 0.16004, + "13": 0.15648, + "14": 0.15396, + "15": 0.15394, + "16": 0.1544, + "17": 0.15329, + "18": 0.1539, + "19": 0.15442, + "20": 0.1521, + "21": 0.15368, + "22": 0.15287, + "23": 0.15397, + "24": 0.15553, + "25": 0.15617, + "26": 0.15925, + "27": 0.145, + "28": 0.14456, + "29": 0.14869, + "30": 0.15407, + "31": 0.15556, + "32": 0.15651, + "33": 0.15726, + "34": 0.1574, + "35": 0.15981, + "36": 0.16037, + "37": 0.16044, + "38": 0.15744, + "39": 0.15875, + "40": 0.15964, + "41": 0.15984, + "42": 0.1605, + "43": 0.15901, + "44": 0.16037, + "45": 0.1616, + "46": 0.16046, + "47": 0.16125, + "48": 0.16168, + "49": 0.1611, + "50": 0.15977 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mla/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mla/golden_values_dev_dgx_gb200_2nd.json new file mode 100644 index 00000000000..49a5e4f8a21 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mla/golden_values_dev_dgx_gb200_2nd.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + 
"end_step": 50, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": 10.9101, + "27": 10.90559, + "28": 10.87901, + "29": 10.87862, + "30": 10.82431, + "31": 10.7917, + "32": 10.85763, + "33": 10.85278, + "34": 10.80465, + "35": 10.81124, + "36": 10.79299, + "37": 10.82161, + "38": 10.74654, + "39": 10.79066, + "40": 10.67639, + "41": 10.71189, + "42": 10.72663, + "43": 10.58635, + "44": 10.63487, + "45": 10.59555, + "46": 10.58202, + "47": 10.67878, + "48": 10.55683, + "49": 10.43321, + "50": 10.57623 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": 22611998.0, + "27": 22532132.0, + "28": 22516960.0, + "29": 22593572.0, + "30": 22695024.0, + "31": 23019244.0, + "32": 22648204.0, + "33": 22623192.0, + "34": 22899922.0, + "35": 22852560.0, + "36": 22652964.0, + "37": 22559866.0, + "38": 22960222.0, + "39": 22864432.0, + "40": 22721420.0, + "41": 22722086.0, + "42": 22730128.0, + "43": 23040178.0, + "44": 22809816.0, + "45": 22738252.0, + "46": 22947510.0, + "47": 22697018.0, + "48": 22992168.0, + "49": 22790946.0, + "50": 22969044.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + 
"5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": 389056000.0, + "27": 389056000.0, + "28": 389056000.0, + "29": 389056000.0, + "30": 389056000.0, + "31": 389056000.0, + "32": 389056000.0, + "33": 389056000.0, + "34": 389056000.0, + "35": 389056000.0, + "36": 389056000.0, + "37": 389056000.0, + "38": 389056000.0, + "39": 389056000.0, + "40": 389056000.0, + "41": 389056000.0, + "42": 389056000.0, + "43": 389056000.0, + "44": 389056000.0, + "45": 389056000.0, + "46": 389056000.0, + "47": 389056000.0, + "48": 389056000.0, + "49": 389056000.0, + "50": 389056000.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": 1247206912.0, + "27": 1247207936.0, + "28": 1247207936.0, + "29": 1247207936.0, + "30": 1247207936.0, + "31": 1247207936.0, + "32": 1247207936.0, + "33": 1247207936.0, + "34": 1247207936.0, + "35": 1247207936.0, + "36": 1247207936.0, + "37": 1247207936.0, + "38": 1247207936.0, + "39": 1247207936.0, + "40": 1247207936.0, + "41": 1247207936.0, + "42": 1247207936.0, + "43": 1247207936.0, + "44": 1247207936.0, + "45": 1247207936.0, + "46": 1247207936.0, + "47": 1247207936.0, + "48": 1247207936.0, + "49": 1247207936.0, + "50": 1247207936.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": 
"nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": 2.4291, + "28": 0.15494, + "29": 0.14099, + "30": 0.13913, + "31": 0.1391, + "32": 0.13835, + "33": 0.13909, + "34": 0.13882, + "35": 0.13918, + "36": 0.13936, + "37": 0.1396, + "38": 0.14038, + "39": 0.14154, + "40": 0.14205, + "41": 0.14186, + "42": 0.1401, + "43": 0.14017, + "44": 0.14, + "45": 0.13933, + "46": 0.13921, + "47": 0.13941, + "48": 0.13867, + "49": 0.14055, + "50": 0.14041 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_gb200.json index 941c681adde..caa1e54ee64 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_gb200.json @@ -432,106 +432,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 6.23525, - "2": 0.24353, - "3": 0.25343, - "4": 0.21688, - "5": 0.2509, - "6": 0.23286, - "7": 0.23132, - "8": 0.23275, - "9": 0.23174, - "10": 0.28716, - "11": 0.23191, - "12": 0.23535, - "13": 0.26183, - "14": 0.23439, - "15": 0.26372, - "16": 0.27689, - "17": 0.31573, - "18": 0.29419, - "19": 0.22489, - "20": 0.21688, - "21": 0.21286, - "22": 0.21368, - "23": 0.21212, - "24": 0.21109, - "25": 0.21238, - "26": 0.21136, - "27": 0.24254, - "28": 0.21046, - "29": 0.21055, - "30": 0.37172, - "31": 0.20753, - "32": 0.22054, - "33": 0.20088, - "34": 0.20169, - "35": 0.2243, - "36": 0.20027, - "37": 0.20099, - "38": 
0.21205, - "39": 0.20018, - "40": 0.19821, - "41": 0.20033, - "42": 0.20078, - "43": 0.19985, - "44": 0.19983, - "45": 0.19756, - "46": 0.19892, - "47": 0.19813, - "48": 0.19885, - "49": 0.19949, - "50": 0.19861, - "51": 0.20481, - "52": 0.18697, - "53": 0.18628, - "54": 0.18383, - "55": 0.22054, - "56": 0.18628, - "57": 0.1865, - "58": 0.23363, - "59": 0.18779, - "60": 0.18548, - "61": 0.23086, - "62": 0.18486, - "63": 0.18676, - "64": 0.18877, - "65": 0.18818, - "66": 0.18785, - "67": 0.18912, - "68": 0.18762, - "69": 0.18502, - "70": 0.2393, - "71": 0.18534, - "72": 0.1866, - "73": 0.18699, - "74": 0.2218, - "75": 0.18851, - "76": 0.18761, - "77": 0.18836, - "78": 0.22737, - "79": 0.18832, - "80": 0.18852, - "81": 0.2185, - "82": 0.18552, - "83": 0.19385, - "84": 0.18774, - "85": 0.1898, - "86": 0.3457, - "87": 0.4164, - "88": 0.18999, - "89": 0.1872, - "90": 0.18803, - "91": 0.22713, - "92": 0.18693, - "93": 0.18603, - "94": 0.18711, - "95": 0.18552, - "96": 0.22396, - "97": 0.18576, - "98": 0.18988, - "99": 0.21054, - "100": 0.21361 + "1": "nan", + "2": 2.48366, + "3": 0.20961, + "4": 0.19355, + "5": 0.19146, + "6": 0.19108, + "7": 0.19236, + "8": 0.19259, + "9": 0.19267, + "10": 0.19436, + "11": 0.19257, + "12": 0.19432, + "13": 0.19332, + "14": 0.19442, + "15": 0.19393, + "16": 0.19417, + "17": 0.19555, + "18": 0.19451, + "19": 0.19452, + "20": 0.19555, + "21": 0.19375, + "22": 0.19402, + "23": 0.19539, + "24": 0.19475, + "25": 0.19576, + "26": 0.19424, + "27": 0.19514, + "28": 0.19519, + "29": 0.19578, + "30": 0.19503, + "31": 0.19394, + "32": 0.19582, + "33": 0.19444, + "34": 0.19405, + "35": 0.19498, + "36": 0.19463, + "37": 0.19572, + "38": 0.19362, + "39": 0.19492, + "40": 0.19487, + "41": 0.19497, + "42": 0.19617, + "43": 0.19571, + "44": 0.19661, + "45": 0.19634, + "46": 0.19537, + "47": 0.19646, + "48": 0.19658, + "49": 0.19727, + "50": 0.19567, + "51": 0.21203, + "52": 0.19551, + "53": 0.19415, + "54": 0.19434, + "55": 0.19584, + "56": 0.19437, + 
"57": 0.19536, + "58": 0.20364, + "59": 0.20029, + "60": 0.1929, + "61": 0.19274, + "62": 0.19364, + "63": 0.19667, + "64": 0.19406, + "65": 0.19781, + "66": 0.19435, + "67": 0.19308, + "68": 0.1932, + "69": 0.19478, + "70": 0.19591, + "71": 0.19922, + "72": 0.19646, + "73": 0.19646, + "74": 0.19739, + "75": 0.19817, + "76": 0.20056, + "77": 0.19655, + "78": 0.19459, + "79": 0.19478, + "80": 0.19638, + "81": 0.19329, + "82": 0.19254, + "83": 0.19379, + "84": 0.19435, + "85": 0.19517, + "86": 0.19446, + "87": 0.19464, + "88": 0.19501, + "89": 0.19544, + "90": 0.19268, + "91": 0.19425, + "92": 0.1933, + "93": 0.19366, + "94": 0.19328, + "95": 0.19408, + "96": 0.19474, + "97": 0.19719, + "98": 0.19535, + "99": 0.19604, + "100": 0.19554 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_gb200_2nd.json new file mode 100644 index 00000000000..848d772bc72 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_gb200_2nd.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", 
+ "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 9.84743, + "52": 9.74209, + "53": 10.05697, + "54": 9.9505, + "55": 9.88145, + "56": 9.61274, + "57": 9.4687, + "58": 9.82193, + "59": 9.57642, + "60": 9.49762, + "61": 9.69189, + "62": 9.9867, + "63": 9.37512, + "64": 9.76679, + "65": 8.94648, + "66": 9.7023, + "67": 9.36326, + "68": 9.7831, + "69": 9.7986, + "70": 9.7317, + "71": 9.62571, + "72": 9.58488, + "73": 9.48967, + "74": 8.9286, + "75": 9.40862, + "76": 9.07925, + "77": 10.0594, + "78": 9.72288, + "79": 9.37784, + "80": 9.40429, + "81": 9.48309, + "82": 9.7004, + "83": 9.31595, + "84": 9.41838, + "85": 9.61685, + "86": 9.07533, + "87": 9.59616, + "88": 9.75215, + "89": 9.60184, + "90": 9.82281, + "91": 9.34037, + "92": 9.35854, + "93": 9.08805, + "94": 8.83037, + "95": 9.5266, + "96": 9.53049, + "97": 9.30389, + "98": 9.67196, + "99": 8.89637, + "100": 9.40644 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 2491.0, + "52": 2552.0, + "53": 2980.0, + "54": 2680.0, + "55": 2274.0, + "56": 2734.0, + "57": 2319.0, + "58": 2907.0, + "59": 2886.0, + "60": 2566.0, + "61": 2855.0, + "62": 2704.0, + "63": 2370.0, + "64": 2998.0, + "65": 2563.0, + "66": 2868.0, + 
"67": 2762.0, + "68": 2739.0, + "69": 2730.0, + "70": 3156.0, + "71": 2803.0, + "72": 2506.0, + "73": 2896.0, + "74": 1937.0, + "75": 2450.0, + "76": 2794.0, + "77": 3047.0, + "78": 3104.0, + "79": 3069.0, + "80": 3286.0, + "81": 3543.0, + "82": 3192.0, + "83": 2614.0, + "84": 3273.0, + "85": 3111.0, + "86": 2680.0, + "87": 3654.0, + "88": 3117.0, + "89": 3351.0, + "90": 3086.0, + "91": 2721.0, + "92": 3045.0, + "93": 2672.0, + "94": 3326.0, + "95": 3125.0, + "96": 3309.0, + "97": 3208.0, + "98": 3572.0, + "99": 2980.0, + "100": 3355.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 515933696.0, + "52": 515933696.0, + "53": 515933696.0, + "54": 515933696.0, + "55": 515933696.0, + "56": 515933696.0, + "57": 515933696.0, + "58": 515933696.0, + "59": 515933696.0, + "60": 515933696.0, + "61": 515933696.0, + "62": 515933696.0, + "63": 515933696.0, + "64": 515933696.0, + "65": 515933696.0, + "66": 515933696.0, + "67": 515933696.0, + "68": 515933696.0, + "69": 515933696.0, + "70": 515933696.0, + "71": 515933696.0, + "72": 515933696.0, + "73": 515933696.0, + "74": 515933696.0, + "75": 515933696.0, + "76": 515933696.0, + "77": 515933696.0, + "78": 515933696.0, + "79": 
515933696.0, + "80": 515933696.0, + "81": 515933696.0, + "82": 515933696.0, + "83": 515933696.0, + "84": 515933696.0, + "85": 515933696.0, + "86": 515933696.0, + "87": 515933696.0, + "88": 515933696.0, + "89": 515933696.0, + "90": 515933696.0, + "91": 515933696.0, + "92": 515933696.0, + "93": 515933696.0, + "94": 515933696.0, + "95": 515933696.0, + "96": 515933696.0, + "97": 515933696.0, + "98": 515933696.0, + "99": 515933696.0, + "100": 515933696.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 1440754176.0, + "52": 1440755200.0, + "53": 1440755200.0, + "54": 1440755200.0, + "55": 1440755200.0, + "56": 1440755200.0, + "57": 1440755200.0, + "58": 1440755200.0, + "59": 1440755200.0, + "60": 1440755200.0, + "61": 1440755200.0, + "62": 1440755200.0, + "63": 1440755200.0, + "64": 1440755200.0, + "65": 1440755200.0, + "66": 1440755200.0, + "67": 1440755200.0, + "68": 1440755200.0, + "69": 1440755200.0, + "70": 1440755200.0, + "71": 1440755200.0, + "72": 1440755200.0, + "73": 1440755200.0, + "74": 1440755200.0, + "75": 1440755200.0, + "76": 1440755200.0, + "77": 1440755200.0, + "78": 1440755200.0, + "79": 1440755200.0, + "80": 1440755200.0, + "81": 1440755200.0, + "82": 
1440755200.0, + "83": 1440755200.0, + "84": 1440755200.0, + "85": 1440755200.0, + "86": 1440755200.0, + "87": 1440755200.0, + "88": 1440755200.0, + "89": 1440755200.0, + "90": 1440755200.0, + "91": 1440755200.0, + "92": 1440755200.0, + "93": 1440755200.0, + "94": 1440755200.0, + "95": 1440755200.0, + "96": 1440755200.0, + "97": 1440755200.0, + "98": 1440755200.0, + "99": 1440755200.0, + "100": 1440755200.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": "nan", + "52": 2.50794, + "53": 0.19671, + "54": 0.18376, + "55": 0.18594, + "56": 0.18674, + "57": 0.18474, + "58": 0.18412, + "59": 0.18456, + "60": 0.18456, + "61": 0.18623, + "62": 0.18524, + "63": 0.18624, + "64": 0.18621, + "65": 0.18695, + "66": 0.18541, + "67": 0.1857, + "68": 0.18575, + "69": 0.18658, + "70": 0.1875, + "71": 0.18753, + "72": 0.18718, + "73": 0.18797, + "74": 0.18972, + "75": 0.18765, + "76": 0.18764, + "77": 0.18827, + "78": 0.18801, + "79": 0.18785, + "80": 0.18903, + "81": 0.18889, + "82": 0.18772, + "83": 0.18876, + "84": 0.18791, + "85": 0.18973, + "86": 0.18948, + "87": 0.18998, + "88": 0.18905, + "89": 0.1898, + "90": 0.1895, + "91": 0.18953, + "92": 0.18969, + "93": 0.18888, + "94": 0.18888, + 
"95": 0.18773, + "96": 0.18832, + "97": 0.18919, + "98": 0.189, + "99": 0.1888, + "100": 0.188 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_gb200.json index 500fc1be7cf..bffbd713b20 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_gb200.json @@ -6,104 +6,104 @@ "values": { "1": 10.86244, "2": 10.88582, - "3": 10.84735, - "4": 10.85573, - "5": 10.86001, - "6": 10.87731, - "7": 10.86558, - "8": 10.84914, - "9": 10.86606, - "10": 10.82476, - "11": 10.85615, - "12": 10.85374, - "13": 10.8679, - "14": 10.87118, - "15": 10.82236, - "16": 10.79992, - "17": 10.77431, - "18": 10.78349, - "19": 10.79309, + "3": 10.84733, + "4": 10.85571, + "5": 10.86009, + "6": 10.87728, + "7": 10.86559, + "8": 10.84911, + "9": 10.86605, + "10": 10.82478, + "11": 10.85618, + "12": 10.85375, + "13": 10.86786, + "14": 10.87115, + "15": 10.82231, + "16": 10.79989, + "17": 10.77429, + "18": 10.78346, + "19": 10.79306, "20": 10.68226, - "21": 10.64711, - "22": 10.5092, - "23": 10.66829, - "24": 10.54196, - "25": 10.49278, - "26": 10.55935, - "27": 10.54234, + "21": 10.64716, + "22": 10.50918, + "23": 10.66831, + "24": 10.54197, + "25": 10.49277, + "26": 10.55929, + "27": 10.54236, "28": 10.5113, - "29": 10.53259, - "30": 10.28989, - "31": 10.0285, - "32": 10.38878, - "33": 10.39596, - "34": 10.13451, - "35": 10.18928, - "36": 10.13355, - "37": 10.2738, - "38": 10.10751, - "39": 10.3401, - "40": 9.98543, - "41": 10.06416, - "42": 10.13751, - "43": 9.73383, - "44": 9.86311, - "45": 9.73722, - "46": 9.71346, + "29": 
10.53258, + "30": 10.28987, + "31": 10.02854, + "32": 10.38881, + "33": 10.39595, + "34": 10.13452, + "35": 10.1893, + "36": 10.13353, + "37": 10.27381, + "38": 10.10749, + "39": 10.34007, + "40": 9.98535, + "41": 10.06411, + "42": 10.13748, + "43": 9.73379, + "44": 9.86307, + "45": 9.73726, + "46": 9.71341, "47": 10.07754, - "48": 9.76768, - "49": 9.41986, - "50": 9.81686, - "51": 9.77423, - "52": 9.66446, - "53": 10.00148, - "54": 9.89157, - "55": 9.8185, - "56": 9.54335, - "57": 9.39451, - "58": 9.76569, - "59": 9.50934, - "60": 9.42824, + "48": 9.76762, + "49": 9.41989, + "50": 9.81685, + "51": 9.7742, + "52": 9.66444, + "53": 10.00151, + "54": 9.89155, + "55": 9.81852, + "56": 9.54337, + "57": 9.39452, + "58": 9.76573, + "59": 9.50935, + "60": 9.42821, "61": 9.63468, - "62": 9.93888, + "62": 9.93891, "63": 9.30458, "64": 9.70984, - "65": 8.86892, - "66": 9.64956, - "67": 9.30818, - "68": 9.73508, - "69": 9.75593, - "70": 9.68707, - "71": 9.57532, - "72": 9.53074, - "73": 9.43675, - "74": 8.85588, - "75": 9.35531, - "76": 9.01375, - "77": 10.0245, - "78": 9.68203, - "79": 9.33141, - "80": 9.35466, - "81": 9.43622, - "82": 9.65854, - "83": 9.26268, - "84": 9.3692, - "85": 9.57098, - "86": 9.03323, - "87": 9.55969, - "88": 9.71078, - "89": 9.5541, + "65": 8.86888, + "66": 9.64952, + "67": 9.30815, + "68": 9.73505, + "69": 9.75596, + "70": 9.68706, + "71": 9.57535, + "72": 9.53075, + "73": 9.43678, + "74": 8.85586, + "75": 9.35532, + "76": 9.01377, + "77": 10.02449, + "78": 9.68205, + "79": 9.33138, + "80": 9.35467, + "81": 9.43621, + "82": 9.65855, + "83": 9.26269, + "84": 9.36921, + "85": 9.57104, + "86": 9.0332, + "87": 9.55973, + "88": 9.71077, + "89": 9.55411, "90": 9.78662, - "91": 9.2909, + "91": 9.29091, "92": 9.31236, "93": 9.03976, - "94": 8.78109, - "95": 9.49172, - "96": 9.49067, - "97": 9.25826, - "98": 9.62998, - "99": 8.84685, - "100": 9.36201 + "94": 8.78112, + "95": 9.49175, + "96": 9.49071, + "97": 9.25827, + "98": 9.63001, + "99": 8.84688, + 
"100": 9.36199 } }, "num-zeros": { @@ -111,106 +111,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 571.0, - "2": 604.0, - "3": 655.0, - "4": 633.0, - "5": 602.0, - "6": 699.0, - "7": 585.0, - "8": 589.0, - "9": 633.0, - "10": 527.0, - "11": 628.0, - "12": 602.0, - "13": 671.0, - "14": 627.0, - "15": 591.0, - "16": 605.0, - "17": 666.0, - "18": 604.0, - "19": 631.0, - "20": 604.0, - "21": 706.0, - "22": 598.0, - "23": 682.0, - "24": 656.0, - "25": 593.0, - "26": 615.0, - "27": 681.0, - "28": 693.0, - "29": 701.0, - "30": 699.0, - "31": 564.0, - "32": 781.0, - "33": 724.0, - "34": 679.0, - "35": 711.0, + "1": 562.0, + "2": 595.0, + "3": 621.0, + "4": 618.0, + "5": 626.0, + "6": 671.0, + "7": 641.0, + "8": 647.0, + "9": 601.0, + "10": 522.0, + "11": 648.0, + "12": 639.0, + "13": 681.0, + "14": 686.0, + "15": 638.0, + "16": 620.0, + "17": 601.0, + "18": 556.0, + "19": 628.0, + "20": 551.0, + "21": 695.0, + "22": 620.0, + "23": 708.0, + "24": 553.0, + "25": 566.0, + "26": 597.0, + "27": 672.0, + "28": 717.0, + "29": 763.0, + "30": 694.0, + "31": 627.0, + "32": 696.0, + "33": 820.0, + "34": 674.0, + "35": 741.0, "36": 733.0, - "37": 858.0, - "38": 794.0, - "39": 789.0, - "40": 857.0, - "41": 739.0, - "42": 856.0, - "43": 742.0, - "44": 798.0, - "45": 772.0, - "46": 872.0, - "47": 941.0, - "48": 838.0, - "49": 799.0, - "50": 840.0, - "51": 961.0, - "52": 952.0, - "53": 1057.0, - "54": 932.0, - "55": 849.0, - "56": 986.0, - "57": 853.0, - "58": 963.0, - "59": 1059.0, - "60": 895.0, - "61": 999.0, - "62": 967.0, + "37": 848.0, + "38": 788.0, + "39": 863.0, + "40": 812.0, + "41": 813.0, + "42": 812.0, + "43": 706.0, + "44": 810.0, + "45": 732.0, + "46": 863.0, + "47": 914.0, + "48": 886.0, + "49": 786.0, + "50": 872.0, + "51": 952.0, + "52": 963.0, + "53": 1095.0, + "54": 956.0, + "55": 844.0, + "56": 969.0, + "57": 831.0, + "58": 985.0, + "59": 1062.0, + "60": 868.0, + "61": 975.0, + "62": 897.0, "63": 928.0, - "64": 1046.0, - "65": 974.0, - "66": 998.0, - 
"67": 1078.0, - "68": 987.0, - "69": 976.0, - "70": 1112.0, - "71": 1031.0, - "72": 889.0, - "73": 1009.0, - "74": 778.0, - "75": 839.0, - "76": 1017.0, - "77": 1069.0, - "78": 1111.0, - "79": 1041.0, - "80": 1089.0, - "81": 1169.0, - "82": 1034.0, - "83": 951.0, - "84": 1098.0, - "85": 1124.0, - "86": 816.0, - "87": 1218.0, - "88": 1128.0, - "89": 1147.0, - "90": 1130.0, - "91": 1096.0, - "92": 1132.0, - "93": 900.0, - "94": 1119.0, - "95": 1095.0, - "96": 1160.0, - "97": 1006.0, - "98": 1240.0, - "99": 1141.0, - "100": 1108.0 + "64": 1085.0, + "65": 1058.0, + "66": 1068.0, + "67": 966.0, + "68": 999.0, + "69": 1021.0, + "70": 1103.0, + "71": 1068.0, + "72": 884.0, + "73": 1027.0, + "74": 757.0, + "75": 818.0, + "76": 981.0, + "77": 1091.0, + "78": 1135.0, + "79": 1105.0, + "80": 1126.0, + "81": 1181.0, + "82": 1095.0, + "83": 981.0, + "84": 1154.0, + "85": 1139.0, + "86": 804.0, + "87": 1216.0, + "88": 1139.0, + "89": 1113.0, + "90": 1071.0, + "91": 1180.0, + "92": 1100.0, + "93": 846.0, + "94": 1155.0, + "95": 1071.0, + "96": 1123.0, + "97": 1074.0, + "98": 1188.0, + "99": 1161.0, + "100": 1153.0 } }, "mem-allocated-bytes": { @@ -218,106 +218,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 459571712.0, - "2": 459571712.0, - "3": 459571712.0, - "4": 459571712.0, - "5": 459571712.0, - "6": 459571712.0, - "7": 459571712.0, - "8": 459571712.0, - "9": 459571712.0, - "10": 459571712.0, - "11": 459571712.0, - "12": 459571712.0, - "13": 459571712.0, - "14": 459571712.0, - "15": 459571712.0, - "16": 459571712.0, - "17": 459571712.0, - "18": 459571712.0, - "19": 459571712.0, - "20": 459571712.0, - "21": 459571712.0, - "22": 459571712.0, - "23": 459571712.0, - "24": 459571712.0, - "25": 459571712.0, - "26": 459571712.0, - "27": 459571712.0, - "28": 459571712.0, - "29": 459571712.0, - "30": 459571712.0, - "31": 459571712.0, - "32": 459571712.0, - "33": 459571712.0, - "34": 459571712.0, - "35": 459571712.0, - "36": 459571712.0, - "37": 459571712.0, - "38": 
459571712.0, - "39": 459571712.0, - "40": 459571712.0, - "41": 459571712.0, - "42": 459571712.0, - "43": 459571712.0, - "44": 459571712.0, - "45": 459571712.0, - "46": 459571712.0, - "47": 459571712.0, - "48": 459571712.0, - "49": 459571712.0, - "50": 459571712.0, - "51": 459571712.0, - "52": 459571712.0, - "53": 459571712.0, - "54": 459571712.0, - "55": 459571712.0, - "56": 459571712.0, - "57": 459571712.0, - "58": 459571712.0, - "59": 459571712.0, - "60": 459571712.0, - "61": 459571712.0, - "62": 459571712.0, - "63": 459571712.0, - "64": 459571712.0, - "65": 459571712.0, - "66": 459571712.0, - "67": 459571712.0, - "68": 459571712.0, - "69": 459571712.0, - "70": 459571712.0, - "71": 459571712.0, - "72": 459571712.0, - "73": 459571712.0, - "74": 459571712.0, - "75": 459571712.0, - "76": 459571712.0, - "77": 459571712.0, - "78": 459571712.0, - "79": 459571712.0, - "80": 459571712.0, - "81": 459571712.0, - "82": 459571712.0, - "83": 459571712.0, - "84": 459571712.0, - "85": 459571712.0, - "86": 459571712.0, - "87": 459571712.0, - "88": 459571712.0, - "89": 459571712.0, - "90": 459571712.0, - "91": 459571712.0, - "92": 459571712.0, - "93": 459571712.0, - "94": 459571712.0, - "95": 459571712.0, - "96": 459571712.0, - "97": 459571712.0, - "98": 459571712.0, - "99": 459571712.0, - "100": 459571712.0 + "1": 462062080.0, + "2": 462062080.0, + "3": 462062080.0, + "4": 462062080.0, + "5": 462062080.0, + "6": 462062080.0, + "7": 462062080.0, + "8": 462062080.0, + "9": 462062080.0, + "10": 462062080.0, + "11": 462062080.0, + "12": 462062080.0, + "13": 462062080.0, + "14": 462062080.0, + "15": 462062080.0, + "16": 462062080.0, + "17": 462062080.0, + "18": 462062080.0, + "19": 462062080.0, + "20": 462062080.0, + "21": 462062080.0, + "22": 462062080.0, + "23": 462062080.0, + "24": 462062080.0, + "25": 462062080.0, + "26": 462062080.0, + "27": 462062080.0, + "28": 462062080.0, + "29": 462062080.0, + "30": 462062080.0, + "31": 462062080.0, + "32": 462062080.0, + "33": 462062080.0, 
+ "34": 462062080.0, + "35": 462062080.0, + "36": 462062080.0, + "37": 462062080.0, + "38": 462062080.0, + "39": 462062080.0, + "40": 462062080.0, + "41": 462062080.0, + "42": 462062080.0, + "43": 462062080.0, + "44": 462062080.0, + "45": 462062080.0, + "46": 462062080.0, + "47": 462062080.0, + "48": 462062080.0, + "49": 462062080.0, + "50": 462062080.0, + "51": 462062080.0, + "52": 462062080.0, + "53": 462062080.0, + "54": 462062080.0, + "55": 462062080.0, + "56": 462062080.0, + "57": 462062080.0, + "58": 462062080.0, + "59": 462062080.0, + "60": 462062080.0, + "61": 462062080.0, + "62": 462062080.0, + "63": 462062080.0, + "64": 462062080.0, + "65": 462062080.0, + "66": 462062080.0, + "67": 462062080.0, + "68": 462062080.0, + "69": 462062080.0, + "70": 462062080.0, + "71": 462062080.0, + "72": 462062080.0, + "73": 462062080.0, + "74": 462062080.0, + "75": 462062080.0, + "76": 462062080.0, + "77": 462062080.0, + "78": 462062080.0, + "79": 462062080.0, + "80": 462062080.0, + "81": 462062080.0, + "82": 462062080.0, + "83": 462062080.0, + "84": 462062080.0, + "85": 462062080.0, + "86": 462062080.0, + "87": 462062080.0, + "88": 462062080.0, + "89": 462062080.0, + "90": 462062080.0, + "91": 462062080.0, + "92": 462062080.0, + "93": 462062080.0, + "94": 462062080.0, + "95": 462062080.0, + "96": 462062080.0, + "97": 462062080.0, + "98": 462062080.0, + "99": 462062080.0, + "100": 462062080.0 } }, "mem-max-allocated-bytes": { @@ -325,106 +325,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 705635840.0, - "2": 883610112.0, - "3": 883610624.0, - "4": 883610624.0, - "5": 883610624.0, - "6": 884657152.0, - "7": 884657152.0, - "8": 884657152.0, - "9": 884657152.0, - "10": 884657152.0, - "11": 884657152.0, - "12": 884657152.0, - "13": 884657152.0, - "14": 884657152.0, - "15": 884659712.0, - "16": 884659712.0, - "17": 884659712.0, - "18": 884659712.0, - "19": 884659712.0, - "20": 884659712.0, - "21": 884659712.0, - "22": 884659712.0, - "23": 884659712.0, - "24": 
884659712.0, - "25": 884659712.0, - "26": 884659712.0, - "27": 884659712.0, - "28": 884659712.0, - "29": 884659712.0, - "30": 884659712.0, - "31": 884659712.0, - "32": 884659712.0, - "33": 884659712.0, - "34": 884659712.0, - "35": 884659712.0, - "36": 884659712.0, - "37": 884659712.0, - "38": 884659712.0, - "39": 884659712.0, - "40": 884659712.0, - "41": 884659712.0, - "42": 884659712.0, - "43": 884659712.0, - "44": 884659712.0, - "45": 884659712.0, - "46": 884659712.0, - "47": 884659712.0, - "48": 884659712.0, - "49": 884659712.0, - "50": 884659712.0, - "51": 884659712.0, - "52": 884659712.0, - "53": 884659712.0, - "54": 884659712.0, - "55": 884659712.0, - "56": 884659712.0, - "57": 884659712.0, - "58": 884659712.0, - "59": 884659712.0, - "60": 884659712.0, - "61": 884659712.0, - "62": 884659712.0, - "63": 884659712.0, - "64": 884659712.0, - "65": 884659712.0, - "66": 884659712.0, - "67": 884659712.0, - "68": 884659712.0, - "69": 884659712.0, - "70": 884659712.0, - "71": 884659712.0, - "72": 884659712.0, - "73": 884659712.0, - "74": 884659712.0, - "75": 884659712.0, - "76": 884659712.0, - "77": 884659712.0, - "78": 884659712.0, - "79": 884659712.0, - "80": 884659712.0, - "81": 884659712.0, - "82": 884659712.0, - "83": 884659712.0, - "84": 884659712.0, - "85": 884659712.0, - "86": 884659712.0, - "87": 884659712.0, - "88": 884659712.0, - "89": 884659712.0, - "90": 884659712.0, - "91": 884659712.0, - "92": 884659712.0, - "93": 884659712.0, - "94": 884659712.0, - "95": 884659712.0, - "96": 884659712.0, - "97": 884659712.0, - "98": 884659712.0, - "99": 884659712.0, - "100": 884659712.0 + "1": 703538688.0, + "2": 884528640.0, + "3": 884528640.0, + "4": 884528640.0, + "5": 884528640.0, + "6": 884528640.0, + "7": 884528640.0, + "8": 884528640.0, + "9": 884528640.0, + "10": 884528640.0, + "11": 884528640.0, + "12": 884528640.0, + "13": 884528640.0, + "14": 884528640.0, + "15": 884528640.0, + "16": 884528640.0, + "17": 884528640.0, + "18": 884528640.0, + "19": 884528640.0, 
+ "20": 884528640.0, + "21": 884528640.0, + "22": 884528640.0, + "23": 884528640.0, + "24": 884528640.0, + "25": 884528640.0, + "26": 884528640.0, + "27": 884528640.0, + "28": 884528640.0, + "29": 884528640.0, + "30": 884528640.0, + "31": 884528640.0, + "32": 884528640.0, + "33": 884528640.0, + "34": 884528640.0, + "35": 884528640.0, + "36": 884528640.0, + "37": 884528640.0, + "38": 884528640.0, + "39": 884528640.0, + "40": 884528640.0, + "41": 884528640.0, + "42": 884528640.0, + "43": 884528640.0, + "44": 884528640.0, + "45": 884528640.0, + "46": 884528640.0, + "47": 884528640.0, + "48": 884528640.0, + "49": 884528640.0, + "50": 884528640.0, + "51": 884528640.0, + "52": 884528640.0, + "53": 884528640.0, + "54": 884528640.0, + "55": 884528640.0, + "56": 884528640.0, + "57": 884528640.0, + "58": 884528640.0, + "59": 884528640.0, + "60": 884528640.0, + "61": 884528640.0, + "62": 884528640.0, + "63": 884528640.0, + "64": 884528640.0, + "65": 884528640.0, + "66": 885575168.0, + "67": 885575168.0, + "68": 885575168.0, + "69": 885575168.0, + "70": 885575168.0, + "71": 885575168.0, + "72": 885575168.0, + "73": 885575168.0, + "74": 885575168.0, + "75": 885575168.0, + "76": 885575168.0, + "77": 885575168.0, + "78": 885575168.0, + "79": 885575168.0, + "80": 885575168.0, + "81": 885575168.0, + "82": 885575168.0, + "83": 885575168.0, + "84": 885575168.0, + "85": 885575168.0, + "86": 885575168.0, + "87": 885575168.0, + "88": 885575168.0, + "89": 885575168.0, + "90": 885575168.0, + "91": 885575168.0, + "92": 885575168.0, + "93": 885575168.0, + "94": 885575168.0, + "95": 885575168.0, + "96": 885575168.0, + "97": 885575168.0, + "98": 885575168.0, + "99": 885575168.0, + "100": 885575168.0 } }, "iteration-time": { @@ -432,106 +432,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 13.71622, - "2": 0.68805, - "3": 0.57225, - "4": 0.54755, - "5": 0.48793, - "6": 0.52239, - "7": 0.49126, - "8": 0.49498, - "9": 0.58476, - "10": 0.4973, - "11": 0.49619, - "12": 0.49824, - 
"13": 0.49835, - "14": 0.49548, - "15": 0.49404, - "16": 0.50855, - "17": 0.76935, - "18": 0.49519, - "19": 0.49579, - "20": 0.50812, - "21": 0.50221, - "22": 0.49623, - "23": 0.50166, - "24": 0.4965, - "25": 0.49653, - "26": 0.56522, - "27": 0.50204, - "28": 0.4912, - "29": 0.49165, - "30": 0.49253, - "31": 0.48561, - "32": 0.50414, - "33": 0.49461, - "34": 0.48721, - "35": 0.50775, - "36": 0.5025, - "37": 0.49896, - "38": 0.50015, - "39": 0.50322, - "40": 0.51086, - "41": 0.51074, - "42": 0.49461, - "43": 0.5049, - "44": 0.47567, - "45": 0.51176, - "46": 0.51628, - "47": 0.50424, - "48": 0.50299, - "49": 0.50456, - "50": 0.51299, - "51": 0.50546, - "52": 0.48547, - "53": 0.48643, - "54": 0.49187, - "55": 0.50244, - "56": 0.5003, - "57": 0.49723, - "58": 0.5007, - "59": 0.50341, - "60": 0.49703, - "61": 0.49913, - "62": 0.48748, - "63": 0.52659, - "64": 0.49384, - "65": 0.48632, - "66": 0.49435, - "67": 0.49537, - "68": 0.49543, - "69": 0.48543, - "70": 0.49128, - "71": 0.49386, - "72": 0.49681, - "73": 0.49076, - "74": 0.50662, - "75": 0.51506, - "76": 0.51539, - "77": 0.51263, - "78": 0.51094, - "79": 0.50786, - "80": 0.85887, - "81": 0.51151, - "82": 0.50586, - "83": 0.51628, - "84": 0.48942, - "85": 0.50794, - "86": 0.45205, - "87": 0.51667, - "88": 0.52246, - "89": 0.51352, - "90": 0.48616, - "91": 0.51165, - "92": 0.52646, - "93": 0.52475, - "94": 0.50978, - "95": 0.50426, - "96": 0.50587, - "97": 0.52063, - "98": 0.52056, - "99": 0.50217, - "100": 0.50666 + "1": "nan", + "2": 3.73344, + "3": 0.435, + "4": 0.42269, + "5": 0.42143, + "6": 0.42169, + "7": 0.42023, + "8": 0.42024, + "9": 0.42222, + "10": 0.42242, + "11": 0.42053, + "12": 0.42268, + "13": 0.41982, + "14": 0.41832, + "15": 0.41832, + "16": 0.41936, + "17": 0.41957, + "18": 0.41869, + "19": 0.4182, + "20": 0.41746, + "21": 0.41737, + "22": 0.40981, + "23": 0.4096, + "24": 0.40573, + "25": 0.40471, + "26": 0.40427, + "27": 0.40639, + "28": 0.40633, + "29": 0.40533, + "30": 0.40576, + "31": 0.40376, 
+ "32": 0.40338, + "33": 0.40605, + "34": 0.40135, + "35": 0.40398, + "36": 0.40309, + "37": 0.40852, + "38": 0.40572, + "39": 0.40092, + "40": 0.40543, + "41": 0.40495, + "42": 0.40518, + "43": 0.40074, + "44": 0.40306, + "45": 0.40179, + "46": 0.40307, + "47": 0.40246, + "48": 0.4024, + "49": 0.40234, + "50": 0.40238, + "51": 0.41546, + "52": 0.39352, + "53": 0.39475, + "54": 0.39652, + "55": 0.40055, + "56": 0.39993, + "57": 0.40166, + "58": 0.40151, + "59": 0.40191, + "60": 0.40194, + "61": 0.40084, + "62": 0.39989, + "63": 0.40157, + "64": 0.40012, + "65": 0.40034, + "66": 0.40082, + "67": 0.40008, + "68": 0.39842, + "69": 0.39844, + "70": 0.40021, + "71": 0.39935, + "72": 0.40145, + "73": 0.39804, + "74": 0.39495, + "75": 0.39605, + "76": 0.39578, + "77": 0.39653, + "78": 0.39694, + "79": 0.95682, + "80": 0.39818, + "81": 0.39646, + "82": 0.39909, + "83": 0.4044, + "84": 0.39893, + "85": 0.39807, + "86": 0.39917, + "87": 0.39513, + "88": 0.39647, + "89": 0.39761, + "90": 0.39883, + "91": 0.39867, + "92": 0.39686, + "93": 0.39611, + "94": 0.39717, + "95": 0.39645, + "96": 0.39632, + "97": 0.39808, + "98": 0.39732, + "99": 0.39829, + "100": 0.39861 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_gb200_2nd.json new file mode 100644 index 00000000000..1bf0169b170 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_gb200_2nd.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + 
"14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 9.7742, + "52": 9.66445, + "53": 10.00151, + "54": 9.89155, + "55": 9.81849, + "56": 9.54335, + "57": 9.39451, + "58": 9.76573, + "59": 9.5093, + "60": 9.42825, + "61": 9.63467, + "62": 9.93887, + "63": 9.30457, + "64": 9.70983, + "65": 8.86882, + "66": 9.64953, + "67": 9.3082, + "68": 9.73505, + "69": 9.7559, + "70": 9.68706, + "71": 9.57534, + "72": 9.53073, + "73": 9.43677, + "74": 8.85587, + "75": 9.35529, + "76": 9.01373, + "77": 10.02452, + "78": 9.68203, + "79": 9.33141, + "80": 9.35469, + "81": 9.43623, + "82": 9.65853, + "83": 9.26266, + "84": 9.36921, + "85": 9.571, + "86": 9.03325, + "87": 9.55972, + "88": 9.71078, + "89": 9.5541, + "90": 9.78661, + "91": 9.29086, + "92": 9.31236, + "93": 9.03977, + "94": 8.78115, + "95": 9.49176, + "96": 9.4907, + "97": 9.25833, + "98": 9.63003, + "99": 8.84687, + "100": 9.36199 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + 
"35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 920.0, + "52": 908.0, + "53": 1052.0, + "54": 889.0, + "55": 846.0, + "56": 985.0, + "57": 838.0, + "58": 1021.0, + "59": 1045.0, + "60": 896.0, + "61": 987.0, + "62": 926.0, + "63": 912.0, + "64": 1081.0, + "65": 991.0, + "66": 1095.0, + "67": 964.0, + "68": 938.0, + "69": 1005.0, + "70": 1013.0, + "71": 1082.0, + "72": 896.0, + "73": 1035.0, + "74": 687.0, + "75": 920.0, + "76": 1063.0, + "77": 1086.0, + "78": 1136.0, + "79": 1065.0, + "80": 1111.0, + "81": 1229.0, + "82": 1100.0, + "83": 944.0, + "84": 1182.0, + "85": 1100.0, + "86": 790.0, + "87": 1132.0, + "88": 1071.0, + "89": 1148.0, + "90": 1121.0, + "91": 1120.0, + "92": 1115.0, + "93": 944.0, + "94": 1126.0, + "95": 1116.0, + "96": 1115.0, + "97": 995.0, + "98": 1234.0, + "99": 1120.0, + "100": 1148.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 461669888.0, + "52": 461669888.0, + "53": 461669888.0, + "54": 461669888.0, + "55": 461669888.0, + "56": 461669888.0, + 
"57": 461669888.0, + "58": 461669888.0, + "59": 461669888.0, + "60": 461669888.0, + "61": 461669888.0, + "62": 461669888.0, + "63": 461669888.0, + "64": 461669888.0, + "65": 461669888.0, + "66": 461669888.0, + "67": 461669888.0, + "68": 461669888.0, + "69": 461669888.0, + "70": 461669888.0, + "71": 461669888.0, + "72": 461669888.0, + "73": 461669888.0, + "74": 461669888.0, + "75": 461669888.0, + "76": 461669888.0, + "77": 461669888.0, + "78": 461669888.0, + "79": 461669888.0, + "80": 461669888.0, + "81": 461669888.0, + "82": 461669888.0, + "83": 461669888.0, + "84": 461669888.0, + "85": 461669888.0, + "86": 461669888.0, + "87": 461669888.0, + "88": 461669888.0, + "89": 461669888.0, + "90": 461669888.0, + "91": 461669888.0, + "92": 461669888.0, + "93": 461669888.0, + "94": 461669888.0, + "95": 461669888.0, + "96": 461669888.0, + "97": 461669888.0, + "98": 461669888.0, + "99": 461669888.0, + "100": 461669888.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 889377280.0, + "52": 889378304.0, + "53": 889378304.0, + "54": 889378304.0, + "55": 889378304.0, + "56": 889378304.0, + "57": 889378304.0, + "58": 889378304.0, + "59": 889378304.0, + "60": 889378304.0, + "61": 
889378304.0, + "62": 889378304.0, + "63": 889378304.0, + "64": 889378304.0, + "65": 889378304.0, + "66": 889378304.0, + "67": 889378304.0, + "68": 889378304.0, + "69": 889378304.0, + "70": 889378304.0, + "71": 889378304.0, + "72": 889378304.0, + "73": 889378304.0, + "74": 889378304.0, + "75": 889378304.0, + "76": 889378304.0, + "77": 889378304.0, + "78": 889378304.0, + "79": 889378304.0, + "80": 889378304.0, + "81": 889378304.0, + "82": 889378304.0, + "83": 889378304.0, + "84": 889378304.0, + "85": 889378304.0, + "86": 889378304.0, + "87": 889378304.0, + "88": 889378304.0, + "89": 889378304.0, + "90": 889378816.0, + "91": 889378816.0, + "92": 889378816.0, + "93": 889378816.0, + "94": 889378816.0, + "95": 889378816.0, + "96": 889378816.0, + "97": 889378816.0, + "98": 889378816.0, + "99": 889378816.0, + "100": 889378816.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": "nan", + "52": 3.77319, + "53": 0.42363, + "54": 0.41071, + "55": 0.41011, + "56": 0.40905, + "57": 0.40957, + "58": 0.41032, + "59": 0.40997, + "60": 0.4109, + "61": 0.4104, + "62": 0.40989, + "63": 0.40974, + "64": 0.40928, + "65": 0.40668, + "66": 0.4076, + "67": 0.41006, + "68": 0.41114, + "69": 0.40437, + 
"70": 0.40702, + "71": 0.4095, + "72": 0.41064, + "73": 0.40549, + "74": 0.40683, + "75": 0.4055, + "76": 0.40589, + "77": 0.40198, + "78": 0.40196, + "79": 0.40383, + "80": 0.40596, + "81": 0.40678, + "82": 0.40646, + "83": 0.40861, + "84": 0.40858, + "85": 0.40709, + "86": 0.40475, + "87": 0.41028, + "88": 0.40188, + "89": 0.40272, + "90": 0.4034, + "91": 0.40676, + "92": 0.40732, + "93": 0.40103, + "94": 0.40501, + "95": 0.4043, + "96": 0.40452, + "97": 0.40255, + "98": 0.40532, + "99": 0.40632, + "100": 0.4042 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_dev_dgx_gb200.json index c8639e2d542..04cd8d66a75 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_dev_dgx_gb200.json @@ -6,104 +6,104 @@ "values": { "1": 10.86836, "2": 10.88595, - "3": 10.86559, - "4": 10.86893, - "5": 10.87417, - "6": 10.89061, - "7": 10.87673, - "8": 10.8647, - "9": 10.88231, - "10": 10.84582, - "11": 10.87165, - "12": 10.87421, - "13": 10.88164, - "14": 10.88885, - "15": 10.83927, - "16": 10.825, + "3": 10.86558, + "4": 10.86895, + "5": 10.87421, + "6": 10.89062, + "7": 10.87675, + "8": 10.86479, + "9": 10.88234, + "10": 10.8458, + "11": 10.87162, + "12": 10.8742, + "13": 10.88161, + "14": 10.88884, + "15": 10.83933, + "16": 10.82498, "17": 10.80147, "18": 10.81236, - "19": 10.82153, - "20": 10.71933, + "19": 10.82152, + "20": 10.71934, "21": 10.6909, - "22": 10.57427, - "23": 10.71093, - "24": 10.59784, - "25": 10.5556, + "22": 10.57424, + "23": 10.71092, + "24": 10.5978, + "25": 10.55562, "26": 10.61523, - 
"27": 10.60454, - "28": 10.56483, - "29": 10.58475, - "30": 10.35945, - "31": 10.12153, - "32": 10.45236, - "33": 10.45724, - "34": 10.21987, - "35": 10.2644, - "36": 10.21038, - "37": 10.33961, - "38": 10.18012, - "39": 10.39589, + "27": 10.60449, + "28": 10.56485, + "29": 10.58474, + "30": 10.35951, + "31": 10.12154, + "32": 10.45235, + "33": 10.4572, + "34": 10.2199, + "35": 10.26443, + "36": 10.21037, + "37": 10.33956, + "38": 10.18016, + "39": 10.39593, "40": 10.0663, - "41": 10.14169, - "42": 10.2085, - "43": 9.83125, - "44": 9.94861, - "45": 9.82847, - "46": 9.80462, + "41": 10.14164, + "42": 10.20852, + "43": 9.8313, + "44": 9.94856, + "45": 9.82849, + "46": 9.80457, "47": 10.14229, - "48": 9.84463, - "49": 9.52194, - "50": 9.88607, - "51": 9.84982, - "52": 9.74429, + "48": 9.84462, + "49": 9.52191, + "50": 9.88601, + "51": 9.8498, + "52": 9.74427, "53": 10.05843, - "54": 9.95129, + "54": 9.95125, "55": 9.88343, - "56": 9.61329, + "56": 9.61327, "57": 9.46899, - "58": 9.82161, - "59": 9.57702, - "60": 9.49786, - "61": 9.69256, - "62": 9.98595, - "63": 9.37403, - "64": 9.76605, - "65": 8.94649, - "66": 9.70105, - "67": 9.36367, - "68": 9.78237, - "69": 9.79879, - "70": 9.73166, - "71": 9.62508, - "72": 9.58312, - "73": 9.48822, - "74": 8.92611, - "75": 9.40725, + "58": 9.82164, + "59": 9.57703, + "60": 9.49784, + "61": 9.69255, + "62": 9.98596, + "63": 9.37402, + "64": 9.76603, + "65": 8.94654, + "66": 9.70099, + "67": 9.36365, + "68": 9.78238, + "69": 9.7988, + "70": 9.73169, + "71": 9.62505, + "72": 9.58309, + "73": 9.4882, + "74": 8.92607, + "75": 9.40727, "76": 9.07708, - "77": 10.05858, - "78": 9.7221, - "79": 9.37662, - "80": 9.40273, - "81": 9.48209, - "82": 9.6995, - "83": 9.31351, + "77": 10.0586, + "78": 9.72209, + "79": 9.37663, + "80": 9.40272, + "81": 9.48207, + "82": 9.69954, + "83": 9.31354, "84": 9.4173, - "85": 9.61584, - "86": 9.07429, - "87": 9.59551, - "88": 9.75065, + "85": 9.61582, + "86": 9.07431, + "87": 9.59556, + "88": 9.75064, 
"89": 9.6004, - "90": 9.8221, - "91": 9.33876, - "92": 9.3578, - "93": 9.08672, - "94": 8.82958, + "90": 9.82205, + "91": 9.33874, + "92": 9.35779, + "93": 9.08668, + "94": 8.8296, "95": 9.52596, - "96": 9.52973, + "96": 9.52974, "97": 9.30335, "98": 9.67136, - "99": 8.89537, - "100": 9.40568 + "99": 8.89539, + "100": 9.40567 } }, "num-zeros": { @@ -111,106 +111,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 1535.0, - "2": 1566.0, - "3": 1736.0, - "4": 1667.0, - "5": 1821.0, - "6": 1743.0, - "7": 1753.0, - "8": 1683.0, - "9": 1801.0, - "10": 1363.0, - "11": 1688.0, - "12": 1722.0, - "13": 1831.0, - "14": 1630.0, - "15": 1842.0, - "16": 1763.0, - "17": 1822.0, - "18": 1543.0, - "19": 1709.0, - "20": 1618.0, - "21": 1878.0, - "22": 1591.0, - "23": 1932.0, - "24": 1597.0, - "25": 1549.0, - "26": 1621.0, - "27": 1732.0, - "28": 1921.0, - "29": 1931.0, - "30": 1880.0, - "31": 1483.0, - "32": 1832.0, - "33": 2077.0, - "34": 1814.0, - "35": 1908.0, - "36": 1856.0, - "37": 2378.0, - "38": 2057.0, - "39": 2342.0, - "40": 2151.0, - "41": 2265.0, - "42": 2146.0, - "43": 1897.0, - "44": 2097.0, - "45": 2059.0, - "46": 2303.0, - "47": 2451.0, - "48": 2255.0, - "49": 2310.0, - "50": 2472.0, - "51": 2560.0, - "52": 2622.0, - "53": 2835.0, - "54": 2696.0, - "55": 2322.0, - "56": 2793.0, - "57": 2247.0, - "58": 2951.0, - "59": 2850.0, - "60": 2515.0, - "61": 2874.0, - "62": 2686.0, - "63": 2448.0, - "64": 2936.0, - "65": 2670.0, - "66": 2814.0, - "67": 2782.0, - "68": 2808.0, - "69": 2901.0, - "70": 3044.0, - "71": 2876.0, - "72": 2508.0, - "73": 2893.0, - "74": 1974.0, - "75": 2488.0, - "76": 2881.0, - "77": 3104.0, - "78": 3241.0, - "79": 3196.0, - "80": 3322.0, - "81": 3594.0, - "82": 3215.0, - "83": 2643.0, - "84": 3180.0, - "85": 3159.0, - "86": 2619.0, - "87": 3774.0, - "88": 3025.0, - "89": 3322.0, - "90": 3043.0, - "91": 2830.0, - "92": 3015.0, - "93": 2758.0, - "94": 3190.0, - "95": 3172.0, - "96": 3453.0, - "97": 3176.0, - "98": 3590.0, - "99": 3059.0, - 
"100": 3290.0 + "1": 1580.0, + "2": 1610.0, + "3": 1625.0, + "4": 1685.0, + "5": 1825.0, + "6": 1771.0, + "7": 1831.0, + "8": 1645.0, + "9": 1814.0, + "10": 1387.0, + "11": 1742.0, + "12": 1649.0, + "13": 1757.0, + "14": 1705.0, + "15": 1827.0, + "16": 1765.0, + "17": 1835.0, + "18": 1602.0, + "19": 1814.0, + "20": 1735.0, + "21": 1895.0, + "22": 1594.0, + "23": 1902.0, + "24": 1633.0, + "25": 1574.0, + "26": 1681.0, + "27": 1676.0, + "28": 1961.0, + "29": 1851.0, + "30": 1863.0, + "31": 1499.0, + "32": 1896.0, + "33": 2118.0, + "34": 1725.0, + "35": 1879.0, + "36": 1880.0, + "37": 2347.0, + "38": 2044.0, + "39": 2283.0, + "40": 2155.0, + "41": 2224.0, + "42": 2169.0, + "43": 1958.0, + "44": 2050.0, + "45": 2130.0, + "46": 2346.0, + "47": 2418.0, + "48": 2243.0, + "49": 2161.0, + "50": 2479.0, + "51": 2480.0, + "52": 2545.0, + "53": 2875.0, + "54": 2652.0, + "55": 2384.0, + "56": 2742.0, + "57": 2201.0, + "58": 2755.0, + "59": 2954.0, + "60": 2367.0, + "61": 2889.0, + "62": 2721.0, + "63": 2438.0, + "64": 2928.0, + "65": 2567.0, + "66": 2751.0, + "67": 2802.0, + "68": 2714.0, + "69": 2884.0, + "70": 3124.0, + "71": 2813.0, + "72": 2504.0, + "73": 2852.0, + "74": 1975.0, + "75": 2429.0, + "76": 2850.0, + "77": 3008.0, + "78": 3110.0, + "79": 3114.0, + "80": 3284.0, + "81": 3574.0, + "82": 3207.0, + "83": 2530.0, + "84": 3169.0, + "85": 3150.0, + "86": 2588.0, + "87": 3845.0, + "88": 3094.0, + "89": 3389.0, + "90": 3077.0, + "91": 2872.0, + "92": 3012.0, + "93": 2685.0, + "94": 3279.0, + "95": 3231.0, + "96": 3422.0, + "97": 3154.0, + "98": 3498.0, + "99": 3043.0, + "100": 3361.0 } }, "mem-allocated-bytes": { @@ -218,106 +218,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 465601024.0, - "2": 465601024.0, - "3": 465601024.0, - "4": 465601024.0, - "5": 465601024.0, - "6": 465601024.0, - "7": 465601024.0, - "8": 465601024.0, - "9": 465601024.0, - "10": 465601024.0, - "11": 465601024.0, - "12": 465601024.0, - "13": 465601024.0, - "14": 465601024.0, - 
"15": 465601024.0, - "16": 465601024.0, - "17": 465601024.0, - "18": 465601024.0, - "19": 465601024.0, - "20": 465601024.0, - "21": 465601024.0, - "22": 465601024.0, - "23": 465601024.0, - "24": 465601024.0, - "25": 465601024.0, - "26": 465601024.0, - "27": 465601024.0, - "28": 465601024.0, - "29": 465601024.0, - "30": 465601024.0, - "31": 465601024.0, - "32": 465601024.0, - "33": 465601024.0, - "34": 465601024.0, - "35": 465601024.0, - "36": 465601024.0, - "37": 465601024.0, - "38": 465601024.0, - "39": 465601024.0, - "40": 465601024.0, - "41": 465601024.0, - "42": 465601024.0, - "43": 465601024.0, - "44": 465601024.0, - "45": 465601024.0, - "46": 465601024.0, - "47": 465601024.0, - "48": 465601024.0, - "49": 465601024.0, - "50": 465601024.0, - "51": 465601024.0, - "52": 465601024.0, - "53": 465601024.0, - "54": 465601024.0, - "55": 465601024.0, - "56": 465601024.0, - "57": 465601024.0, - "58": 465601024.0, - "59": 465601024.0, - "60": 465601024.0, - "61": 465601024.0, - "62": 465601024.0, - "63": 465601024.0, - "64": 465601024.0, - "65": 465601024.0, - "66": 465601024.0, - "67": 465601024.0, - "68": 465601024.0, - "69": 465601024.0, - "70": 465601024.0, - "71": 465601024.0, - "72": 465601024.0, - "73": 465601024.0, - "74": 465601024.0, - "75": 465601024.0, - "76": 465601024.0, - "77": 465601024.0, - "78": 465601024.0, - "79": 465601024.0, - "80": 465601024.0, - "81": 465601024.0, - "82": 465601024.0, - "83": 465601024.0, - "84": 465601024.0, - "85": 465601024.0, - "86": 465601024.0, - "87": 465601024.0, - "88": 465601024.0, - "89": 465601024.0, - "90": 465601024.0, - "91": 465601024.0, - "92": 465601024.0, - "93": 465601024.0, - "94": 465601024.0, - "95": 465601024.0, - "96": 465601024.0, - "97": 465601024.0, - "98": 465601024.0, - "99": 465601024.0, - "100": 465601024.0 + "1": 465207808.0, + "2": 466256384.0, + "3": 466256384.0, + "4": 466256384.0, + "5": 466256384.0, + "6": 466256384.0, + "7": 466256384.0, + "8": 466256384.0, + "9": 466256384.0, + "10": 
466256384.0, + "11": 466256384.0, + "12": 466256384.0, + "13": 466256384.0, + "14": 466256384.0, + "15": 466256384.0, + "16": 466256384.0, + "17": 466256384.0, + "18": 466256384.0, + "19": 466256384.0, + "20": 466256384.0, + "21": 466256384.0, + "22": 466256384.0, + "23": 466256384.0, + "24": 466256384.0, + "25": 466256384.0, + "26": 466256384.0, + "27": 466256384.0, + "28": 466256384.0, + "29": 466256384.0, + "30": 466256384.0, + "31": 466256384.0, + "32": 466256384.0, + "33": 466256384.0, + "34": 466256384.0, + "35": 466256384.0, + "36": 466256384.0, + "37": 466256384.0, + "38": 466256384.0, + "39": 466256384.0, + "40": 466256384.0, + "41": 466256384.0, + "42": 466256384.0, + "43": 466256384.0, + "44": 466256384.0, + "45": 466256384.0, + "46": 466256384.0, + "47": 466256384.0, + "48": 466256384.0, + "49": 466256384.0, + "50": 466256384.0, + "51": 466256384.0, + "52": 466256384.0, + "53": 466256384.0, + "54": 466256384.0, + "55": 466256384.0, + "56": 466256384.0, + "57": 466256384.0, + "58": 466256384.0, + "59": 466256384.0, + "60": 466256384.0, + "61": 466256384.0, + "62": 466256384.0, + "63": 466256384.0, + "64": 466256384.0, + "65": 466256384.0, + "66": 466256384.0, + "67": 466256384.0, + "68": 466256384.0, + "69": 466256384.0, + "70": 466256384.0, + "71": 466256384.0, + "72": 466256384.0, + "73": 466256384.0, + "74": 466256384.0, + "75": 466256384.0, + "76": 466256384.0, + "77": 466256384.0, + "78": 466256384.0, + "79": 466256384.0, + "80": 466256384.0, + "81": 466256384.0, + "82": 466256384.0, + "83": 466256384.0, + "84": 466256384.0, + "85": 466256384.0, + "86": 466256384.0, + "87": 466256384.0, + "88": 466256384.0, + "89": 466256384.0, + "90": 466256384.0, + "91": 466256384.0, + "92": 466256384.0, + "93": 466256384.0, + "94": 466256384.0, + "95": 466256384.0, + "96": 466256384.0, + "97": 466256384.0, + "98": 466256384.0, + "99": 466256384.0, + "100": 466256384.0 } }, "mem-max-allocated-bytes": { @@ -326,105 +326,105 @@ "step_interval": 1, "values": { "1": 
1728999424.0, - "2": 1789405696.0, - "3": 1789405696.0, - "4": 1789405696.0, - "5": 1789405696.0, - "6": 1789405696.0, - "7": 1789405696.0, - "8": 1789405696.0, - "9": 1789405696.0, - "10": 1789405696.0, - "11": 1789405696.0, - "12": 1789405696.0, - "13": 1789405696.0, - "14": 1789405696.0, - "15": 1789405696.0, - "16": 1789405696.0, - "17": 1789405696.0, - "18": 1789405696.0, - "19": 1789405696.0, - "20": 1789405696.0, - "21": 1789405696.0, - "22": 1789405696.0, - "23": 1789405696.0, - "24": 1789405696.0, - "25": 1789405696.0, - "26": 1789405696.0, - "27": 1789405696.0, - "28": 1789405696.0, - "29": 1789405696.0, - "30": 1789405696.0, - "31": 1789405696.0, - "32": 1789405696.0, - "33": 1789405696.0, - "34": 1789405696.0, - "35": 1789405696.0, - "36": 1789405696.0, - "37": 1789405696.0, - "38": 1789405696.0, - "39": 1789405696.0, - "40": 1789405696.0, - "41": 1789405696.0, - "42": 1789405696.0, - "43": 1789405696.0, - "44": 1789405696.0, - "45": 1789405696.0, - "46": 1789405696.0, - "47": 1789405696.0, - "48": 1789405696.0, - "49": 1789405696.0, - "50": 1789405696.0, - "51": 1789405696.0, - "52": 1789405696.0, - "53": 1789405696.0, - "54": 1789405696.0, - "55": 1789405696.0, - "56": 1789405696.0, - "57": 1789405696.0, - "58": 1789405696.0, - "59": 1789405696.0, - "60": 1789405696.0, - "61": 1789405696.0, - "62": 1789405696.0, - "63": 1789405696.0, - "64": 1789405696.0, - "65": 1789405696.0, - "66": 1789405696.0, - "67": 1789405696.0, - "68": 1789405696.0, - "69": 1789405696.0, - "70": 1789405696.0, - "71": 1789405696.0, - "72": 1789405696.0, - "73": 1789405696.0, - "74": 1789405696.0, - "75": 1789405696.0, - "76": 1789405696.0, - "77": 1789405696.0, - "78": 1789405696.0, - "79": 1789405696.0, - "80": 1789405696.0, - "81": 1789405696.0, - "82": 1789405696.0, - "83": 1789405696.0, - "84": 1789405696.0, - "85": 1789405696.0, - "86": 1789405696.0, - "87": 1789405696.0, - "88": 1789405696.0, - "89": 1789405696.0, - "90": 1789405696.0, - "91": 1789405696.0, - "92": 
1789405696.0, - "93": 1789405696.0, - "94": 1789405696.0, - "95": 1789405696.0, - "96": 1789405696.0, - "97": 1789405696.0, - "98": 1789405696.0, - "99": 1789405696.0, - "100": 1789405696.0 + "2": 1789536768.0, + "3": 1789536768.0, + "4": 1789536768.0, + "5": 1789536768.0, + "6": 1789536768.0, + "7": 1789536768.0, + "8": 1789536768.0, + "9": 1789536768.0, + "10": 1789536768.0, + "11": 1789536768.0, + "12": 1789536768.0, + "13": 1789536768.0, + "14": 1789536768.0, + "15": 1789536768.0, + "16": 1789536768.0, + "17": 1789536768.0, + "18": 1789536768.0, + "19": 1789536768.0, + "20": 1789536768.0, + "21": 1789536768.0, + "22": 1789536768.0, + "23": 1789536768.0, + "24": 1789536768.0, + "25": 1789536768.0, + "26": 1789536768.0, + "27": 1789536768.0, + "28": 1789536768.0, + "29": 1789536768.0, + "30": 1789536768.0, + "31": 1789536768.0, + "32": 1789536768.0, + "33": 1789536768.0, + "34": 1789536768.0, + "35": 1789536768.0, + "36": 1789536768.0, + "37": 1789536768.0, + "38": 1789536768.0, + "39": 1789536768.0, + "40": 1789536768.0, + "41": 1789536768.0, + "42": 1789536768.0, + "43": 1789536768.0, + "44": 1789536768.0, + "45": 1789536768.0, + "46": 1789536768.0, + "47": 1789536768.0, + "48": 1789536768.0, + "49": 1789536768.0, + "50": 1789536768.0, + "51": 1789536768.0, + "52": 1789536768.0, + "53": 1789536768.0, + "54": 1789536768.0, + "55": 1789536768.0, + "56": 1789536768.0, + "57": 1789536768.0, + "58": 1789536768.0, + "59": 1789536768.0, + "60": 1789536768.0, + "61": 1789536768.0, + "62": 1789536768.0, + "63": 1789536768.0, + "64": 1789536768.0, + "65": 1789536768.0, + "66": 1789536768.0, + "67": 1789536768.0, + "68": 1789536768.0, + "69": 1789536768.0, + "70": 1789536768.0, + "71": 1789536768.0, + "72": 1789536768.0, + "73": 1789536768.0, + "74": 1789536768.0, + "75": 1789536768.0, + "76": 1789536768.0, + "77": 1789536768.0, + "78": 1789536768.0, + "79": 1789536768.0, + "80": 1789536768.0, + "81": 1789536768.0, + "82": 1789536768.0, + "83": 1789536768.0, + "84": 
1789536768.0, + "85": 1789536768.0, + "86": 1789536768.0, + "87": 1789536768.0, + "88": 1789536768.0, + "89": 1789536768.0, + "90": 1789536768.0, + "91": 1789536768.0, + "92": 1789536768.0, + "93": 1789536768.0, + "94": 1789536768.0, + "95": 1789536768.0, + "96": 1789536768.0, + "97": 1789536768.0, + "98": 1789536768.0, + "99": 1789536768.0, + "100": 1789536768.0 } }, "iteration-time": { @@ -432,106 +432,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 10.87419, - "2": 0.53001, - "3": 0.2186, - "4": 0.25482, - "5": 0.20138, - "6": 0.19379, - "7": 0.79335, - "8": 0.34845, - "9": 0.55178, - "10": 0.41213, - "11": 0.70514, - "12": 0.42183, - "13": 0.79058, - "14": 0.25823, - "15": 0.17847, - "16": 0.17856, - "17": 0.22517, - "18": 0.17747, - "19": 0.2016, - "20": 0.17788, - "21": 0.2366, - "22": 0.17719, - "23": 0.17889, - "24": 0.17909, - "25": 0.23071, - "26": 0.18878, - "27": 0.17959, - "28": 0.17796, - "29": 0.19707, - "30": 0.17868, - "31": 0.23748, - "32": 0.17977, - "33": 0.1776, - "34": 0.17788, - "35": 0.17714, - "36": 0.17848, - "37": 0.17912, - "38": 0.17729, - "39": 0.20194, - "40": 0.5561, - "41": 0.18404, - "42": 0.21996, - "43": 0.1805, - "44": 0.22997, - "45": 0.17843, - "46": 0.17815, - "47": 0.17755, - "48": 0.21932, - "49": 0.17935, - "50": 0.21536, - "51": 0.18927, - "52": 0.17358, - "53": 0.17366, - "54": 0.19577, - "55": 0.17508, - "56": 0.20037, - "57": 0.17429, - "58": 0.2159, - "59": 0.17615, - "60": 0.17613, - "61": 0.17677, - "62": 0.17726, - "63": 0.22918, - "64": 0.17848, - "65": 0.17926, - "66": 0.17835, - "67": 0.17818, - "68": 0.17977, - "69": 0.17935, - "70": 0.17953, - "71": 0.17922, - "72": 0.17845, - "73": 0.19928, - "74": 0.17885, - "75": 0.20547, - "76": 0.2325, - "77": 0.18027, - "78": 0.17887, - "79": 0.18129, - "80": 0.18884, - "81": 0.1894, - "82": 0.18987, - "83": 0.19315, - "84": 0.19155, - "85": 0.19434, - "86": 0.19122, - "87": 0.1931, - "88": 0.19294, - "89": 0.2106, - "90": 0.19136, - "91": 0.19388, - "92": 
0.21142, - "93": 0.19188, - "94": 0.19177, - "95": 0.19125, - "96": 0.1943, - "97": 0.20398, - "98": 0.19536, - "99": 0.19149, - "100": 0.19184 + "1": "nan", + "2": 2.69056, + "3": 0.17999, + "4": 0.16451, + "5": 0.16508, + "6": 0.17235, + "7": 0.16329, + "8": 0.1626, + "9": 0.16188, + "10": 0.16733, + "11": 0.16471, + "12": 0.16323, + "13": 0.16176, + "14": 0.16306, + "15": 0.16415, + "16": 0.16286, + "17": 0.16013, + "18": 0.16147, + "19": 0.17142, + "20": 0.1614, + "21": 0.16056, + "22": 0.16073, + "23": 0.1704, + "24": 0.16109, + "25": 0.16097, + "26": 0.16623, + "27": 0.15978, + "28": 0.17015, + "29": 0.17103, + "30": 0.18177, + "31": 0.18267, + "32": 0.18537, + "33": 0.18546, + "34": 0.18686, + "35": 0.18715, + "36": 0.18598, + "37": 0.18556, + "38": 0.18847, + "39": 0.187, + "40": 0.18548, + "41": 0.19477, + "42": 0.18691, + "43": 0.18628, + "44": 0.18945, + "45": 0.18687, + "46": 0.18766, + "47": 0.18828, + "48": 0.1885, + "49": 0.18744, + "50": 0.18918, + "51": 0.20273, + "52": 0.182, + "53": 0.18, + "54": 0.17575, + "55": 0.17407, + "56": 0.17222, + "57": 0.16988, + "58": 0.17015, + "59": 0.17038, + "60": 0.16865, + "61": 0.16894, + "62": 0.16852, + "63": 0.16574, + "64": 0.16829, + "65": 0.16644, + "66": 0.16896, + "67": 0.16934, + "68": 0.1675, + "69": 0.16535, + "70": 0.16738, + "71": 0.17159, + "72": 0.18394, + "73": 0.18193, + "74": 0.18302, + "75": 0.1832, + "76": 0.18125, + "77": 0.17794, + "78": 0.17778, + "79": 0.17611, + "80": 0.17384, + "81": 0.17173, + "82": 0.16989, + "83": 0.16782, + "84": 0.16781, + "85": 0.16901, + "86": 0.16737, + "87": 0.16701, + "88": 0.16719, + "89": 0.16644, + "90": 0.16551, + "91": 0.16712, + "92": 0.16502, + "93": 0.16672, + "94": 0.1665, + "95": 0.1653, + "96": 0.16686, + "97": 0.16586, + "98": 0.16635, + "99": 0.1655, + "100": 0.16563 } } } \ No newline at end of file diff --git 
a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_dev_dgx_gb200_2nd.json new file mode 100644 index 00000000000..558ad752f07 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_dev_dgx_gb200_2nd.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 9.8498, + "52": 9.74428, + "53": 10.05842, + "54": 9.95129, + "55": 9.88341, + "56": 9.61325, + "57": 9.46897, + "58": 9.82163, + "59": 9.57702, + "60": 9.49787, + "61": 9.69255, + "62": 9.98598, + "63": 9.37403, + "64": 9.76601, + "65": 8.94652, + "66": 9.70103, + "67": 9.36369, + "68": 9.7824, + "69": 9.79882, + "70": 9.73168, + "71": 9.6251, + "72": 9.58313, + "73": 9.4882, + "74": 8.92611, + "75": 9.40723, + "76": 9.07704, + "77": 10.05859, + "78": 9.72209, + "79": 9.37661, + "80": 9.40273, + "81": 9.48205, + "82": 9.69955, + "83": 9.31352, + "84": 9.41732, + "85": 9.61583, + "86": 9.07429, + "87": 9.59556, + "88": 9.75065, + 
"89": 9.60041, + "90": 9.82204, + "91": 9.33875, + "92": 9.35776, + "93": 9.08668, + "94": 8.82962, + "95": 9.52594, + "96": 9.52969, + "97": 9.30331, + "98": 9.67138, + "99": 8.89538, + "100": 9.40569 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 2514.0, + "52": 2584.0, + "53": 3025.0, + "54": 2653.0, + "55": 2268.0, + "56": 2637.0, + "57": 2225.0, + "58": 2904.0, + "59": 2970.0, + "60": 2399.0, + "61": 2925.0, + "62": 2639.0, + "63": 2383.0, + "64": 2889.0, + "65": 2675.0, + "66": 2992.0, + "67": 2764.0, + "68": 2725.0, + "69": 2865.0, + "70": 3077.0, + "71": 2923.0, + "72": 2414.0, + "73": 2906.0, + "74": 1947.0, + "75": 2449.0, + "76": 2976.0, + "77": 3163.0, + "78": 3186.0, + "79": 3172.0, + "80": 3344.0, + "81": 3625.0, + "82": 3289.0, + "83": 2699.0, + "84": 3102.0, + "85": 3227.0, + "86": 2754.0, + "87": 3714.0, + "88": 3004.0, + "89": 3321.0, + "90": 3134.0, + "91": 2714.0, + "92": 3077.0, + "93": 2631.0, + "94": 3309.0, + "95": 3226.0, + "96": 3473.0, + "97": 3216.0, + "98": 3581.0, + "99": 3061.0, + "100": 3419.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + 
"4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 465864192.0, + "52": 465864192.0, + "53": 465864192.0, + "54": 465864192.0, + "55": 465864192.0, + "56": 465864192.0, + "57": 465864192.0, + "58": 465864192.0, + "59": 465864192.0, + "60": 465864192.0, + "61": 465864192.0, + "62": 465864192.0, + "63": 465864192.0, + "64": 465864192.0, + "65": 465864192.0, + "66": 465864192.0, + "67": 465864192.0, + "68": 465864192.0, + "69": 465864192.0, + "70": 465864192.0, + "71": 465864192.0, + "72": 465864192.0, + "73": 465864192.0, + "74": 465864192.0, + "75": 465864192.0, + "76": 465864192.0, + "77": 465864192.0, + "78": 465864192.0, + "79": 465864192.0, + "80": 465864192.0, + "81": 465864192.0, + "82": 465864192.0, + "83": 465864192.0, + "84": 465864192.0, + "85": 465864192.0, + "86": 465864192.0, + "87": 465864192.0, + "88": 465864192.0, + "89": 465864192.0, + "90": 465864192.0, + "91": 465864192.0, + "92": 465864192.0, + "93": 465864192.0, + "94": 465864192.0, + "95": 465864192.0, + "96": 465864192.0, + "97": 465864192.0, + "98": 465864192.0, + "99": 465864192.0, + "100": 465864192.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": 
"nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 1911693312.0, + "52": 1911693312.0, + "53": 1911693312.0, + "54": 1911693312.0, + "55": 1911693312.0, + "56": 1911693312.0, + "57": 1911693312.0, + "58": 1911693312.0, + "59": 1911693312.0, + "60": 1911693312.0, + "61": 1911693312.0, + "62": 1911693312.0, + "63": 1911693312.0, + "64": 1911693312.0, + "65": 1911693312.0, + "66": 1911693312.0, + "67": 1911693312.0, + "68": 1911693312.0, + "69": 1911693312.0, + "70": 1911693312.0, + "71": 1911693312.0, + "72": 1911693312.0, + "73": 1911693312.0, + "74": 1911693312.0, + "75": 1911693312.0, + "76": 1911693312.0, + "77": 1911693312.0, + "78": 1911693312.0, + "79": 1911693312.0, + "80": 1911693312.0, + "81": 1911693312.0, + "82": 1911693312.0, + "83": 1911693312.0, + "84": 1911693312.0, + "85": 1911693312.0, + "86": 1911693312.0, + "87": 1911693312.0, + "88": 1911693312.0, + "89": 1911693312.0, + "90": 1911693312.0, + "91": 1911693312.0, + "92": 1911693312.0, + "93": 1911693312.0, + "94": 1911693312.0, + "95": 1911693312.0, + "96": 1911693312.0, + "97": 1911693312.0, + "98": 1911693312.0, + "99": 1911693312.0, + "100": 1911693312.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + 
"14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": "nan", + "52": 2.81421, + "53": 0.16853, + "54": 0.15862, + "55": 0.15827, + "56": 0.15923, + "57": 0.15884, + "58": 0.15972, + "59": 0.15955, + "60": 0.16137, + "61": 0.15962, + "62": 0.16098, + "63": 0.15948, + "64": 0.15791, + "65": 0.15969, + "66": 0.15933, + "67": 0.17128, + "68": 0.15958, + "69": 0.16526, + "70": 0.15854, + "71": 0.16076, + "72": 0.15949, + "73": 0.1598, + "74": 0.15944, + "75": 0.15956, + "76": 0.1605, + "77": 0.15954, + "78": 0.15934, + "79": 0.16153, + "80": 0.16883, + "81": 0.16008, + "82": 0.16051, + "83": 0.16043, + "84": 0.16049, + "85": 0.16138, + "86": 0.16025, + "87": 0.16089, + "88": 0.15937, + "89": 0.16098, + "90": 0.16047, + "91": 0.16142, + "92": 0.1613, + "93": 0.16027, + "94": 0.16427, + "95": 0.16157, + "96": 0.16144, + "97": 0.16147, + "98": 0.16068, + "99": 0.16024, + "100": 0.15949 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_dev_dgx_gb200.json index a321d71dac5..27eb21de0f0 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_dev_dgx_gb200.json +++ 
b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_dev_dgx_gb200.json @@ -432,106 +432,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 7.46014, - "2": 0.22036, - "3": 0.24678, - "4": 0.1906, - "5": 0.23432, - "6": 0.19337, - "7": 0.20952, - "8": 0.20857, - "9": 0.20711, - "10": 0.21582, - "11": 0.20302, - "12": 0.23361, - "13": 0.21601, - "14": 0.19637, - "15": 0.19187, - "16": 0.19595, - "17": 0.19262, - "18": 0.25658, - "19": 0.19382, - "20": 0.23562, - "21": 0.19141, - "22": 0.19045, - "23": 0.25041, - "24": 0.19507, - "25": 0.19119, - "26": 0.25125, - "27": 0.24158, - "28": 0.19174, - "29": 0.19271, - "30": 0.19107, - "31": 0.20992, - "32": 0.19656, - "33": 0.22065, - "34": 0.24506, - "35": 0.26305, - "36": 0.19488, - "37": 0.21539, - "38": 0.19008, - "39": 0.45338, - "40": 0.19345, - "41": 0.19327, - "42": 0.19025, - "43": 0.2339, - "44": 0.19531, - "45": 0.19303, - "46": 0.22612, - "47": 0.19173, - "48": 0.22577, - "49": 0.19067, - "50": 0.23575, - "51": 0.24917, - "52": 0.22723, - "53": 0.22561, - "54": 0.22604, - "55": 0.22405, - "56": 0.22789, - "57": 0.22456, - "58": 0.23947, - "59": 0.24294, - "60": 0.22777, - "61": 0.22508, - "62": 0.2306, - "63": 0.23205, - "64": 0.23143, - "65": 0.23321, - "66": 0.23216, - "67": 0.23316, - "68": 0.23149, - "69": 0.23283, - "70": 0.22854, - "71": 0.24333, - "72": 0.23197, - "73": 0.22937, - "74": 0.23068, - "75": 0.2279, - "76": 0.22968, - "77": 0.25609, - "78": 0.25409, - "79": 0.25184, - "80": 0.22949, - "81": 0.22763, - "82": 0.22592, - "83": 0.22813, - "84": 0.22963, - "85": 0.23411, - "86": 0.22821, - "87": 0.23117, - "88": 0.23326, - "89": 0.22984, - "90": 0.22828, - "91": 0.23148, - "92": 0.23378, - "93": 0.23729, - "94": 0.23173, - "95": 0.23146, - "96": 0.23193, - "97": 0.23076, - "98": 0.33615, - "99": 0.23042, - "100": 0.25353 + "1": "nan", + "2": 2.6815, + "3": 0.23582, + "4": 0.21969, + "5": 0.22399, + "6": 0.21848, + "7": 
0.21944, + "8": 0.21989, + "9": 0.22542, + "10": 0.22685, + "11": 0.22859, + "12": 0.22734, + "13": 0.22735, + "14": 0.22682, + "15": 0.22731, + "16": 0.22724, + "17": 0.22774, + "18": 0.2253, + "19": 0.21338, + "20": 0.21612, + "21": 0.22487, + "22": 0.2609, + "23": 0.34495, + "24": 0.40538, + "25": 0.27265, + "26": 0.22852, + "27": 0.23498, + "28": 0.23458, + "29": 0.2356, + "30": 0.23223, + "31": 0.23427, + "32": 0.23193, + "33": 0.23007, + "34": 0.22762, + "35": 0.22604, + "36": 0.22153, + "37": 0.21923, + "38": 0.21718, + "39": 0.2162, + "40": 0.21653, + "41": 0.21673, + "42": 0.21416, + "43": 0.21439, + "44": 0.2141, + "45": 0.21364, + "46": 0.21263, + "47": 0.2139, + "48": 0.21445, + "49": 0.21424, + "50": 0.21381, + "51": 0.21544, + "52": 0.21075, + "53": 0.21292, + "54": 0.21407, + "55": 0.2167, + "56": 0.21877, + "57": 0.21861, + "58": 0.22087, + "59": 0.21999, + "60": 0.21884, + "61": 0.21841, + "62": 0.21988, + "63": 0.21876, + "64": 0.21811, + "65": 0.21795, + "66": 0.2197, + "67": 0.22005, + "68": 0.21994, + "69": 0.21937, + "70": 0.21964, + "71": 0.22007, + "72": 0.221, + "73": 0.22145, + "74": 0.22069, + "75": 0.22126, + "76": 0.21984, + "77": 0.22096, + "78": 0.2231, + "79": 0.22168, + "80": 0.21932, + "81": 0.21748, + "82": 0.21971, + "83": 0.22113, + "84": 0.22096, + "85": 0.22316, + "86": 0.22043, + "87": 0.22198, + "88": 0.2247, + "89": 0.2219, + "90": 0.22258, + "91": 0.22224, + "92": 0.22132, + "93": 0.22182, + "94": 0.22397, + "95": 0.22547, + "96": 0.22177, + "97": 0.22282, + "98": 0.22255, + "99": 0.22417, + "100": 0.22334 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_dev_dgx_gb200_2nd.json new file mode 100644 index 00000000000..0889d8315f2 --- /dev/null +++ 
b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_dev_dgx_gb200_2nd.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 9.84743, + "52": 9.74209, + "53": 10.05697, + "54": 9.9505, + "55": 9.88145, + "56": 9.61274, + "57": 9.4687, + "58": 9.82193, + "59": 9.57642, + "60": 9.49762, + "61": 9.69189, + "62": 9.9867, + "63": 9.37512, + "64": 9.76679, + "65": 8.94648, + "66": 9.7023, + "67": 9.36326, + "68": 9.7831, + "69": 9.7986, + "70": 9.7317, + "71": 9.62571, + "72": 9.58488, + "73": 9.48967, + "74": 8.9286, + "75": 9.40862, + "76": 9.07925, + "77": 10.0594, + "78": 9.72288, + "79": 9.37784, + "80": 9.40429, + "81": 9.48309, + "82": 9.7004, + "83": 9.31595, + "84": 9.41838, + "85": 9.61685, + "86": 9.07533, + "87": 9.59616, + "88": 9.75215, + "89": 9.60184, + "90": 9.82281, + "91": 9.34037, + "92": 9.35854, + "93": 9.08805, + "94": 8.83037, + "95": 9.5266, + "96": 9.53049, + "97": 9.30389, + "98": 9.67196, + "99": 8.89637, + "100": 9.40644 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": 
"nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 2491.0, + "52": 2552.0, + "53": 2980.0, + "54": 2680.0, + "55": 2274.0, + "56": 2734.0, + "57": 2319.0, + "58": 2907.0, + "59": 2886.0, + "60": 2566.0, + "61": 2855.0, + "62": 2704.0, + "63": 2370.0, + "64": 2998.0, + "65": 2563.0, + "66": 2868.0, + "67": 2762.0, + "68": 2739.0, + "69": 2730.0, + "70": 3156.0, + "71": 2803.0, + "72": 2506.0, + "73": 2896.0, + "74": 1937.0, + "75": 2450.0, + "76": 2794.0, + "77": 3047.0, + "78": 3104.0, + "79": 3069.0, + "80": 3286.0, + "81": 3543.0, + "82": 3192.0, + "83": 2614.0, + "84": 3273.0, + "85": 3111.0, + "86": 2680.0, + "87": 3654.0, + "88": 3117.0, + "89": 3351.0, + "90": 3086.0, + "91": 2721.0, + "92": 3045.0, + "93": 2672.0, + "94": 3326.0, + "95": 3125.0, + "96": 3309.0, + "97": 3208.0, + "98": 3572.0, + "99": 2980.0, + "100": 3355.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", 
+ "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 515933696.0, + "52": 515933696.0, + "53": 515933696.0, + "54": 515933696.0, + "55": 515933696.0, + "56": 515933696.0, + "57": 515933696.0, + "58": 515933696.0, + "59": 515933696.0, + "60": 515933696.0, + "61": 515933696.0, + "62": 515933696.0, + "63": 515933696.0, + "64": 515933696.0, + "65": 515933696.0, + "66": 515933696.0, + "67": 515933696.0, + "68": 515933696.0, + "69": 515933696.0, + "70": 515933696.0, + "71": 515933696.0, + "72": 515933696.0, + "73": 515933696.0, + "74": 515933696.0, + "75": 515933696.0, + "76": 515933696.0, + "77": 515933696.0, + "78": 515933696.0, + "79": 515933696.0, + "80": 515933696.0, + "81": 515933696.0, + "82": 515933696.0, + "83": 515933696.0, + "84": 515933696.0, + "85": 515933696.0, + "86": 515933696.0, + "87": 515933696.0, + "88": 515933696.0, + "89": 515933696.0, + "90": 515933696.0, + "91": 515933696.0, + "92": 515933696.0, + "93": 515933696.0, + "94": 515933696.0, + "95": 515933696.0, + "96": 515933696.0, + "97": 515933696.0, + "98": 515933696.0, + "99": 515933696.0, + "100": 515933696.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + 
"35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 1440754176.0, + "52": 1440755200.0, + "53": 1440755200.0, + "54": 1440755200.0, + "55": 1440755200.0, + "56": 1440755200.0, + "57": 1440755200.0, + "58": 1440755200.0, + "59": 1440755200.0, + "60": 1440755200.0, + "61": 1440755200.0, + "62": 1440755200.0, + "63": 1440755200.0, + "64": 1440755200.0, + "65": 1440755200.0, + "66": 1440755200.0, + "67": 1440755200.0, + "68": 1440755200.0, + "69": 1440755200.0, + "70": 1440755200.0, + "71": 1440755200.0, + "72": 1440755200.0, + "73": 1440755200.0, + "74": 1440755200.0, + "75": 1440755200.0, + "76": 1440755200.0, + "77": 1440755200.0, + "78": 1440755200.0, + "79": 1440755200.0, + "80": 1440755200.0, + "81": 1440755200.0, + "82": 1440755200.0, + "83": 1440755200.0, + "84": 1440755200.0, + "85": 1440755200.0, + "86": 1440755200.0, + "87": 1440755200.0, + "88": 1440755200.0, + "89": 1440755200.0, + "90": 1440755200.0, + "91": 1440755200.0, + "92": 1440755200.0, + "93": 1440755200.0, + "94": 1440755200.0, + "95": 1440755200.0, + "96": 1440755200.0, + "97": 1440755200.0, + "98": 1440755200.0, + "99": 1440755200.0, + "100": 1440755200.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": 
"nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": "nan", + "52": 2.76016, + "53": 0.23476, + "54": 0.223, + "55": 0.22429, + "56": 0.21901, + "57": 0.22278, + "58": 0.22163, + "59": 0.22214, + "60": 0.22201, + "61": 0.2247, + "62": 0.22616, + "63": 0.22396, + "64": 0.23047, + "65": 0.23737, + "66": 0.24455, + "67": 0.23295, + "68": 0.22857, + "69": 0.22662, + "70": 0.22814, + "71": 0.2322, + "72": 0.233, + "73": 0.22777, + "74": 0.22898, + "75": 0.23307, + "76": 0.23163, + "77": 0.23205, + "78": 0.23196, + "79": 0.2324, + "80": 0.23104, + "81": 0.23192, + "82": 0.23206, + "83": 0.22902, + "84": 0.23961, + "85": 0.24378, + "86": 0.24255, + "87": 0.24283, + "88": 0.24429, + "89": 0.24795, + "90": 0.2492, + "91": 0.2493, + "92": 0.24516, + "93": 0.24543, + "94": 0.23595, + "95": 0.23484, + "96": 0.23416, + "97": 0.24493, + "98": 0.24676, + "99": 0.24195, + "100": 0.2459 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_dev_dgx_gb200.json index 49fb0cee006..ff9932eb6ca 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_dev_dgx_gb200.json @@ -432,106 +432,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 6.0946, - "2": 0.23434, - "3": 0.25974, - "4": 0.19572, - "5": 0.19385, - "6": 0.23205, - "7": 0.7704, - "8": 0.19849, - "9": 0.1899, - "10": 0.19145, - "11": 0.22929, - "12": 0.19296, - "13": 0.18887, - "14": 0.18975, - "15": 
0.19107, - "16": 0.18736, - "17": 0.18574, - "18": 0.22677, - "19": 0.19016, - "20": 0.20891, - "21": 0.18795, - "22": 0.18702, - "23": 0.18879, - "24": 0.23626, - "25": 0.18708, - "26": 0.21783, - "27": 0.3498, - "28": 0.18687, - "29": 0.20508, - "30": 0.1874, - "31": 0.27079, - "32": 0.19016, - "33": 0.18984, - "34": 0.18963, - "35": 0.25952, - "36": 0.21489, - "37": 0.20358, - "38": 0.20254, - "39": 0.2039, - "40": 0.20108, - "41": 0.18536, - "42": 0.18627, - "43": 0.22134, - "44": 0.19018, - "45": 0.18634, - "46": 0.18446, - "47": 0.19975, - "48": 0.18759, - "49": 0.18704, - "50": 0.18617, - "51": 0.20108, - "52": 0.18371, - "53": 0.18371, - "54": 0.18409, - "55": 0.18492, - "56": 0.18608, - "57": 0.33035, - "58": 0.18444, - "59": 0.18479, - "60": 0.2007, - "61": 0.18737, - "62": 0.54423, - "63": 0.18739, - "64": 0.18756, - "65": 0.22855, - "66": 0.1889, - "67": 0.18728, - "68": 0.18737, - "69": 0.1863, - "70": 0.18731, - "71": 0.22911, - "72": 0.18493, - "73": 0.1846, - "74": 0.1919, - "75": 0.21803, - "76": 0.36578, - "77": 0.22572, - "78": 0.20057, - "79": 0.18852, - "80": 0.53951, - "81": 0.42214, - "82": 0.18567, - "83": 0.18702, - "84": 0.1856, - "85": 0.18727, - "86": 0.18505, - "87": 0.18506, - "88": 0.22119, - "89": 0.22551, - "90": 0.18825, - "91": 0.18812, - "92": 0.18805, - "93": 0.18696, - "94": 0.18716, - "95": 0.18779, - "96": 0.41477, - "97": 0.18674, - "98": 0.20738, - "99": 0.18625, - "100": 0.21802 + "1": "nan", + "2": 2.61029, + "3": 0.19483, + "4": 0.18062, + "5": 0.18259, + "6": 0.18221, + "7": 0.18364, + "8": 0.18219, + "9": 0.18127, + "10": 0.18188, + "11": 0.18181, + "12": 0.1831, + "13": 0.18466, + "14": 0.18253, + "15": 0.18081, + "16": 0.18182, + "17": 0.18252, + "18": 0.18383, + "19": 0.18271, + "20": 0.18292, + "21": 0.18157, + "22": 0.18322, + "23": 0.18379, + "24": 0.18403, + "25": 0.18149, + "26": 0.18219, + "27": 0.18279, + "28": 0.18152, + "29": 0.18195, + "30": 0.18138, + "31": 0.18264, + "32": 0.18261, + "33": 0.18358, + 
"34": 0.18161, + "35": 0.18269, + "36": 0.18132, + "37": 0.18252, + "38": 0.18362, + "39": 0.18411, + "40": 0.18423, + "41": 0.1835, + "42": 0.18364, + "43": 0.18373, + "44": 0.18306, + "45": 0.18336, + "46": 0.18278, + "47": 0.18327, + "48": 0.18332, + "49": 0.18271, + "50": 0.18318, + "51": 0.19806, + "52": 0.1799, + "53": 0.18156, + "54": 0.1807, + "55": 0.17954, + "56": 0.18001, + "57": 0.18039, + "58": 0.181, + "59": 0.18041, + "60": 0.17989, + "61": 0.18137, + "62": 0.18121, + "63": 0.18088, + "64": 0.1801, + "65": 0.18077, + "66": 0.18006, + "67": 0.18112, + "68": 0.18089, + "69": 0.18124, + "70": 0.17966, + "71": 0.18084, + "72": 0.18137, + "73": 0.18132, + "74": 0.18078, + "75": 0.18129, + "76": 0.18079, + "77": 0.1816, + "78": 0.18161, + "79": 0.18227, + "80": 0.18173, + "81": 0.18145, + "82": 0.18143, + "83": 0.18128, + "84": 0.18207, + "85": 0.18121, + "86": 0.18062, + "87": 0.17981, + "88": 0.18098, + "89": 0.18014, + "90": 0.17967, + "91": 0.18153, + "92": 0.18175, + "93": 0.18107, + "94": 0.17803, + "95": 0.17796, + "96": 0.17757, + "97": 0.17815, + "98": 0.17979, + "99": 0.18056, + "100": 0.18044 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_dev_dgx_gb200_2nd.json new file mode 100644 index 00000000000..5a291771fe9 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_dev_dgx_gb200_2nd.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + 
"15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 9.8474, + "52": 9.74209, + "53": 10.05695, + "54": 9.95048, + "55": 9.88137, + "56": 9.61274, + "57": 9.46865, + "58": 9.82191, + "59": 9.57642, + "60": 9.49763, + "61": 9.6919, + "62": 9.98672, + "63": 9.37511, + "64": 9.76682, + "65": 8.94645, + "66": 9.70228, + "67": 9.36325, + "68": 9.78311, + "69": 9.79861, + "70": 9.73171, + "71": 9.62575, + "72": 9.58482, + "73": 9.48964, + "74": 8.92857, + "75": 9.40863, + "76": 9.07924, + "77": 10.05936, + "78": 9.72284, + "79": 9.37782, + "80": 9.40428, + "81": 9.48314, + "82": 9.70039, + "83": 9.31593, + "84": 9.41835, + "85": 9.61687, + "86": 9.07538, + "87": 9.59618, + "88": 9.75215, + "89": 9.60188, + "90": 9.82284, + "91": 9.34035, + "92": 9.35853, + "93": 9.08806, + "94": 8.83039, + "95": 9.5266, + "96": 9.53046, + "97": 9.30391, + "98": 9.67197, + "99": 8.89638, + "100": 9.40645 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": 
"nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 71.0, + "52": 53.0, + "53": 77.0, + "54": 92.0, + "55": 67.0, + "56": 92.0, + "57": 86.0, + "58": 79.0, + "59": 74.0, + "60": 70.0, + "61": 98.0, + "62": 71.0, + "63": 64.0, + "64": 83.0, + "65": 89.0, + "66": 86.0, + "67": 62.0, + "68": 67.0, + "69": 57.0, + "70": 90.0, + "71": 66.0, + "72": 61.0, + "73": 76.0, + "74": 52.0, + "75": 63.0, + "76": 78.0, + "77": 78.0, + "78": 87.0, + "79": 83.0, + "80": 77.0, + "81": 102.0, + "82": 74.0, + "83": 67.0, + "84": 68.0, + "85": 96.0, + "86": 89.0, + "87": 92.0, + "88": 81.0, + "89": 47.0, + "90": 76.0, + "91": 70.0, + "92": 82.0, + "93": 58.0, + "94": 76.0, + "95": 71.0, + "96": 92.0, + "97": 67.0, + "98": 88.0, + "99": 66.0, + "100": 69.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 544900608.0, + "52": 544900608.0, + "53": 544900608.0, + "54": 544900608.0, + "55": 544900608.0, + "56": 544900608.0, + "57": 544900608.0, + "58": 544900608.0, + "59": 544900608.0, + "60": 544900608.0, + 
"61": 544900608.0, + "62": 544900608.0, + "63": 544900608.0, + "64": 544900608.0, + "65": 544900608.0, + "66": 544900608.0, + "67": 544900608.0, + "68": 544900608.0, + "69": 544900608.0, + "70": 544900608.0, + "71": 544900608.0, + "72": 544900608.0, + "73": 544900608.0, + "74": 544900608.0, + "75": 544900608.0, + "76": 544900608.0, + "77": 544900608.0, + "78": 544900608.0, + "79": 544900608.0, + "80": 544900608.0, + "81": 544900608.0, + "82": 544900608.0, + "83": 544900608.0, + "84": 544900608.0, + "85": 544900608.0, + "86": 544900608.0, + "87": 544900608.0, + "88": 544900608.0, + "89": 544900608.0, + "90": 544900608.0, + "91": 544900608.0, + "92": 544900608.0, + "93": 544900608.0, + "94": 544900608.0, + "95": 544900608.0, + "96": 544900608.0, + "97": 544900608.0, + "98": 544900608.0, + "99": 544900608.0, + "100": 544900608.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 1909074432.0, + "52": 1909075456.0, + "53": 1909075456.0, + "54": 1909075456.0, + "55": 1909075456.0, + "56": 1909075456.0, + "57": 1909075456.0, + "58": 1909075456.0, + "59": 1909075456.0, + "60": 1909075456.0, + "61": 1909075456.0, + "62": 1909075456.0, + "63": 1909075456.0, + "64": 
1909075456.0, + "65": 1909075456.0, + "66": 1909075456.0, + "67": 1909075456.0, + "68": 1909075456.0, + "69": 1909075456.0, + "70": 1909075456.0, + "71": 1909075456.0, + "72": 1909075456.0, + "73": 1909075456.0, + "74": 1909075456.0, + "75": 1909075456.0, + "76": 1909075456.0, + "77": 1909075456.0, + "78": 1909075456.0, + "79": 1909075456.0, + "80": 1909075456.0, + "81": 1909075456.0, + "82": 1909075456.0, + "83": 1909075456.0, + "84": 1909075456.0, + "85": 1909075456.0, + "86": 1909075456.0, + "87": 1909075456.0, + "88": 1909075456.0, + "89": 1909075456.0, + "90": 1909075456.0, + "91": 1909075456.0, + "92": 1909075456.0, + "93": 1909075456.0, + "94": 1909075456.0, + "95": 1909075456.0, + "96": 1909075456.0, + "97": 1909075456.0, + "98": 1909075456.0, + "99": 1909075456.0, + "100": 1909075456.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": "nan", + "52": 2.55843, + "53": 0.19589, + "54": 0.18471, + "55": 0.18349, + "56": 0.18435, + "57": 0.18418, + "58": 0.18539, + "59": 0.18565, + "60": 0.18678, + "61": 0.18442, + "62": 0.18318, + "63": 0.18314, + "64": 0.18315, + "65": 0.18517, + "66": 0.18507, + "67": 0.18538, + "68": 0.18295, + "69": 0.18459, + "70": 0.18533, + "71": 
0.1857, + "72": 0.1861, + "73": 0.18507, + "74": 0.18454, + "75": 0.18322, + "76": 0.18308, + "77": 0.18278, + "78": 0.18413, + "79": 0.18228, + "80": 0.18383, + "81": 0.18491, + "82": 0.18405, + "83": 0.18374, + "84": 0.18428, + "85": 0.18358, + "86": 0.18433, + "87": 0.18542, + "88": 0.18544, + "89": 0.1847, + "90": 0.18536, + "91": 0.18553, + "92": 0.18571, + "93": 0.18611, + "94": 0.18506, + "95": 0.18462, + "96": 0.18458, + "97": 0.18459, + "98": 0.18525, + "99": 0.18232, + "100": 0.18404 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_dev_dgx_gb200.json index 8325c3b9e5b..e00e8181bf7 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_dev_dgx_gb200.json @@ -7,103 +7,103 @@ "1": 10.86836, "2": 10.88595, "3": 10.86559, - "4": 10.86889, - "5": 10.87417, - "6": 10.8906, - "7": 10.87677, - "8": 10.86475, - "9": 10.88232, + "4": 10.8689, + "5": 10.8742, + "6": 10.89059, + "7": 10.87676, + "8": 10.86476, + "9": 10.88236, "10": 10.84582, - "11": 10.87162, + "11": 10.87163, "12": 10.87422, "13": 10.88163, - "14": 10.88889, - "15": 10.83931, - "16": 10.82496, - "17": 10.80147, - "18": 10.81234, - "19": 10.82152, - "20": 10.71933, - "21": 10.69091, + "14": 10.88884, + "15": 10.83936, + "16": 10.82495, + "17": 10.80144, + "18": 10.81238, + "19": 10.82157, + "20": 10.71932, + "21": 10.69086, "22": 10.57426, "23": 10.71097, "24": 10.5978, - "25": 10.5556, - "26": 10.61522, - "27": 10.60451, - "28": 10.56484, - "29": 10.58476, - "30": 10.35944, - "31": 10.12157, + "25": 10.55559, + "26": 10.61521, + "27": 10.6045, + "28": 10.56482, + 
"29": 10.58474, + "30": 10.35947, + "31": 10.12154, "32": 10.45234, - "33": 10.45725, - "34": 10.21989, + "33": 10.45723, + "34": 10.21986, "35": 10.26445, - "36": 10.21036, - "37": 10.33952, - "38": 10.18015, - "39": 10.39589, - "40": 10.06631, + "36": 10.21034, + "37": 10.33954, + "38": 10.18014, + "39": 10.39592, + "40": 10.06632, "41": 10.14164, - "42": 10.20853, - "43": 9.83127, - "44": 9.94861, - "45": 9.82847, - "46": 9.8046, - "47": 10.14233, - "48": 9.84459, - "49": 9.52195, - "50": 9.88603, - "51": 9.84982, - "52": 9.74428, - "53": 10.05844, - "54": 9.95125, + "42": 10.20846, + "43": 9.83129, + "44": 9.94859, + "45": 9.82846, + "46": 9.80459, + "47": 10.1423, + "48": 9.84463, + "49": 9.52196, + "50": 9.88602, + "51": 9.84981, + "52": 9.74433, + "53": 10.05841, + "54": 9.95128, "55": 9.88345, - "56": 9.61327, - "57": 9.469, - "58": 9.82161, - "59": 9.57703, - "60": 9.49786, + "56": 9.61328, + "57": 9.46898, + "58": 9.82164, + "59": 9.577, + "60": 9.49788, "61": 9.69254, - "62": 9.98597, - "63": 9.37405, - "64": 9.76601, - "65": 8.94654, - "66": 9.70099, + "62": 9.98596, + "63": 9.37406, + "64": 9.76602, + "65": 8.94652, + "66": 9.70103, "67": 9.36368, - "68": 9.7824, - "69": 9.7988, - "70": 9.73166, - "71": 9.62509, - "72": 9.58308, - "73": 9.48821, - "74": 8.92607, - "75": 9.40719, - "76": 9.07708, - "77": 10.05856, - "78": 9.72208, - "79": 9.37661, - "80": 9.40273, + "68": 9.78239, + "69": 9.79883, + "70": 9.73167, + "71": 9.62508, + "72": 9.58312, + "73": 9.4882, + "74": 8.92612, + "75": 9.40726, + "76": 9.07709, + "77": 10.05858, + "78": 9.72206, + "79": 9.37662, + "80": 9.40272, "81": 9.48208, - "82": 9.69949, - "83": 9.31353, + "82": 9.69955, + "83": 9.31357, "84": 9.41731, - "85": 9.61581, - "86": 9.07429, + "85": 9.61585, + "86": 9.0743, "87": 9.59556, "88": 9.75063, - "89": 9.60041, - "90": 9.82207, - "91": 9.33877, - "92": 9.35776, - "93": 9.0867, - "94": 8.8296, - "95": 9.52595, - "96": 9.52972, + "89": 9.60037, + "90": 9.82206, + "91": 9.33875, 
+ "92": 9.3578, + "93": 9.08666, + "94": 8.82958, + "95": 9.52592, + "96": 9.52973, "97": 9.30331, - "98": 9.67136, - "99": 8.89539, - "100": 9.40568 + "98": 9.67138, + "99": 8.89537, + "100": 9.40567 } }, "num-zeros": { @@ -111,106 +111,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 1595.0, - "2": 1608.0, - "3": 1639.0, - "4": 1667.0, - "5": 1811.0, - "6": 1793.0, - "7": 1871.0, - "8": 1579.0, - "9": 1850.0, - "10": 1352.0, - "11": 1853.0, - "12": 1662.0, - "13": 1705.0, - "14": 1659.0, - "15": 1812.0, - "16": 1720.0, - "17": 1682.0, - "18": 1583.0, - "19": 1728.0, - "20": 1655.0, - "21": 1978.0, - "22": 1627.0, - "23": 1863.0, - "24": 1654.0, - "25": 1514.0, - "26": 1697.0, - "27": 1653.0, - "28": 1949.0, - "29": 1931.0, - "30": 1896.0, - "31": 1522.0, - "32": 1915.0, - "33": 2134.0, - "34": 1700.0, - "35": 1860.0, - "36": 1880.0, - "37": 2310.0, - "38": 2101.0, - "39": 2417.0, - "40": 2076.0, - "41": 2319.0, - "42": 2199.0, - "43": 1874.0, - "44": 2080.0, - "45": 1980.0, - "46": 2302.0, - "47": 2470.0, - "48": 2202.0, - "49": 2280.0, - "50": 2439.0, - "51": 2490.0, - "52": 2545.0, - "53": 2999.0, - "54": 2565.0, - "55": 2285.0, - "56": 2699.0, - "57": 2189.0, - "58": 2878.0, - "59": 2978.0, - "60": 2478.0, - "61": 2815.0, - "62": 2666.0, - "63": 2512.0, - "64": 2966.0, - "65": 2533.0, - "66": 2865.0, - "67": 2741.0, - "68": 2760.0, - "69": 2810.0, - "70": 3115.0, - "71": 2918.0, - "72": 2413.0, - "73": 2837.0, - "74": 1901.0, - "75": 2387.0, - "76": 2899.0, - "77": 3019.0, - "78": 3233.0, - "79": 3193.0, - "80": 3288.0, - "81": 3397.0, - "82": 3181.0, - "83": 2672.0, - "84": 3163.0, - "85": 3128.0, - "86": 2647.0, - "87": 3754.0, - "88": 3098.0, - "89": 3372.0, - "90": 2966.0, - "91": 2776.0, - "92": 2983.0, - "93": 2767.0, - "94": 3263.0, - "95": 3238.0, - "96": 3471.0, - "97": 3231.0, - "98": 3528.0, - "99": 3090.0, - "100": 3319.0 + "1": 1621.0, + "2": 1581.0, + "3": 1660.0, + "4": 1639.0, + "5": 1858.0, + "6": 1746.0, + "7": 1789.0, + "8": 
1599.0, + "9": 1866.0, + "10": 1400.0, + "11": 1838.0, + "12": 1702.0, + "13": 1844.0, + "14": 1707.0, + "15": 1824.0, + "16": 1828.0, + "17": 1810.0, + "18": 1568.0, + "19": 1747.0, + "20": 1605.0, + "21": 1936.0, + "22": 1586.0, + "23": 1869.0, + "24": 1508.0, + "25": 1506.0, + "26": 1674.0, + "27": 1742.0, + "28": 1978.0, + "29": 1867.0, + "30": 1888.0, + "31": 1551.0, + "32": 1866.0, + "33": 2085.0, + "34": 1816.0, + "35": 1884.0, + "36": 1866.0, + "37": 2390.0, + "38": 2008.0, + "39": 2403.0, + "40": 2077.0, + "41": 2225.0, + "42": 2252.0, + "43": 1924.0, + "44": 2075.0, + "45": 1956.0, + "46": 2175.0, + "47": 2425.0, + "48": 2225.0, + "49": 2244.0, + "50": 2430.0, + "51": 2464.0, + "52": 2572.0, + "53": 2977.0, + "54": 2621.0, + "55": 2248.0, + "56": 2813.0, + "57": 2293.0, + "58": 2874.0, + "59": 2959.0, + "60": 2499.0, + "61": 2762.0, + "62": 2658.0, + "63": 2472.0, + "64": 2840.0, + "65": 2587.0, + "66": 2880.0, + "67": 2813.0, + "68": 2775.0, + "69": 2821.0, + "70": 3127.0, + "71": 2870.0, + "72": 2609.0, + "73": 2835.0, + "74": 1993.0, + "75": 2474.0, + "76": 2896.0, + "77": 3050.0, + "78": 3120.0, + "79": 3093.0, + "80": 3284.0, + "81": 3502.0, + "82": 3169.0, + "83": 2614.0, + "84": 3087.0, + "85": 3140.0, + "86": 2590.0, + "87": 3631.0, + "88": 3120.0, + "89": 3308.0, + "90": 3137.0, + "91": 2801.0, + "92": 2977.0, + "93": 2727.0, + "94": 3180.0, + "95": 3264.0, + "96": 3436.0, + "97": 3329.0, + "98": 3661.0, + "99": 3152.0, + "100": 3367.0 } }, "mem-allocated-bytes": { @@ -218,106 +218,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 464552448.0, - "2": 464552448.0, - "3": 464552448.0, - "4": 464552448.0, - "5": 464552448.0, - "6": 464552448.0, - "7": 464552448.0, - "8": 464552448.0, - "9": 464552448.0, - "10": 464552448.0, - "11": 464552448.0, - "12": 464552448.0, - "13": 464552448.0, - "14": 464552448.0, - "15": 464552448.0, - "16": 464552448.0, - "17": 464552448.0, - "18": 464552448.0, - "19": 464552448.0, - "20": 464552448.0, - 
"21": 464552448.0, - "22": 464552448.0, - "23": 464552448.0, - "24": 464552448.0, - "25": 464552448.0, - "26": 464552448.0, - "27": 464552448.0, - "28": 464552448.0, - "29": 464552448.0, - "30": 464552448.0, - "31": 464552448.0, - "32": 464552448.0, - "33": 464552448.0, - "34": 464552448.0, - "35": 464552448.0, - "36": 464552448.0, - "37": 464552448.0, - "38": 464552448.0, - "39": 464552448.0, - "40": 464552448.0, - "41": 464552448.0, - "42": 464552448.0, - "43": 464552448.0, - "44": 464552448.0, - "45": 464552448.0, - "46": 464552448.0, - "47": 464552448.0, - "48": 464552448.0, - "49": 464552448.0, - "50": 464552448.0, - "51": 464552448.0, - "52": 464552448.0, - "53": 464552448.0, - "54": 464552448.0, - "55": 464552448.0, - "56": 464552448.0, - "57": 464552448.0, - "58": 464552448.0, - "59": 464552448.0, - "60": 464552448.0, - "61": 464552448.0, - "62": 464552448.0, - "63": 464552448.0, - "64": 464552448.0, - "65": 464552448.0, - "66": 464552448.0, - "67": 464552448.0, - "68": 464552448.0, - "69": 464552448.0, - "70": 464552448.0, - "71": 464552448.0, - "72": 464552448.0, - "73": 464552448.0, - "74": 464552448.0, - "75": 464552448.0, - "76": 464552448.0, - "77": 464552448.0, - "78": 464552448.0, - "79": 464552448.0, - "80": 464552448.0, - "81": 464552448.0, - "82": 464552448.0, - "83": 464552448.0, - "84": 464552448.0, - "85": 464552448.0, - "86": 464552448.0, - "87": 464552448.0, - "88": 464552448.0, - "89": 464552448.0, - "90": 464552448.0, - "91": 464552448.0, - "92": 464552448.0, - "93": 464552448.0, - "94": 464552448.0, - "95": 464552448.0, - "96": 464552448.0, - "97": 464552448.0, - "98": 464552448.0, - "99": 464552448.0, - "100": 464552448.0 + "1": 465207808.0, + "2": 466256384.0, + "3": 466256384.0, + "4": 466256384.0, + "5": 466256384.0, + "6": 466256384.0, + "7": 466256384.0, + "8": 466256384.0, + "9": 466256384.0, + "10": 466256384.0, + "11": 466256384.0, + "12": 466256384.0, + "13": 466256384.0, + "14": 466256384.0, + "15": 466256384.0, + "16": 
466256384.0, + "17": 466256384.0, + "18": 466256384.0, + "19": 466256384.0, + "20": 466256384.0, + "21": 466256384.0, + "22": 466256384.0, + "23": 466256384.0, + "24": 466256384.0, + "25": 466256384.0, + "26": 466256384.0, + "27": 466256384.0, + "28": 466256384.0, + "29": 466256384.0, + "30": 466256384.0, + "31": 466256384.0, + "32": 466256384.0, + "33": 466256384.0, + "34": 466256384.0, + "35": 466256384.0, + "36": 466256384.0, + "37": 466256384.0, + "38": 466256384.0, + "39": 466256384.0, + "40": 466256384.0, + "41": 466256384.0, + "42": 466256384.0, + "43": 466256384.0, + "44": 466256384.0, + "45": 466256384.0, + "46": 466256384.0, + "47": 466256384.0, + "48": 466256384.0, + "49": 466256384.0, + "50": 466256384.0, + "51": 466256384.0, + "52": 466256384.0, + "53": 466256384.0, + "54": 466256384.0, + "55": 466256384.0, + "56": 466256384.0, + "57": 466256384.0, + "58": 466256384.0, + "59": 466256384.0, + "60": 466256384.0, + "61": 466256384.0, + "62": 466256384.0, + "63": 466256384.0, + "64": 466256384.0, + "65": 466256384.0, + "66": 466256384.0, + "67": 466256384.0, + "68": 466256384.0, + "69": 466256384.0, + "70": 466256384.0, + "71": 466256384.0, + "72": 466256384.0, + "73": 466256384.0, + "74": 466256384.0, + "75": 466256384.0, + "76": 466256384.0, + "77": 466256384.0, + "78": 466256384.0, + "79": 466256384.0, + "80": 466256384.0, + "81": 466256384.0, + "82": 466256384.0, + "83": 466256384.0, + "84": 466256384.0, + "85": 466256384.0, + "86": 466256384.0, + "87": 466256384.0, + "88": 466256384.0, + "89": 466256384.0, + "90": 466256384.0, + "91": 466256384.0, + "92": 466256384.0, + "93": 466256384.0, + "94": 466256384.0, + "95": 466256384.0, + "96": 466256384.0, + "97": 466256384.0, + "98": 466256384.0, + "99": 466256384.0, + "100": 466256384.0 } }, "mem-max-allocated-bytes": { @@ -325,106 +325,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 1196456448.0, - "2": 1374693888.0, - "3": 1374693888.0, - "4": 1374693888.0, - "5": 1374693888.0, - "6": 
1374693888.0, - "7": 1374693888.0, - "8": 1374693888.0, - "9": 1374693888.0, - "10": 1374693888.0, - "11": 1374693888.0, - "12": 1374693888.0, - "13": 1374693888.0, - "14": 1374693888.0, - "15": 1374693888.0, - "16": 1374693888.0, - "17": 1374693888.0, - "18": 1374693888.0, - "19": 1374693888.0, - "20": 1374693888.0, - "21": 1374693888.0, - "22": 1374693888.0, - "23": 1374693888.0, - "24": 1374693888.0, - "25": 1374693888.0, - "26": 1374693888.0, - "27": 1374693888.0, - "28": 1374693888.0, - "29": 1374693888.0, - "30": 1374693888.0, - "31": 1374693888.0, - "32": 1374693888.0, - "33": 1374693888.0, - "34": 1374693888.0, - "35": 1374693888.0, - "36": 1374693888.0, - "37": 1374693888.0, - "38": 1374693888.0, - "39": 1374693888.0, - "40": 1374693888.0, - "41": 1374693888.0, - "42": 1374693888.0, - "43": 1374693888.0, - "44": 1374693888.0, - "45": 1374693888.0, - "46": 1374693888.0, - "47": 1374693888.0, - "48": 1374693888.0, - "49": 1374693888.0, - "50": 1374693888.0, - "51": 1374693888.0, - "52": 1374693888.0, - "53": 1374693888.0, - "54": 1374693888.0, - "55": 1374693888.0, - "56": 1374693888.0, - "57": 1374693888.0, - "58": 1374693888.0, - "59": 1374693888.0, - "60": 1374693888.0, - "61": 1374693888.0, - "62": 1374693888.0, - "63": 1374693888.0, - "64": 1374693888.0, - "65": 1374693888.0, - "66": 1374693888.0, - "67": 1374693888.0, - "68": 1374693888.0, - "69": 1374693888.0, - "70": 1374693888.0, - "71": 1374693888.0, - "72": 1374693888.0, - "73": 1374693888.0, - "74": 1374693888.0, - "75": 1374693888.0, - "76": 1374693888.0, - "77": 1374693888.0, - "78": 1374693888.0, - "79": 1374693888.0, - "80": 1374693888.0, - "81": 1374693888.0, - "82": 1374693888.0, - "83": 1374693888.0, - "84": 1374693888.0, - "85": 1374693888.0, - "86": 1374693888.0, - "87": 1374693888.0, - "88": 1374693888.0, - "89": 1374693888.0, - "90": 1374693888.0, - "91": 1374693888.0, - "92": 1374693888.0, - "93": 1374693888.0, - "94": 1374693888.0, - "95": 1374693888.0, - "96": 1374693888.0, - "97": 
1374693888.0, - "98": 1374693888.0, - "99": 1374693888.0, - "100": 1374693888.0 + "1": 1195407872.0, + "2": 1376397824.0, + "3": 1376397824.0, + "4": 1376397824.0, + "5": 1376397824.0, + "6": 1376397824.0, + "7": 1376397824.0, + "8": 1376397824.0, + "9": 1376397824.0, + "10": 1376397824.0, + "11": 1376397824.0, + "12": 1376397824.0, + "13": 1376397824.0, + "14": 1376397824.0, + "15": 1376397824.0, + "16": 1376397824.0, + "17": 1376397824.0, + "18": 1376397824.0, + "19": 1376397824.0, + "20": 1376397824.0, + "21": 1376397824.0, + "22": 1376397824.0, + "23": 1376397824.0, + "24": 1376397824.0, + "25": 1376397824.0, + "26": 1376397824.0, + "27": 1376397824.0, + "28": 1376397824.0, + "29": 1376397824.0, + "30": 1376397824.0, + "31": 1376397824.0, + "32": 1376397824.0, + "33": 1376397824.0, + "34": 1376397824.0, + "35": 1376397824.0, + "36": 1376397824.0, + "37": 1376397824.0, + "38": 1376397824.0, + "39": 1376397824.0, + "40": 1376397824.0, + "41": 1376397824.0, + "42": 1376397824.0, + "43": 1376397824.0, + "44": 1376397824.0, + "45": 1376397824.0, + "46": 1376397824.0, + "47": 1376397824.0, + "48": 1376397824.0, + "49": 1376397824.0, + "50": 1376397824.0, + "51": 1376397824.0, + "52": 1376397824.0, + "53": 1376397824.0, + "54": 1376397824.0, + "55": 1376397824.0, + "56": 1376397824.0, + "57": 1376397824.0, + "58": 1376397824.0, + "59": 1376397824.0, + "60": 1376397824.0, + "61": 1376397824.0, + "62": 1376397824.0, + "63": 1376397824.0, + "64": 1376397824.0, + "65": 1376397824.0, + "66": 1376397824.0, + "67": 1376397824.0, + "68": 1376397824.0, + "69": 1376397824.0, + "70": 1376397824.0, + "71": 1376397824.0, + "72": 1376397824.0, + "73": 1376397824.0, + "74": 1376397824.0, + "75": 1376397824.0, + "76": 1376397824.0, + "77": 1376397824.0, + "78": 1376397824.0, + "79": 1376397824.0, + "80": 1376397824.0, + "81": 1376397824.0, + "82": 1376397824.0, + "83": 1376397824.0, + "84": 1376397824.0, + "85": 1376397824.0, + "86": 1376397824.0, + "87": 1376397824.0, + "88": 
1376397824.0, + "89": 1376397824.0, + "90": 1376397824.0, + "91": 1376397824.0, + "92": 1376397824.0, + "93": 1376397824.0, + "94": 1376397824.0, + "95": 1376397824.0, + "96": 1376397824.0, + "97": 1376397824.0, + "98": 1376397824.0, + "99": 1376397824.0, + "100": 1376397824.0 } }, "iteration-time": { @@ -432,106 +432,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 9.03488, - "2": 0.20387, - "3": 0.20622, - "4": 0.19336, - "5": 0.19521, - "6": 0.20191, - "7": 0.19444, - "8": 0.23348, - "9": 0.25611, - "10": 0.24659, - "11": 0.19017, - "12": 0.2556, - "13": 0.18852, - "14": 0.18766, - "15": 0.2289, - "16": 0.18803, - "17": 0.18847, - "18": 0.18567, - "19": 0.18706, - "20": 0.18811, - "21": 0.20215, - "22": 0.39605, - "23": 0.18875, - "24": 0.21086, - "25": 0.18732, - "26": 0.18675, - "27": 0.18833, - "28": 0.23402, - "29": 0.18843, - "30": 0.18769, - "31": 0.21593, - "32": 0.21936, - "33": 0.18843, - "34": 0.21993, - "35": 0.18728, - "36": 0.18741, - "37": 0.18775, - "38": 0.22431, - "39": 0.24159, - "40": 0.25325, - "41": 0.18582, - "42": 0.18658, - "43": 0.24562, - "44": 0.30876, - "45": 0.22398, - "46": 0.18667, - "47": 0.18821, - "48": 0.18742, - "49": 0.20501, - "50": 0.18644, - "51": 0.19893, - "52": 0.18375, - "53": 0.18186, - "54": 0.18268, - "55": 0.18616, - "56": 0.32841, - "57": 0.18567, - "58": 0.41637, - "59": 0.25482, - "60": 0.18467, - "61": 0.21026, - "62": 0.18373, - "63": 0.20727, - "64": 0.44141, - "65": 0.18532, - "66": 0.18662, - "67": 0.18805, - "68": 0.1877, - "69": 0.18579, - "70": 0.18644, - "71": 0.20361, - "72": 0.25218, - "73": 0.18582, - "74": 0.21341, - "75": 0.1876, - "76": 0.18385, - "77": 0.18512, - "78": 0.18447, - "79": 0.18604, - "80": 0.44402, - "81": 0.22886, - "82": 0.18502, - "83": 0.18578, - "84": 0.18519, - "85": 0.18624, - "86": 0.18704, - "87": 0.18561, - "88": 0.1864, - "89": 0.18676, - "90": 0.18596, - "91": 0.18759, - "92": 0.18643, - "93": 0.2303, - "94": 0.18509, - "95": 0.18557, - "96": 0.22378, - 
"97": 0.18724, - "98": 0.18202, - "99": 0.19781, - "100": 0.22613 + "1": "nan", + "2": 2.70208, + "3": 0.19598, + "4": 0.17697, + "5": 0.17606, + "6": 0.17518, + "7": 0.17383, + "8": 0.17622, + "9": 0.17697, + "10": 0.17845, + "11": 0.17811, + "12": 0.17772, + "13": 0.17922, + "14": 0.17797, + "15": 0.17934, + "16": 0.18103, + "17": 0.18059, + "18": 0.17963, + "19": 0.18123, + "20": 0.18073, + "21": 0.18061, + "22": 0.18072, + "23": 0.17975, + "24": 0.18067, + "25": 0.18039, + "26": 0.17929, + "27": 0.17948, + "28": 0.17917, + "29": 0.17952, + "30": 0.17908, + "31": 0.1795, + "32": 0.17963, + "33": 0.17979, + "34": 0.18047, + "35": 0.18032, + "36": 0.18061, + "37": 0.17948, + "38": 0.17887, + "39": 0.17897, + "40": 0.17957, + "41": 0.17982, + "42": 0.1808, + "43": 0.17965, + "44": 0.1807, + "45": 0.18099, + "46": 0.17964, + "47": 0.17861, + "48": 0.17946, + "49": 0.18292, + "50": 0.18089, + "51": 0.18183, + "52": 0.16643, + "53": 0.16906, + "54": 0.16731, + "55": 0.16773, + "56": 0.16957, + "57": 0.1691, + "58": 0.17123, + "59": 0.17207, + "60": 0.17308, + "61": 0.17219, + "62": 0.17353, + "63": 0.17543, + "64": 0.17335, + "65": 0.17469, + "66": 0.17402, + "67": 0.17585, + "68": 0.17421, + "69": 0.17363, + "70": 0.1748, + "71": 0.17377, + "72": 0.17421, + "73": 0.17466, + "74": 0.17508, + "75": 0.17297, + "76": 0.17297, + "77": 0.17289, + "78": 0.17516, + "79": 0.17501, + "80": 0.17483, + "81": 0.17493, + "82": 0.17481, + "83": 0.17496, + "84": 0.17501, + "85": 0.17642, + "86": 0.17507, + "87": 0.17445, + "88": 0.17535, + "89": 0.17531, + "90": 0.17467, + "91": 0.17485, + "92": 0.17537, + "93": 0.17577, + "94": 0.1757, + "95": 0.1752, + "96": 0.17534, + "97": 0.17544, + "98": 0.17458, + "99": 0.17379, + "100": 0.17525 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_dev_dgx_gb200_2nd.json 
b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_dev_dgx_gb200_2nd.json new file mode 100644 index 00000000000..bb062f69f88 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_dev_dgx_gb200_2nd.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 9.84981, + "52": 9.74427, + "53": 10.05843, + "54": 9.9513, + "55": 9.88338, + "56": 9.61325, + "57": 9.46895, + "58": 9.82166, + "59": 9.57703, + "60": 9.49787, + "61": 9.69257, + "62": 9.98595, + "63": 9.37399, + "64": 9.76604, + "65": 8.94651, + "66": 9.70103, + "67": 9.36368, + "68": 9.78235, + "69": 9.79883, + "70": 9.73165, + "71": 9.62507, + "72": 9.5831, + "73": 9.48817, + "74": 8.92613, + "75": 9.40726, + "76": 9.07706, + "77": 10.0586, + "78": 9.72205, + "79": 9.37661, + "80": 9.4027, + "81": 9.48209, + "82": 9.69951, + "83": 9.31355, + "84": 9.41731, + "85": 9.61584, + "86": 9.07426, + "87": 9.59553, + "88": 9.75065, + "89": 9.60039, + "90": 9.82207, + "91": 9.33876, + "92": 9.35777, + "93": 9.08671, + "94": 8.82959, + "95": 9.52597, + "96": 9.52973, + "97": 9.30334, + "98": 
9.67135, + "99": 8.89539, + "100": 9.40569 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 2483.0, + "52": 2539.0, + "53": 2798.0, + "54": 2526.0, + "55": 2228.0, + "56": 2738.0, + "57": 2252.0, + "58": 2797.0, + "59": 2893.0, + "60": 2453.0, + "61": 2929.0, + "62": 2698.0, + "63": 2347.0, + "64": 2902.0, + "65": 2556.0, + "66": 2922.0, + "67": 2829.0, + "68": 2669.0, + "69": 2814.0, + "70": 3041.0, + "71": 2872.0, + "72": 2512.0, + "73": 2971.0, + "74": 1870.0, + "75": 2349.0, + "76": 2844.0, + "77": 3121.0, + "78": 3116.0, + "79": 3196.0, + "80": 3164.0, + "81": 3454.0, + "82": 3176.0, + "83": 2613.0, + "84": 3093.0, + "85": 3128.0, + "86": 2792.0, + "87": 3771.0, + "88": 3108.0, + "89": 3297.0, + "90": 3042.0, + "91": 2850.0, + "92": 2873.0, + "93": 2709.0, + "94": 3294.0, + "95": 3282.0, + "96": 3536.0, + "97": 3150.0, + "98": 3479.0, + "99": 3113.0, + "100": 3370.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + 
"15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 465864192.0, + "52": 465864192.0, + "53": 465864192.0, + "54": 465864192.0, + "55": 465864192.0, + "56": 465864192.0, + "57": 465864192.0, + "58": 465864192.0, + "59": 465864192.0, + "60": 465864192.0, + "61": 465864192.0, + "62": 465864192.0, + "63": 465864192.0, + "64": 465864192.0, + "65": 465864192.0, + "66": 465864192.0, + "67": 465864192.0, + "68": 465864192.0, + "69": 465864192.0, + "70": 465864192.0, + "71": 465864192.0, + "72": 465864192.0, + "73": 465864192.0, + "74": 465864192.0, + "75": 465864192.0, + "76": 465864192.0, + "77": 465864192.0, + "78": 465864192.0, + "79": 465864192.0, + "80": 465864192.0, + "81": 465864192.0, + "82": 465864192.0, + "83": 465864192.0, + "84": 465864192.0, + "85": 465864192.0, + "86": 465864192.0, + "87": 465864192.0, + "88": 465864192.0, + "89": 465864192.0, + "90": 465864192.0, + "91": 465864192.0, + "92": 465864192.0, + "93": 465864192.0, + "94": 465864192.0, + "95": 465864192.0, + "96": 465864192.0, + "97": 465864192.0, + "98": 465864192.0, + "99": 465864192.0, + "100": 465864192.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + 
"21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 1378101760.0, + "52": 1378102784.0, + "53": 1378102784.0, + "54": 1378102784.0, + "55": 1378102784.0, + "56": 1378102784.0, + "57": 1378102784.0, + "58": 1378102784.0, + "59": 1378102784.0, + "60": 1378102784.0, + "61": 1378102784.0, + "62": 1378102784.0, + "63": 1378102784.0, + "64": 1378102784.0, + "65": 1378102784.0, + "66": 1378102784.0, + "67": 1378102784.0, + "68": 1378102784.0, + "69": 1378102784.0, + "70": 1378102784.0, + "71": 1378102784.0, + "72": 1378102784.0, + "73": 1378102784.0, + "74": 1378102784.0, + "75": 1378102784.0, + "76": 1378102784.0, + "77": 1378102784.0, + "78": 1378102784.0, + "79": 1378102784.0, + "80": 1378102784.0, + "81": 1378102784.0, + "82": 1378102784.0, + "83": 1378102784.0, + "84": 1378102784.0, + "85": 1378102784.0, + "86": 1378102784.0, + "87": 1378102784.0, + "88": 1378102784.0, + "89": 1378102784.0, + "90": 1378102784.0, + "91": 1378102784.0, + "92": 1378102784.0, + "93": 1378102784.0, + "94": 1378102784.0, + "95": 1378102784.0, + "96": 1378102784.0, + "97": 1378102784.0, + "98": 1378102784.0, + "99": 1378102784.0, + "100": 1378102784.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": 
"nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": "nan", + "52": 2.91958, + "53": 0.19241, + "54": 0.1787, + "55": 0.17896, + "56": 0.17921, + "57": 0.17976, + "58": 0.17909, + "59": 0.18055, + "60": 0.18105, + "61": 0.18154, + "62": 0.18039, + "63": 0.18167, + "64": 0.1811, + "65": 0.18155, + "66": 0.18214, + "67": 0.18228, + "68": 0.18197, + "69": 0.18293, + "70": 0.1812, + "71": 0.18051, + "72": 0.18186, + "73": 0.18056, + "74": 0.17931, + "75": 0.17657, + "76": 0.17103, + "77": 0.16975, + "78": 0.17336, + "79": 0.17356, + "80": 0.17203, + "81": 0.17343, + "82": 0.17407, + "83": 0.17347, + "84": 0.17434, + "85": 0.17348, + "86": 0.17449, + "87": 0.17439, + "88": 0.17349, + "89": 0.17397, + "90": 0.17349, + "91": 0.17383, + "92": 0.17402, + "93": 0.17583, + "94": 0.17507, + "95": 0.17414, + "96": 0.17276, + "97": 0.17329, + "98": 0.17376, + "99": 0.17325, + "100": 0.17482 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor/golden_values_dev_dgx_gb200.json index b3990651f36..7dd5b31f34f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor/golden_values_dev_dgx_gb200.json @@ -218,106 +218,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 700103168.0, - "2": 700103168.0, - "3": 700103168.0, - "4": 700103168.0, - "5": 700103168.0, - "6": 700103168.0, 
- "7": 700103168.0, - "8": 700103168.0, - "9": 700103168.0, - "10": 700103168.0, - "11": 700103168.0, - "12": 700103168.0, - "13": 700103168.0, - "14": 700103168.0, - "15": 700103168.0, - "16": 700103168.0, - "17": 700103168.0, - "18": 700103168.0, - "19": 700103168.0, - "20": 700103168.0, - "21": 700103168.0, - "22": 700103168.0, - "23": 700103168.0, - "24": 700103168.0, - "25": 700103168.0, - "26": 700103168.0, - "27": 700103168.0, - "28": 700103168.0, - "29": 700103168.0, - "30": 700103168.0, - "31": 700103168.0, - "32": 700103168.0, - "33": 700103168.0, - "34": 700103168.0, - "35": 700103168.0, - "36": 700103168.0, - "37": 700103168.0, - "38": 700103168.0, - "39": 700103168.0, - "40": 700103168.0, - "41": 700103168.0, - "42": 700103168.0, - "43": 700103168.0, - "44": 700103168.0, - "45": 700103168.0, - "46": 700103168.0, - "47": 700103168.0, - "48": 700103168.0, - "49": 700103168.0, - "50": 700103168.0, - "51": 700103168.0, - "52": 700103168.0, - "53": 700103168.0, - "54": 700103168.0, - "55": 700103168.0, - "56": 700103168.0, - "57": 700103168.0, - "58": 700103168.0, - "59": 700103168.0, - "60": 700103168.0, - "61": 700103168.0, - "62": 700103168.0, - "63": 700103168.0, - "64": 700103168.0, - "65": 700103168.0, - "66": 700103168.0, - "67": 700103168.0, - "68": 700103168.0, - "69": 700103168.0, - "70": 700103168.0, - "71": 700103168.0, - "72": 700103168.0, - "73": 700103168.0, - "74": 700103168.0, - "75": 700103168.0, - "76": 700103168.0, - "77": 700103168.0, - "78": 700103168.0, - "79": 700103168.0, - "80": 700103168.0, - "81": 700103168.0, - "82": 700103168.0, - "83": 700103168.0, - "84": 700103168.0, - "85": 700103168.0, - "86": 700103168.0, - "87": 700103168.0, - "88": 700103168.0, - "89": 700103168.0, - "90": 700103168.0, - "91": 700103168.0, - "92": 700103168.0, - "93": 700103168.0, - "94": 700103168.0, - "95": 700103168.0, - "96": 700103168.0, - "97": 700103168.0, - "98": 700103168.0, - "99": 700103168.0, - "100": 700103168.0 + "1": 246999552.0, + "2": 
246999552.0, + "3": 246999552.0, + "4": 246999552.0, + "5": 246999552.0, + "6": 246999552.0, + "7": 246999552.0, + "8": 246999552.0, + "9": 246999552.0, + "10": 246999552.0, + "11": 246999552.0, + "12": 246999552.0, + "13": 246999552.0, + "14": 246999552.0, + "15": 246999552.0, + "16": 246999552.0, + "17": 246999552.0, + "18": 246999552.0, + "19": 246999552.0, + "20": 246999552.0, + "21": 246999552.0, + "22": 246999552.0, + "23": 246999552.0, + "24": 246999552.0, + "25": 246999552.0, + "26": 246999552.0, + "27": 246999552.0, + "28": 246999552.0, + "29": 246999552.0, + "30": 246999552.0, + "31": 246999552.0, + "32": 246999552.0, + "33": 246999552.0, + "34": 246999552.0, + "35": 246999552.0, + "36": 246999552.0, + "37": 246999552.0, + "38": 246999552.0, + "39": 246999552.0, + "40": 246999552.0, + "41": 246999552.0, + "42": 246999552.0, + "43": 246999552.0, + "44": 246999552.0, + "45": 246999552.0, + "46": 246999552.0, + "47": 246999552.0, + "48": 246999552.0, + "49": 246999552.0, + "50": 246999552.0, + "51": 246999552.0, + "52": 246999552.0, + "53": 246999552.0, + "54": 246999552.0, + "55": 246999552.0, + "56": 246999552.0, + "57": 246999552.0, + "58": 246999552.0, + "59": 246999552.0, + "60": 246999552.0, + "61": 246999552.0, + "62": 246999552.0, + "63": 246999552.0, + "64": 246999552.0, + "65": 246999552.0, + "66": 246999552.0, + "67": 246999552.0, + "68": 246999552.0, + "69": 246999552.0, + "70": 246999552.0, + "71": 246999552.0, + "72": 246999552.0, + "73": 246999552.0, + "74": 246999552.0, + "75": 246999552.0, + "76": 246999552.0, + "77": 246999552.0, + "78": 246999552.0, + "79": 246999552.0, + "80": 246999552.0, + "81": 246999552.0, + "82": 246999552.0, + "83": 246999552.0, + "84": 246999552.0, + "85": 246999552.0, + "86": 246999552.0, + "87": 246999552.0, + "88": 246999552.0, + "89": 246999552.0, + "90": 246999552.0, + "91": 246999552.0, + "92": 246999552.0, + "93": 246999552.0, + "94": 246999552.0, + "95": 246999552.0, + "96": 246999552.0, + "97": 
246999552.0, + "98": 246999552.0, + "99": 246999552.0, + "100": 246999552.0 } }, "mem-max-allocated-bytes": { @@ -325,106 +325,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 1981478400.0, - "2": 1981479424.0, - "3": 1981479424.0, - "4": 1981479424.0, - "5": 1981479424.0, - "6": 1981479424.0, - "7": 1981479424.0, - "8": 1981479424.0, - "9": 1981479424.0, - "10": 1981479424.0, - "11": 1981479424.0, - "12": 1981479424.0, - "13": 1981479424.0, - "14": 1981479424.0, - "15": 1981479424.0, - "16": 1981479424.0, - "17": 1981479424.0, - "18": 1981479424.0, - "19": 1981479424.0, - "20": 1981479424.0, - "21": 1981479424.0, - "22": 1981479424.0, - "23": 1981479424.0, - "24": 1981479424.0, - "25": 1981479424.0, - "26": 1981479424.0, - "27": 1981479424.0, - "28": 1981479424.0, - "29": 1981479424.0, - "30": 1981479424.0, - "31": 1981479424.0, - "32": 1981479424.0, - "33": 1981479424.0, - "34": 1981479424.0, - "35": 1981479424.0, - "36": 1981479424.0, - "37": 1981479424.0, - "38": 1981479424.0, - "39": 1981479424.0, - "40": 1981479424.0, - "41": 1981479424.0, - "42": 1981479424.0, - "43": 1981479424.0, - "44": 1981479424.0, - "45": 1981479424.0, - "46": 1981479424.0, - "47": 1981479424.0, - "48": 1981479424.0, - "49": 1981479424.0, - "50": 1981479424.0, - "51": 1981479424.0, - "52": 1981479424.0, - "53": 1981479424.0, - "54": 1981479424.0, - "55": 1981479424.0, - "56": 1981479424.0, - "57": 1981479424.0, - "58": 1981479424.0, - "59": 1981479424.0, - "60": 1981479424.0, - "61": 1981479424.0, - "62": 1981479424.0, - "63": 1981479424.0, - "64": 1981479424.0, - "65": 1981479424.0, - "66": 1981479424.0, - "67": 1981479424.0, - "68": 1981479424.0, - "69": 1981479424.0, - "70": 1981479424.0, - "71": 1981479424.0, - "72": 1981479424.0, - "73": 1981479424.0, - "74": 1981479424.0, - "75": 1981479424.0, - "76": 1981479424.0, - "77": 1981479424.0, - "78": 1981479424.0, - "79": 1981479424.0, - "80": 1981479424.0, - "81": 1981479424.0, - "82": 1981479424.0, - "83": 1981479424.0, 
- "84": 1981479424.0, - "85": 1981479424.0, - "86": 1981479424.0, - "87": 1981479424.0, - "88": 1981479424.0, - "89": 1981479424.0, - "90": 1981479424.0, - "91": 1981479424.0, - "92": 1981479424.0, - "93": 1981479424.0, - "94": 1981479424.0, - "95": 1981479424.0, - "96": 1981479424.0, - "97": 1981479424.0, - "98": 1981479424.0, - "99": 1981479424.0, - "100": 1981479424.0 + "1": 1528374784.0, + "2": 1528375808.0, + "3": 1528375808.0, + "4": 1528375808.0, + "5": 1528375808.0, + "6": 1528375808.0, + "7": 1528375808.0, + "8": 1528375808.0, + "9": 1528375808.0, + "10": 1528375808.0, + "11": 1528375808.0, + "12": 1528375808.0, + "13": 1528375808.0, + "14": 1528375808.0, + "15": 1528375808.0, + "16": 1528375808.0, + "17": 1528375808.0, + "18": 1528375808.0, + "19": 1528375808.0, + "20": 1528375808.0, + "21": 1528375808.0, + "22": 1528375808.0, + "23": 1528375808.0, + "24": 1528375808.0, + "25": 1528375808.0, + "26": 1528375808.0, + "27": 1528375808.0, + "28": 1528375808.0, + "29": 1528375808.0, + "30": 1528375808.0, + "31": 1528375808.0, + "32": 1528375808.0, + "33": 1528375808.0, + "34": 1528375808.0, + "35": 1528375808.0, + "36": 1528375808.0, + "37": 1528375808.0, + "38": 1528375808.0, + "39": 1528375808.0, + "40": 1528375808.0, + "41": 1528375808.0, + "42": 1528375808.0, + "43": 1528375808.0, + "44": 1528375808.0, + "45": 1528375808.0, + "46": 1528375808.0, + "47": 1528375808.0, + "48": 1528375808.0, + "49": 1528375808.0, + "50": 1528375808.0, + "51": 1528375808.0, + "52": 1528375808.0, + "53": 1528375808.0, + "54": 1528375808.0, + "55": 1528375808.0, + "56": 1528375808.0, + "57": 1528375808.0, + "58": 1528375808.0, + "59": 1528375808.0, + "60": 1528375808.0, + "61": 1528375808.0, + "62": 1528375808.0, + "63": 1528375808.0, + "64": 1528375808.0, + "65": 1528375808.0, + "66": 1528375808.0, + "67": 1528375808.0, + "68": 1528375808.0, + "69": 1528375808.0, + "70": 1528375808.0, + "71": 1528375808.0, + "72": 1528375808.0, + "73": 1528375808.0, + "74": 1528375808.0, + 
"75": 1528375808.0, + "76": 1528375808.0, + "77": 1528375808.0, + "78": 1528375808.0, + "79": 1528375808.0, + "80": 1528375808.0, + "81": 1528375808.0, + "82": 1528375808.0, + "83": 1528375808.0, + "84": 1528375808.0, + "85": 1528375808.0, + "86": 1528375808.0, + "87": 1528375808.0, + "88": 1528375808.0, + "89": 1528375808.0, + "90": 1528375808.0, + "91": 1528375808.0, + "92": 1528375808.0, + "93": 1528375808.0, + "94": 1528375808.0, + "95": 1528375808.0, + "96": 1528375808.0, + "97": 1528375808.0, + "98": 1528375808.0, + "99": 1528375808.0, + "100": 1528375808.0 } }, "iteration-time": { @@ -432,106 +432,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 4.30733, - "2": 0.54883, - "3": 0.40227, - "4": 0.3032, - "5": 0.22011, - "6": 0.21873, - "7": 0.21589, - "8": 0.21756, - "9": 0.2177, - "10": 0.21872, - "11": 0.23383, - "12": 0.21745, - "13": 0.21657, - "14": 0.21656, - "15": 0.21713, - "16": 0.21742, - "17": 0.21697, - "18": 0.21201, - "19": 0.21506, - "20": 0.2157, - "21": 0.21772, - "22": 0.21677, - "23": 0.21503, - "24": 0.21505, - "25": 0.21274, - "26": 0.21593, - "27": 0.21499, - "28": 0.21603, - "29": 0.21474, - "30": 0.21468, - "31": 0.21508, - "32": 0.21333, - "33": 0.21573, - "34": 0.21478, - "35": 0.21464, - "36": 0.21568, - "37": 0.21601, - "38": 0.21414, - "39": 0.21389, - "40": 0.21264, - "41": 0.21397, - "42": 0.21475, - "43": 0.21799, - "44": 0.21345, - "45": 0.21458, - "46": 0.21222, - "47": 0.2147, - "48": 0.21568, - "49": 0.21432, - "50": 0.21429, - "51": 0.30696, - "52": 0.26677, - "53": 0.22953, - "54": 0.24163, - "55": 0.25403, - "56": 0.26249, - "57": 0.21297, - "58": 0.21192, - "59": 0.20898, - "60": 0.21257, - "61": 0.21307, - "62": 0.21067, - "63": 0.21212, - "64": 0.21044, - "65": 0.21146, - "66": 0.21291, - "67": 0.21327, - "68": 0.21434, - "69": 0.21106, - "70": 0.21146, - "71": 0.21366, - "72": 0.21359, - "73": 0.21245, - "74": 0.21111, - "75": 0.21327, - "76": 0.21236, - "77": 0.21209, - "78": 0.21155, - "79": 0.2124, - 
"80": 0.21314, - "81": 0.21341, - "82": 0.21206, - "83": 0.21321, - "84": 0.21124, - "85": 0.21448, - "86": 0.21358, - "87": 0.21637, - "88": 0.21209, - "89": 0.21325, - "90": 0.2136, - "91": 0.21349, - "92": 0.20976, - "93": 0.21241, - "94": 0.21301, - "95": 0.21086, - "96": 0.21278, - "97": 0.21118, - "98": 0.21308, - "99": 0.21572, - "100": 0.21585 + "1": "nan", + "2": 1.71105, + "3": 0.22879, + "4": 0.22169, + "5": 0.21979, + "6": 0.21933, + "7": 0.21836, + "8": 0.22054, + "9": 0.22096, + "10": 0.22079, + "11": 0.22255, + "12": 0.21905, + "13": 0.22266, + "14": 0.22261, + "15": 0.22192, + "16": 0.21928, + "17": 0.22014, + "18": 0.2213, + "19": 0.22242, + "20": 0.22097, + "21": 0.21936, + "22": 0.22091, + "23": 0.22071, + "24": 0.22306, + "25": 0.22073, + "26": 0.22028, + "27": 0.22198, + "28": 0.22294, + "29": 0.22204, + "30": 0.21896, + "31": 0.22144, + "32": 0.22279, + "33": 0.22428, + "34": 0.22247, + "35": 0.22192, + "36": 0.22242, + "37": 0.22321, + "38": 0.22186, + "39": 0.22242, + "40": 0.22098, + "41": 0.22254, + "42": 0.55234, + "43": 0.22432, + "44": 0.22103, + "45": 0.22202, + "46": 0.2216, + "47": 0.22107, + "48": 0.21878, + "49": 0.22338, + "50": 0.22181, + "51": 0.22588, + "52": 0.22221, + "53": 0.22214, + "54": 0.22059, + "55": 0.22088, + "56": 0.22231, + "57": 0.2231, + "58": 0.22228, + "59": 0.22136, + "60": 0.22087, + "61": 0.22171, + "62": 0.22165, + "63": 0.22149, + "64": 0.22165, + "65": 0.22916, + "66": 0.25667, + "67": 0.22585, + "68": 0.2212, + "69": 0.22322, + "70": 0.22332, + "71": 0.22291, + "72": 0.22074, + "73": 0.2214, + "74": 0.22287, + "75": 0.21929, + "76": 0.22246, + "77": 0.22148, + "78": 0.22442, + "79": 0.22465, + "80": 0.22859, + "81": 0.22464, + "82": 0.22391, + "83": 0.22417, + "84": 0.22202, + "85": 0.22369, + "86": 0.22224, + "87": 0.22245, + "88": 0.22255, + "89": 0.22379, + "90": 0.22356, + "91": 0.22229, + "92": 0.22297, + "93": 0.22525, + "94": 0.21956, + "95": 0.22318, + "96": 0.22361, + "97": 0.22246, + "98": 
0.22326, + "99": 0.22121, + "100": 0.22214 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200.json index 3264336647e..85eca8a168b 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200.json @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 5.51145, - "2": 0.34832, - "3": 0.34015, - "4": 0.32824, - "5": 0.32875, - "6": 0.32954, - "7": 0.3278, - "8": 0.32782, - "9": 0.33548, - "10": 0.32705, - "11": 0.3306, - "12": 0.649, - "13": 0.32524, - "14": 0.32234, - "15": 0.32194, - "16": 0.32286, - "17": 0.32381, - "18": 0.32317, - "19": 0.32316, - "20": 0.32225, - "21": 0.32237, - "22": 0.32068, - "23": 0.31836, - "24": 0.32077, - "25": 0.32241, - "26": 0.3196, - "27": 0.32484, - "28": 0.3223, - "29": 0.32268, - "30": 0.31921, - "31": 0.31951, - "32": 0.31901, - "33": 0.31776, - "34": 0.31959, - "35": 0.32009, - "36": 0.32217, - "37": 0.31843, - "38": 0.32842, - "39": 0.31803, - "40": 0.32118, - "41": 0.67436, - "42": 0.32184, - "43": 0.31883, - "44": 0.31976, - "45": 0.64044, - "46": 0.38679, - "47": 0.37664, - "48": 0.3844, - "49": 0.38013, - "50": 0.38188 + "1": "nan", + "2": 2.48029, + "3": 0.33127, + "4": 0.31594, + "5": 0.32143, + "6": 0.31919, + "7": 0.31884, + "8": 0.32129, + "9": 0.31988, + "10": 0.32069, + "11": 0.31907, + "12": 0.31959, + "13": 0.32211, + "14": 0.31964, + "15": 0.31855, + "16": 0.32013, + "17": 0.32004, + "18": 0.31786, + "19": 0.31755, + "20": 0.31944, + "21": 0.31998, + "22": 0.32066, + "23": 0.32079, + "24": 
0.31728, + "25": 0.31689, + "26": 0.31831, + "27": 0.31727, + "28": 0.31999, + "29": 0.31997, + "30": 0.31824, + "31": 0.31724, + "32": 0.33433, + "33": 0.43748, + "34": 0.63551, + "35": 0.35878, + "36": 0.31703, + "37": 0.31709, + "38": 0.32151, + "39": 0.31762, + "40": 0.3204, + "41": 0.3741, + "42": 0.37991, + "43": 0.3738, + "44": 0.38277, + "45": 0.38, + "46": 0.37409, + "47": 0.36543, + "48": 0.37113, + "49": 0.36281, + "50": 0.36274 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_gb200.json index 4302879367b..c4a7b5d2ff0 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_gb200.json @@ -432,106 +432,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 5.73516, - "2": 0.33146, - "3": 0.29953, - "4": 0.28786, - "5": 0.28898, - "6": 0.28943, - "7": 0.60486, - "8": 0.28771, - "9": 0.28863, - "10": 0.33495, - "11": 0.35979, - "12": 0.31826, - "13": 0.31158, - "14": 0.3553, - "15": 0.40525, - "16": 0.29653, - "17": 0.28954, - "18": 0.28746, - "19": 0.28594, - "20": 0.28918, - "21": 0.28811, - "22": 0.28994, - "23": 0.2878, - "24": 0.28704, - "25": 0.28786, - "26": 0.28829, - "27": 0.28723, - "28": 0.28842, - "29": 0.28755, - "30": 0.28856, - "31": 0.28778, - "32": 0.29729, - "33": 0.28622, - "34": 0.28852, - "35": 0.29006, - "36": 0.29076, - "37": 0.28535, - "38": 0.28783, - "39": 0.28843, - "40": 0.29078, - "41": 0.28844, - "42": 0.28652, - "43": 0.28742, - "44": 0.2859, - "45": 0.2849, - "46": 0.28877, - "47": 0.28739, - "48": 
0.28758, - "49": 0.28616, - "50": 0.29116, - "51": 0.90295, - "52": 0.37657, - "53": 0.35642, - "54": 0.35986, - "55": 0.36134, - "56": 0.36573, - "57": 0.36411, - "58": 0.36481, - "59": 0.36464, - "60": 0.36272, - "61": 0.36512, - "62": 0.36724, - "63": 0.36476, - "64": 0.36594, - "65": 0.36724, - "66": 0.64822, - "67": 0.36581, - "68": 0.36271, - "69": 0.366, - "70": 0.36762, - "71": 0.36789, - "72": 0.64766, - "73": 0.36425, - "74": 0.36764, - "75": 0.3661, - "76": 0.36465, - "77": 0.36495, - "78": 0.36147, - "79": 0.36669, - "80": 0.36518, - "81": 0.36345, - "82": 0.36631, - "83": 0.36797, - "84": 0.36517, - "85": 0.36573, - "86": 0.36641, - "87": 0.36619, - "88": 0.3675, - "89": 0.3649, - "90": 0.36424, - "91": 0.36515, - "92": 0.36402, - "93": 0.3686, - "94": 0.36775, - "95": 0.36962, - "96": 0.36798, - "97": 0.36651, - "98": 0.36783, - "99": 0.36877, - "100": 0.36479 + "1": "nan", + "2": 2.39769, + "3": 0.36211, + "4": 0.34515, + "5": 0.34772, + "6": 0.34875, + "7": 0.33703, + "8": 0.33347, + "9": 0.33755, + "10": 0.3382, + "11": 0.34227, + "12": 0.34002, + "13": 0.34177, + "14": 0.34923, + "15": 0.34952, + "16": 0.34726, + "17": 0.34899, + "18": 0.34889, + "19": 0.3485, + "20": 0.34828, + "21": 0.35065, + "22": 0.35089, + "23": 0.34992, + "24": 0.34939, + "25": 0.34831, + "26": 0.35035, + "27": 0.3455, + "28": 0.34034, + "29": 0.352, + "30": 0.35218, + "31": 0.3474, + "32": 0.34883, + "33": 0.35242, + "34": 0.35219, + "35": 0.35394, + "36": 0.35651, + "37": 0.35447, + "38": 0.35431, + "39": 0.35341, + "40": 0.35274, + "41": 0.35275, + "42": 0.35259, + "43": 0.35167, + "44": 0.35288, + "45": 0.35066, + "46": 0.3519, + "47": 0.35146, + "48": 0.34993, + "49": 0.35082, + "50": 0.35229, + "51": 0.6173, + "52": 0.34392, + "53": 0.34427, + "54": 0.3476, + "55": 0.34816, + "56": 0.34515, + "57": 0.34404, + "58": 0.34542, + "59": 0.34546, + "60": 0.34635, + "61": 0.35023, + "62": 0.34884, + "63": 0.3484, + "64": 0.34644, + "65": 0.34943, + "66": 0.34821, + "67": 
0.34706, + "68": 0.34645, + "69": 0.34888, + "70": 0.34562, + "71": 0.34952, + "72": 0.34911, + "73": 0.34968, + "74": 0.34895, + "75": 0.34861, + "76": 0.34704, + "77": 0.34924, + "78": 0.35302, + "79": 0.35161, + "80": 0.34618, + "81": 0.35136, + "82": 0.3518, + "83": 0.34829, + "84": 0.34739, + "85": 0.34831, + "86": 0.34725, + "87": 0.34629, + "88": 0.35011, + "89": 0.34978, + "90": 0.34956, + "91": 0.34919, + "92": 0.35021, + "93": 0.34979, + "94": 0.35425, + "95": 0.34862, + "96": 0.34704, + "97": 0.34718, + "98": 0.34842, + "99": 0.35045, + "100": 0.349 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_gb200_2nd.json new file mode 100644 index 00000000000..e8ad4bfea94 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_gb200_2nd.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 
9.86104, + "52": 9.75109, + "53": 10.06631, + "54": 9.95634, + "55": 9.89354, + "56": 9.637, + "57": 9.49142, + "58": 9.8341, + "59": 9.5931, + "60": 9.51379, + "61": 9.69183, + "62": 9.99162, + "63": 9.39196, + "64": 9.77455, + "65": 8.96319, + "66": 9.70663, + "67": 9.3789, + "68": 9.78328, + "69": 9.79736, + "70": 9.73753, + "71": 9.62711, + "72": 9.58907, + "73": 9.50446, + "74": 8.94975, + "75": 9.4278, + "76": 9.08764, + "77": 10.06759, + "78": 9.72141, + "79": 9.3861, + "80": 9.40495, + "81": 9.48596, + "82": 9.70195, + "83": 9.31553, + "84": 9.41806, + "85": 9.61378, + "86": 9.08145, + "87": 9.59631, + "88": 9.75008, + "89": 9.60386, + "90": 9.82838, + "91": 9.33622, + "92": 9.35764, + "93": 9.08795, + "94": 8.83437, + "95": 9.53352, + "96": 9.53315, + "97": 9.31129, + "98": 9.67176, + "99": 8.89816, + "100": 9.40969 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 2431.0, + "52": 2467.0, + "53": 2794.0, + "54": 2675.0, + "55": 2313.0, + "56": 2597.0, + "57": 2278.0, + "58": 2887.0, + "59": 2701.0, + "60": 2190.0, + "61": 2764.0, + "62": 2576.0, + "63": 2405.0, + "64": 2903.0, + "65": 2516.0, + "66": 2885.0, + "67": 2700.0, + "68": 2682.0, + "69": 2987.0, + "70": 3141.0, + 
"71": 3055.0, + "72": 2413.0, + "73": 2864.0, + "74": 1870.0, + "75": 2450.0, + "76": 3032.0, + "77": 3230.0, + "78": 3125.0, + "79": 2982.0, + "80": 3203.0, + "81": 3657.0, + "82": 3174.0, + "83": 2818.0, + "84": 3190.0, + "85": 3166.0, + "86": 2793.0, + "87": 3635.0, + "88": 3005.0, + "89": 3373.0, + "90": 3066.0, + "91": 2857.0, + "92": 3080.0, + "93": 2533.0, + "94": 3303.0, + "95": 3270.0, + "96": 3416.0, + "97": 3085.0, + "98": 3437.0, + "99": 3243.0, + "100": 3119.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 365196800.0, + "52": 365196800.0, + "53": 365196800.0, + "54": 365196800.0, + "55": 365196800.0, + "56": 365196800.0, + "57": 365196800.0, + "58": 365196800.0, + "59": 365196800.0, + "60": 365196800.0, + "61": 365196800.0, + "62": 365196800.0, + "63": 365196800.0, + "64": 365196800.0, + "65": 365196800.0, + "66": 365196800.0, + "67": 365196800.0, + "68": 365196800.0, + "69": 365196800.0, + "70": 365196800.0, + "71": 365196800.0, + "72": 365196800.0, + "73": 365196800.0, + "74": 365196800.0, + "75": 365196800.0, + "76": 365196800.0, + "77": 365196800.0, + "78": 365196800.0, + "79": 365196800.0, + "80": 365196800.0, + "81": 365196800.0, + "82": 
365196800.0, + "83": 365196800.0, + "84": 365196800.0, + "85": 365196800.0, + "86": 365196800.0, + "87": 365196800.0, + "88": 365196800.0, + "89": 365196800.0, + "90": 365196800.0, + "91": 365196800.0, + "92": 365196800.0, + "93": 365196800.0, + "94": 365196800.0, + "95": 365196800.0, + "96": 365196800.0, + "97": 365196800.0, + "98": 365196800.0, + "99": 365196800.0, + "100": 365196800.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 1132180992.0, + "52": 1132182016.0, + "53": 1132182016.0, + "54": 1132182016.0, + "55": 1132182016.0, + "56": 1132182016.0, + "57": 1132182016.0, + "58": 1132182016.0, + "59": 1132182016.0, + "60": 1132182016.0, + "61": 1132182016.0, + "62": 1132182016.0, + "63": 1132182016.0, + "64": 1132182016.0, + "65": 1132182016.0, + "66": 1132182016.0, + "67": 1132182016.0, + "68": 1132182016.0, + "69": 1132182016.0, + "70": 1132182016.0, + "71": 1132182016.0, + "72": 1132182016.0, + "73": 1132182016.0, + "74": 1132182016.0, + "75": 1132182016.0, + "76": 1132182016.0, + "77": 1132182016.0, + "78": 1132182016.0, + "79": 1132182016.0, + "80": 1132182016.0, + "81": 1132182016.0, + "82": 1132182016.0, + "83": 1132182016.0, + "84": 1132182016.0, + 
"85": 1132182016.0, + "86": 1132182016.0, + "87": 1132182016.0, + "88": 1132182016.0, + "89": 1132182016.0, + "90": 1132182016.0, + "91": 1132182016.0, + "92": 1132182016.0, + "93": 1132182016.0, + "94": 1132182016.0, + "95": 1132182016.0, + "96": 1132182016.0, + "97": 1132182016.0, + "98": 1132182016.0, + "99": 1132182016.0, + "100": 1132182016.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": "nan", + "52": 2.25993, + "53": 0.33819, + "54": 0.33044, + "55": 0.3308, + "56": 0.32801, + "57": 0.33341, + "58": 0.33637, + "59": 0.33863, + "60": 0.33792, + "61": 0.33956, + "62": 0.34024, + "63": 0.33835, + "64": 0.34288, + "65": 0.33962, + "66": 0.34436, + "67": 0.34564, + "68": 0.34571, + "69": 0.34698, + "70": 0.34462, + "71": 0.34382, + "72": 0.3445, + "73": 0.3446, + "74": 0.34215, + "75": 0.34655, + "76": 0.34397, + "77": 0.34288, + "78": 0.34323, + "79": 0.34095, + "80": 0.34228, + "81": 0.34072, + "82": 0.34333, + "83": 0.34118, + "84": 0.34195, + "85": 0.34307, + "86": 0.34341, + "87": 0.34409, + "88": 0.34372, + "89": 0.34284, + "90": 0.34363, + "91": 0.347, + "92": 0.34448, + "93": 0.3445, + "94": 0.34642, + "95": 0.34511, + "96": 0.34515, + "97": 0.34484, + "98": 
0.34543, + "99": 0.34503, + "100": 0.34832 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200.json index e52665efa28..e73d1df6682 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200.json @@ -432,106 +432,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 5.708, - "2": 0.35089, - "3": 0.30496, - "4": 0.29651, - "5": 0.29659, - "6": 0.29472, - "7": 0.29503, - "8": 0.29691, - "9": 0.29322, - "10": 0.29593, - "11": 0.29761, - "12": 0.30107, - "13": 0.29648, - "14": 0.29634, - "15": 0.29469, - "16": 0.29524, - "17": 0.29548, - "18": 0.29571, - "19": 0.29611, - "20": 0.29461, - "21": 0.60922, - "22": 0.29063, - "23": 0.29253, - "24": 0.29221, - "25": 0.35076, - "26": 0.35448, - "27": 0.40801, - "28": 0.32376, - "29": 0.37315, - "30": 0.36741, - "31": 0.30484, - "32": 0.31503, - "33": 0.33111, - "34": 0.33501, - "35": 0.34146, - "36": 0.33794, - "37": 0.3366, - "38": 0.34, - "39": 0.38047, - "40": 0.34724, - "41": 0.34541, - "42": 0.34988, - "43": 0.34614, - "44": 0.34763, - "45": 0.34809, - "46": 0.3476, - "47": 0.34789, - "48": 0.34502, - "49": 0.34682, - "50": 0.34684, - "51": 0.32661, - "52": 0.30335, - "53": 0.30141, - "54": 0.30091, - "55": 0.30835, - "56": 0.30212, - "57": 0.29749, - "58": 0.29597, - "59": 0.29872, - "60": 0.29657, - "61": 0.2928, - "62": 0.29426, - "63": 0.29212, - "64": 0.29342, - "65": 0.2952, - "66": 0.30066, - "67": 0.32851, - "68": 0.32899, - 
"69": 0.30542, - "70": 0.29401, - "71": 0.2933, - "72": 0.2929, - "73": 0.29695, - "74": 0.29676, - "75": 0.2973, - "76": 0.29472, - "77": 0.29643, - "78": 0.29471, - "79": 0.29414, - "80": 0.29496, - "81": 0.2934, - "82": 0.2937, - "83": 0.29466, - "84": 0.29244, - "85": 0.29464, - "86": 0.29497, - "87": 0.29568, - "88": 0.29595, - "89": 0.29485, - "90": 0.29357, - "91": 0.29468, - "92": 0.29513, - "93": 0.29741, - "94": 0.29444, - "95": 0.29584, - "96": 0.29461, - "97": 0.29375, - "98": 0.29414, - "99": 0.29269, - "100": 0.29041 + "1": "nan", + "2": 2.50092, + "3": 0.30393, + "4": 0.296, + "5": 0.29464, + "6": 0.29386, + "7": 0.29621, + "8": 0.2946, + "9": 0.29682, + "10": 0.29745, + "11": 0.3056, + "12": 0.30475, + "13": 0.30581, + "14": 0.3052, + "15": 0.31033, + "16": 0.30534, + "17": 0.30586, + "18": 0.3053, + "19": 0.30668, + "20": 0.3062, + "21": 0.31086, + "22": 0.30673, + "23": 0.30645, + "24": 0.30648, + "25": 0.30922, + "26": 0.30442, + "27": 0.30196, + "28": 0.3042, + "29": 0.30389, + "30": 0.30468, + "31": 0.30661, + "32": 0.30468, + "33": 0.30645, + "34": 0.30588, + "35": 0.3037, + "36": 0.30433, + "37": 0.30504, + "38": 0.30676, + "39": 0.30639, + "40": 0.30854, + "41": 0.31017, + "42": 0.30559, + "43": 0.30359, + "44": 0.30728, + "45": 0.30737, + "46": 0.30728, + "47": 0.30866, + "48": 0.30981, + "49": 0.3097, + "50": 0.30633, + "51": 0.31798, + "52": 0.30466, + "53": 0.30302, + "54": 0.30516, + "55": 0.30263, + "56": 0.30315, + "57": 0.30305, + "58": 0.30451, + "59": 0.30443, + "60": 0.30525, + "61": 0.30503, + "62": 0.3063, + "63": 0.30517, + "64": 0.30552, + "65": 0.30685, + "66": 0.30584, + "67": 0.31593, + "68": 0.34589, + "69": 0.30682, + "70": 0.30582, + "71": 0.30682, + "72": 0.30578, + "73": 0.30496, + "74": 0.30689, + "75": 0.30927, + "76": 0.31024, + "77": 0.3125, + "78": 0.31093, + "79": 0.31106, + "80": 0.30717, + "81": 0.30815, + "82": 0.30914, + "83": 0.30911, + "84": 0.30335, + "85": 0.29792, + "86": 0.2997, + "87": 0.3032, + "88": 
0.30139, + "89": 0.30675, + "90": 0.30412, + "91": 0.30454, + "92": 0.30497, + "93": 0.30233, + "94": 0.30714, + "95": 0.30673, + "96": 0.30193, + "97": 0.30472, + "98": 0.3103, + "99": 0.30957, + "100": 0.30828 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200_2nd.json new file mode 100644 index 00000000000..235ce034813 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200_2nd.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 9.86104, + "52": 9.75109, + "53": 10.06631, + "54": 9.95634, + "55": 9.89354, + "56": 9.637, + "57": 9.49142, + "58": 9.8341, + "59": 9.5931, + "60": 9.51379, + "61": 9.69183, + "62": 9.99162, + "63": 9.39196, + "64": 9.77455, + "65": 8.96319, + "66": 9.70663, + "67": 9.3789, + "68": 9.78328, + "69": 9.79736, + "70": 
9.73753, + "71": 9.62711, + "72": 9.58907, + "73": 9.50446, + "74": 8.94975, + "75": 9.4278, + "76": 9.08764, + "77": 10.06759, + "78": 9.72141, + "79": 9.3861, + "80": 9.40495, + "81": 9.48596, + "82": 9.70195, + "83": 9.31553, + "84": 9.41806, + "85": 9.61378, + "86": 9.08145, + "87": 9.59631, + "88": 9.75008, + "89": 9.60386, + "90": 9.82838, + "91": 9.33622, + "92": 9.35764, + "93": 9.08795, + "94": 8.83437, + "95": 9.53352, + "96": 9.53315, + "97": 9.31129, + "98": 9.67176, + "99": 8.89816, + "100": 9.40969 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 2431.0, + "52": 2467.0, + "53": 2794.0, + "54": 2675.0, + "55": 2313.0, + "56": 2597.0, + "57": 2278.0, + "58": 2887.0, + "59": 2701.0, + "60": 2190.0, + "61": 2764.0, + "62": 2576.0, + "63": 2405.0, + "64": 2903.0, + "65": 2516.0, + "66": 2885.0, + "67": 2700.0, + "68": 2682.0, + "69": 2987.0, + "70": 3141.0, + "71": 3055.0, + "72": 2413.0, + "73": 2864.0, + "74": 1870.0, + "75": 2450.0, + "76": 3032.0, + "77": 3230.0, + "78": 3125.0, + "79": 2982.0, + "80": 3203.0, + "81": 3657.0, + "82": 3174.0, + "83": 2818.0, + "84": 3190.0, + "85": 3166.0, + "86": 2793.0, + "87": 3635.0, + "88": 3005.0, + "89": 3373.0, + "90": 3066.0, 
+ "91": 2857.0, + "92": 3080.0, + "93": 2533.0, + "94": 3303.0, + "95": 3270.0, + "96": 3416.0, + "97": 3085.0, + "98": 3437.0, + "99": 3243.0, + "100": 3119.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 299204096.0, + "52": 299204096.0, + "53": 299204096.0, + "54": 299204096.0, + "55": 299204096.0, + "56": 299204096.0, + "57": 299204096.0, + "58": 299204096.0, + "59": 299204096.0, + "60": 299204096.0, + "61": 299204096.0, + "62": 299204096.0, + "63": 299204096.0, + "64": 299204096.0, + "65": 299204096.0, + "66": 299204096.0, + "67": 299204096.0, + "68": 299204096.0, + "69": 299204096.0, + "70": 299204096.0, + "71": 299204096.0, + "72": 299204096.0, + "73": 299204096.0, + "74": 299204096.0, + "75": 299204096.0, + "76": 299204096.0, + "77": 299204096.0, + "78": 299204096.0, + "79": 299204096.0, + "80": 299204096.0, + "81": 299204096.0, + "82": 299204096.0, + "83": 299204096.0, + "84": 299204096.0, + "85": 299204096.0, + "86": 299204096.0, + "87": 299204096.0, + "88": 299204096.0, + "89": 299204096.0, + "90": 299204096.0, + "91": 299204096.0, + "92": 299204096.0, + "93": 299204096.0, + "94": 299204096.0, + "95": 299204096.0, + "96": 299204096.0, + "97": 299204096.0, + 
"98": 299204096.0, + "99": 299204096.0, + "100": 299204096.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 1064091136.0, + "52": 1064092160.0, + "53": 1064092160.0, + "54": 1064092160.0, + "55": 1064092160.0, + "56": 1064092160.0, + "57": 1064092160.0, + "58": 1064092160.0, + "59": 1064092160.0, + "60": 1064092160.0, + "61": 1064092160.0, + "62": 1064092160.0, + "63": 1064092160.0, + "64": 1064092160.0, + "65": 1064092160.0, + "66": 1064092160.0, + "67": 1064092160.0, + "68": 1064092160.0, + "69": 1064092160.0, + "70": 1064092160.0, + "71": 1064092160.0, + "72": 1064092160.0, + "73": 1064092160.0, + "74": 1064092160.0, + "75": 1064092160.0, + "76": 1064092160.0, + "77": 1064092160.0, + "78": 1064092160.0, + "79": 1064092160.0, + "80": 1064092160.0, + "81": 1064092160.0, + "82": 1064092160.0, + "83": 1064092160.0, + "84": 1064092160.0, + "85": 1064092160.0, + "86": 1064092160.0, + "87": 1064092160.0, + "88": 1064092160.0, + "89": 1064092160.0, + "90": 1064092160.0, + "91": 1064092160.0, + "92": 1064092160.0, + "93": 1064092160.0, + "94": 1064092160.0, + "95": 1064092160.0, + "96": 1064092160.0, + "97": 1064092160.0, + "98": 1064092160.0, + "99": 1064092160.0, + 
"100": 1064092160.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": "nan", + "52": 2.14914, + "53": 0.55064, + "54": 0.29762, + "55": 0.30112, + "56": 0.30035, + "57": 0.29809, + "58": 0.30057, + "59": 0.29654, + "60": 0.29579, + "61": 0.2954, + "62": 0.29808, + "63": 0.2968, + "64": 0.29639, + "65": 0.29803, + "66": 0.29816, + "67": 0.2978, + "68": 0.29666, + "69": 0.29937, + "70": 0.29987, + "71": 0.29726, + "72": 0.29888, + "73": 0.29841, + "74": 0.29818, + "75": 0.29888, + "76": 0.30018, + "77": 0.29543, + "78": 0.29515, + "79": 0.29942, + "80": 0.30103, + "81": 0.30071, + "82": 0.30152, + "83": 0.30277, + "84": 0.30368, + "85": 0.30349, + "86": 0.30411, + "87": 0.30141, + "88": 0.30339, + "89": 0.3072, + "90": 0.30468, + "91": 0.30297, + "92": 0.30317, + "93": 0.30255, + "94": 0.29992, + "95": 0.30116, + "96": 0.29306, + "97": 0.29403, + "98": 0.29399, + "99": 0.29473, + "100": 0.2958 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_dev_dgx_gb200.json 
b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_dev_dgx_gb200.json index 4200e3b38a8..65fc98f8dd4 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_dev_dgx_gb200.json @@ -432,106 +432,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 5.65464, - "2": 0.60021, - "3": 0.56211, - "4": 0.81567, - "5": 0.51087, - "6": 0.51362, - "7": 0.50868, - "8": 0.51119, - "9": 0.51537, - "10": 0.51491, - "11": 0.51179, - "12": 0.51216, - "13": 0.51208, - "14": 0.52419, - "15": 0.85827, - "16": 0.51731, - "17": 0.51718, - "18": 0.51546, - "19": 0.51334, - "20": 0.5203, - "21": 0.51793, - "22": 0.52901, - "23": 0.51605, - "24": 0.51462, - "25": 0.51195, - "26": 0.50837, - "27": 0.85741, - "28": 0.5083, - "29": 0.50928, - "30": 0.50919, - "31": 0.51059, - "32": 0.5129, - "33": 0.51253, - "34": 0.51142, - "35": 0.50986, - "36": 0.51279, - "37": 0.50996, - "38": 0.50872, - "39": 0.51314, - "40": 0.53857, - "41": 0.87144, - "42": 0.53733, - "43": 0.82532, - "44": 0.50255, - "45": 0.50942, - "46": 0.73489, - "47": 0.82645, - "48": 0.50964, - "49": 0.5094, - "50": 0.51015, - "51": 0.51394, - "52": 0.50874, - "53": 0.51284, - "54": 0.52083, - "55": 0.50789, - "56": 0.49975, - "57": 0.49792, - "58": 0.51444, - "59": 0.51001, - "60": 0.50768, - "61": 0.51346, - "62": 0.51695, - "63": 0.51586, - "64": 0.51965, - "65": 0.52295, - "66": 0.51606, - "67": 0.50646, - "68": 0.51105, - "69": 0.50496, - "70": 0.50887, - "71": 0.51043, - "72": 0.51293, - "73": 0.52108, - "74": 0.51224, - "75": 0.51005, - "76": 0.51268, - "77": 0.51097, - "78": 0.50687, - "79": 0.50729, - "80": 0.5142, - "81": 0.54269, - "82": 0.5267, - "83": 0.51288, - "84": 0.5147, - "85": 0.52025, - "86": 0.52158, - "87": 0.51316, - "88": 
0.5178, - "89": 0.55243, - "90": 0.51232, - "91": 0.51784, - "92": 0.5159, - "93": 0.51384, - "94": 0.51504, - "95": 0.51606, - "96": 0.5173, - "97": 0.51802, - "98": 0.51331, - "99": 0.51466, - "100": 0.51281 + "1": "nan", + "2": 2.73217, + "3": 0.44849, + "4": 0.46632, + "5": 0.47877, + "6": 0.48831, + "7": 0.48769, + "8": 0.50745, + "9": 0.48778, + "10": 0.49192, + "11": 0.49758, + "12": 0.5014, + "13": 0.49698, + "14": 0.49958, + "15": 0.49877, + "16": 0.50112, + "17": 0.49678, + "18": 0.49696, + "19": 0.49583, + "20": 0.49823, + "21": 0.50092, + "22": 0.50313, + "23": 0.50157, + "24": 0.50564, + "25": 0.50173, + "26": 0.50691, + "27": 0.5209, + "28": 0.51519, + "29": 0.50283, + "30": 0.50601, + "31": 0.50139, + "32": 0.507, + "33": 0.50335, + "34": 0.50467, + "35": 0.50168, + "36": 0.49771, + "37": 0.49868, + "38": 0.49794, + "39": 0.49729, + "40": 0.4917, + "41": 0.49294, + "42": 0.48867, + "43": 0.49291, + "44": 0.49762, + "45": 0.49672, + "46": 0.50694, + "47": 0.49816, + "48": 0.4942, + "49": 0.5031, + "50": 0.50121, + "51": 0.48839, + "52": 0.49123, + "53": 0.83615, + "54": 0.49979, + "55": 0.50032, + "56": 0.5025, + "57": 0.50465, + "58": 0.5032, + "59": 0.52509, + "60": 0.51125, + "61": 0.50912, + "62": 0.50722, + "63": 0.51052, + "64": 0.50743, + "65": 0.51588, + "66": 0.51203, + "67": 0.51526, + "68": 0.50806, + "69": 0.51012, + "70": 0.51073, + "71": 0.50805, + "72": 0.51001, + "73": 0.52219, + "74": 0.50785, + "75": 0.50971, + "76": 0.50837, + "77": 0.51328, + "78": 0.51109, + "79": 0.50795, + "80": 0.86855, + "81": 0.51135, + "82": 0.50858, + "83": 0.51273, + "84": 0.50989, + "85": 0.51087, + "86": 0.51808, + "87": 0.5247, + "88": 0.51417, + "89": 0.5201, + "90": 0.90988, + "91": 0.54215, + "92": 0.52369, + "93": 0.51835, + "94": 0.52068, + "95": 0.5186, + "96": 0.52052, + "97": 0.51882, + "98": 0.52061, + "99": 0.51758, + "100": 0.51114 } } } \ No newline at end of file diff --git 
a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_dev_dgx_gb200_2nd.json new file mode 100644 index 00000000000..074f4cc53b8 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_dev_dgx_gb200_2nd.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 9.86714, + "52": 9.75686, + "53": 10.06973, + "54": 9.95909, + "55": 9.89872, + "56": 9.63952, + "57": 9.4936, + "58": 9.83608, + "59": 9.59679, + "60": 9.51626, + "61": 9.69468, + "62": 9.99033, + "63": 9.39041, + "64": 9.77374, + "65": 8.96559, + "66": 9.70319, + "67": 9.38057, + "68": 9.78256, + "69": 9.79804, + "70": 9.73697, + "71": 9.62634, + "72": 9.582, + "73": 9.50018, + "74": 8.93897, + "75": 9.42247, + "76": 9.08151, + "77": 10.06555, + "78": 9.71951, + "79": 9.38365, + "80": 9.4005, + "81": 9.48215, + "82": 9.69917, + "83": 9.30951, + "84": 9.41595, + "85": 9.61112, + "86": 9.07822, + "87": 9.59519, + "88": 9.74646, + "89": 
9.60078, + "90": 9.82618, + "91": 9.32913, + "92": 9.35518, + "93": 9.08231, + "94": 8.83, + "95": 9.53112, + "96": 9.52889, + "97": 9.30954, + "98": 9.66956, + "99": 8.89675, + "100": 9.4083 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 2492.0, + "52": 2583.0, + "53": 2788.0, + "54": 2678.0, + "55": 2301.0, + "56": 2724.0, + "57": 2272.0, + "58": 2999.0, + "59": 2686.0, + "60": 2330.0, + "61": 2852.0, + "62": 2703.0, + "63": 2277.0, + "64": 2990.0, + "65": 2475.0, + "66": 2892.0, + "67": 2646.0, + "68": 2650.0, + "69": 2845.0, + "70": 3145.0, + "71": 2913.0, + "72": 2573.0, + "73": 2850.0, + "74": 1865.0, + "75": 2466.0, + "76": 3055.0, + "77": 3185.0, + "78": 3106.0, + "79": 3053.0, + "80": 3184.0, + "81": 3447.0, + "82": 3296.0, + "83": 2726.0, + "84": 3276.0, + "85": 3336.0, + "86": 2803.0, + "87": 3643.0, + "88": 3013.0, + "89": 3185.0, + "90": 3126.0, + "91": 3076.0, + "92": 3139.0, + "93": 2665.0, + "94": 3302.0, + "95": 3282.0, + "96": 3404.0, + "97": 3215.0, + "98": 3465.0, + "99": 3128.0, + "100": 3231.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", 
+ "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 397748736.0, + "52": 397748736.0, + "53": 397748736.0, + "54": 397748736.0, + "55": 397748736.0, + "56": 397748736.0, + "57": 397748736.0, + "58": 397748736.0, + "59": 397748736.0, + "60": 397748736.0, + "61": 397748736.0, + "62": 397748736.0, + "63": 397748736.0, + "64": 397748736.0, + "65": 397748736.0, + "66": 397748736.0, + "67": 397748736.0, + "68": 397748736.0, + "69": 397748736.0, + "70": 397748736.0, + "71": 397748736.0, + "72": 397748736.0, + "73": 397748736.0, + "74": 397748736.0, + "75": 397748736.0, + "76": 397748736.0, + "77": 397748736.0, + "78": 397748736.0, + "79": 397748736.0, + "80": 397748736.0, + "81": 397748736.0, + "82": 397748736.0, + "83": 397748736.0, + "84": 397748736.0, + "85": 397748736.0, + "86": 397748736.0, + "87": 397748736.0, + "88": 397748736.0, + "89": 397748736.0, + "90": 397748736.0, + "91": 397748736.0, + "92": 397748736.0, + "93": 397748736.0, + "94": 397748736.0, + "95": 397748736.0, + "96": 397748736.0, + "97": 397748736.0, + "98": 397748736.0, + "99": 397748736.0, + "100": 397748736.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": 
"nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 1191471616.0, + "52": 1191472640.0, + "53": 1191472640.0, + "54": 1191472640.0, + "55": 1191472640.0, + "56": 1191472640.0, + "57": 1191472640.0, + "58": 1191472640.0, + "59": 1191472640.0, + "60": 1191472640.0, + "61": 1191472640.0, + "62": 1191472640.0, + "63": 1191472640.0, + "64": 1191472640.0, + "65": 1191472640.0, + "66": 1191472640.0, + "67": 1191472640.0, + "68": 1191472640.0, + "69": 1191472640.0, + "70": 1191472640.0, + "71": 1191472640.0, + "72": 1191472640.0, + "73": 1191472640.0, + "74": 1191472640.0, + "75": 1191472640.0, + "76": 1191472640.0, + "77": 1191472640.0, + "78": 1191472640.0, + "79": 1191472640.0, + "80": 1191472640.0, + "81": 1191472640.0, + "82": 1191472640.0, + "83": 1191472640.0, + "84": 1191472640.0, + "85": 1191472640.0, + "86": 1191472640.0, + "87": 1191472640.0, + "88": 1191472640.0, + "89": 1191472640.0, + "90": 1191472640.0, + "91": 1191472640.0, + "92": 1191472640.0, + "93": 1191472640.0, + "94": 1191472640.0, + "95": 1191472640.0, + "96": 1191472640.0, + "97": 1191472640.0, + "98": 1191472640.0, + "99": 1191472640.0, + "100": 1191472640.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + 
"15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": "nan", + "52": 2.49404, + "53": 0.49738, + "54": 0.47517, + "55": 0.48073, + "56": 0.48124, + "57": 0.48531, + "58": 0.46905, + "59": 0.45198, + "60": 0.44375, + "61": 0.45318, + "62": 0.43328, + "63": 0.44116, + "64": 0.43199, + "65": 0.43219, + "66": 0.43466, + "67": 0.43576, + "68": 0.43222, + "69": 0.42871, + "70": 0.42889, + "71": 0.43506, + "72": 0.43141, + "73": 0.42482, + "74": 0.4278, + "75": 0.42933, + "76": 0.42676, + "77": 0.43206, + "78": 0.43106, + "79": 0.43328, + "80": 0.429, + "81": 0.4294, + "82": 0.43619, + "83": 0.42881, + "84": 0.44023, + "85": 0.43778, + "86": 0.4293, + "87": 0.42266, + "88": 0.43088, + "89": 0.4333, + "90": 0.42756, + "91": 0.42474, + "92": 0.43075, + "93": 0.43032, + "94": 0.42748, + "95": 0.43116, + "96": 0.43174, + "97": 0.42434, + "98": 0.42337, + "99": 0.42353, + "100": 0.4287 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgx_gb200.json index 5b22c8f244c..771860e086a 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgx_gb200.json @@ 
-7,103 +7,103 @@ "1": 10.90251, "2": 10.9138, "3": 10.90169, - "4": 10.90724, - "5": 10.9045, - "6": 10.91656, - "7": 10.91268, + "4": 10.90727, + "5": 10.90448, + "6": 10.91653, + "7": 10.9127, "8": 10.89505, - "9": 10.91555, - "10": 10.87277, - "11": 10.90376, - "12": 10.90404, - "13": 10.91831, - "14": 10.90742, - "15": 10.87551, - "16": 10.85477, + "9": 10.91558, + "10": 10.87283, + "11": 10.90373, + "12": 10.904, + "13": 10.91828, + "14": 10.9074, + "15": 10.87552, + "16": 10.85479, "17": 10.83186, - "18": 10.84054, - "19": 10.84221, - "20": 10.75039, - "21": 10.73638, - "22": 10.62979, - "23": 10.74023, - "24": 10.64438, - "25": 10.60242, - "26": 10.64922, - "27": 10.64074, - "28": 10.58757, - "29": 10.59165, - "30": 10.38969, - "31": 10.18185, - "32": 10.49227, - "33": 10.48772, - "34": 10.26316, - "35": 10.2923, - "36": 10.25547, - "37": 10.37371, - "38": 10.2355, - "39": 10.42347, - "40": 10.10947, - "41": 10.17531, - "42": 10.2316, - "43": 9.87326, - "44": 9.9918, - "45": 9.86649, - "46": 9.84547, - "47": 10.17367, - "48": 9.87146, - "49": 9.55757, - "50": 9.92547, + "18": 10.84055, + "19": 10.84215, + "20": 10.75044, + "21": 10.73632, + "22": 10.62985, + "23": 10.74027, + "24": 10.64442, + "25": 10.60239, + "26": 10.64921, + "27": 10.64076, + "28": 10.5875, + "29": 10.59166, + "30": 10.38964, + "31": 10.18179, + "32": 10.49231, + "33": 10.48774, + "34": 10.26318, + "35": 10.29231, + "36": 10.25552, + "37": 10.37375, + "38": 10.23555, + "39": 10.42349, + "40": 10.10951, + "41": 10.1753, + "42": 10.23159, + "43": 9.87321, + "44": 9.99176, + "45": 9.8665, + "46": 9.84548, + "47": 10.17372, + "48": 9.87147, + "49": 9.55758, + "50": 9.92554, "51": 9.87398, "52": 9.76585, - "53": 10.08271, - "54": 9.97273, - "55": 9.90735, - "56": 9.64216, + "53": 10.0827, + "54": 9.97272, + "55": 9.90734, + "56": 9.64218, "57": 9.48857, - "58": 9.84273, + "58": 9.8427, "59": 9.60111, - "60": 9.52016, - "61": 9.70058, - "62": 9.99644, - "63": 9.39064, - "64": 9.77614, - "65": 
8.96633, - "66": 9.70947, - "67": 9.3877, - "68": 9.78895, + "60": 9.52012, + "61": 9.70057, + "62": 9.99645, + "63": 9.39065, + "64": 9.77613, + "65": 8.96632, + "66": 9.70945, + "67": 9.38769, + "68": 9.78891, "69": 9.80803, "70": 9.74237, - "71": 9.63382, + "71": 9.63381, "72": 9.59118, - "73": 9.50694, - "74": 8.94248, - "75": 9.42903, - "76": 9.08836, - "77": 10.07155, - "78": 9.72684, - "79": 9.38725, - "80": 9.40572, - "81": 9.48703, + "73": 9.50696, + "74": 8.94245, + "75": 9.42902, + "76": 9.0883, + "77": 10.07151, + "78": 9.72685, + "79": 9.38721, + "80": 9.4057, + "81": 9.48702, "82": 9.70482, - "83": 9.31557, - "84": 9.42113, - "85": 9.61467, + "83": 9.31556, + "84": 9.42107, + "85": 9.61466, "86": 9.08461, - "87": 9.59903, - "88": 9.75369, - "89": 9.60597, - "90": 9.83153, - "91": 9.33877, - "92": 9.36033, - "93": 9.0904, - "94": 8.83712, + "87": 9.59902, + "88": 9.75368, + "89": 9.60598, + "90": 9.83154, + "91": 9.33878, + "92": 9.36034, + "93": 9.09038, + "94": 8.83714, "95": 9.53804, - "96": 9.53391, - "97": 9.31319, + "96": 9.5339, + "97": 9.31316, "98": 9.67422, - "99": 8.90345, - "100": 9.41498 + "99": 8.90347, + "100": 9.41497 } }, "num-zeros": { @@ -111,106 +111,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 1541.0, - "2": 1645.0, - "3": 1629.0, - "4": 1795.0, - "5": 1841.0, - "6": 1779.0, - "7": 1764.0, - "8": 1563.0, - "9": 1825.0, - "10": 1409.0, - "11": 1956.0, - "12": 1760.0, - "13": 1869.0, - "14": 1833.0, - "15": 1958.0, - "16": 1850.0, - "17": 1795.0, - "18": 1781.0, - "19": 1766.0, - "20": 1632.0, - "21": 1866.0, - "22": 1649.0, - "23": 1996.0, - "24": 1722.0, - "25": 1609.0, - "26": 1678.0, - "27": 1752.0, - "28": 1903.0, - "29": 1870.0, - "30": 1851.0, - "31": 1483.0, - "32": 1836.0, - "33": 2084.0, - "34": 1799.0, - "35": 1867.0, - "36": 1846.0, - "37": 2303.0, - "38": 2171.0, - "39": 2173.0, - "40": 2153.0, - "41": 2230.0, - "42": 2211.0, - "43": 1945.0, - "44": 2003.0, - "45": 2027.0, - "46": 2196.0, - "47": 
2540.0, - "48": 2320.0, - "49": 2289.0, - "50": 2249.0, - "51": 2418.0, - "52": 2459.0, - "53": 2723.0, - "54": 2710.0, - "55": 2167.0, - "56": 2529.0, - "57": 2306.0, - "58": 2690.0, - "59": 2678.0, - "60": 2241.0, - "61": 2828.0, - "62": 2453.0, - "63": 2330.0, - "64": 2785.0, - "65": 2596.0, - "66": 2886.0, - "67": 2589.0, - "68": 2725.0, - "69": 2800.0, - "70": 3022.0, - "71": 2941.0, - "72": 2388.0, - "73": 2770.0, - "74": 1833.0, - "75": 2434.0, - "76": 2843.0, - "77": 3177.0, - "78": 3085.0, - "79": 3080.0, - "80": 3263.0, - "81": 3512.0, - "82": 3152.0, - "83": 2845.0, - "84": 3114.0, - "85": 3196.0, - "86": 2728.0, - "87": 3511.0, - "88": 2941.0, - "89": 3343.0, - "90": 3094.0, - "91": 2946.0, - "92": 3094.0, - "93": 2707.0, - "94": 3304.0, - "95": 3279.0, - "96": 3562.0, - "97": 2956.0, - "98": 3547.0, - "99": 3119.0, - "100": 3119.0 + "1": 1558.0, + "2": 1700.0, + "3": 1576.0, + "4": 1748.0, + "5": 1809.0, + "6": 1822.0, + "7": 1801.0, + "8": 1568.0, + "9": 1788.0, + "10": 1395.0, + "11": 1909.0, + "12": 1795.0, + "13": 1860.0, + "14": 1765.0, + "15": 1878.0, + "16": 1938.0, + "17": 1760.0, + "18": 1708.0, + "19": 1744.0, + "20": 1706.0, + "21": 1825.0, + "22": 1656.0, + "23": 1933.0, + "24": 1615.0, + "25": 1607.0, + "26": 1586.0, + "27": 1799.0, + "28": 1802.0, + "29": 1886.0, + "30": 1885.0, + "31": 1529.0, + "32": 1823.0, + "33": 1998.0, + "34": 1760.0, + "35": 1891.0, + "36": 1999.0, + "37": 2340.0, + "38": 2149.0, + "39": 2308.0, + "40": 2231.0, + "41": 2153.0, + "42": 2285.0, + "43": 1916.0, + "44": 2014.0, + "45": 1914.0, + "46": 2192.0, + "47": 2491.0, + "48": 2179.0, + "49": 2231.0, + "50": 2285.0, + "51": 2371.0, + "52": 2512.0, + "53": 2624.0, + "54": 2501.0, + "55": 2218.0, + "56": 2649.0, + "57": 2213.0, + "58": 2763.0, + "59": 2526.0, + "60": 2261.0, + "61": 2835.0, + "62": 2497.0, + "63": 2406.0, + "64": 2736.0, + "65": 2546.0, + "66": 2800.0, + "67": 2572.0, + "68": 2686.0, + "69": 2768.0, + "70": 2992.0, + "71": 2834.0, + "72": 2391.0, 
+ "73": 2816.0, + "74": 1859.0, + "75": 2369.0, + "76": 2849.0, + "77": 3224.0, + "78": 3000.0, + "79": 3139.0, + "80": 3215.0, + "81": 3443.0, + "82": 3149.0, + "83": 2715.0, + "84": 3170.0, + "85": 3313.0, + "86": 2748.0, + "87": 3534.0, + "88": 3004.0, + "89": 3336.0, + "90": 3117.0, + "91": 2912.0, + "92": 3082.0, + "93": 2671.0, + "94": 3380.0, + "95": 3185.0, + "96": 3513.0, + "97": 3137.0, + "98": 3523.0, + "99": 3099.0, + "100": 3148.0 } }, "mem-allocated-bytes": { @@ -325,106 +325,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 700958208.0, - "2": 790286848.0, - "3": 790286848.0, - "4": 790286848.0, - "5": 790286848.0, - "6": 790286848.0, - "7": 790286848.0, - "8": 790286848.0, - "9": 790286848.0, - "10": 790286848.0, - "11": 790286848.0, - "12": 790286848.0, - "13": 790286848.0, - "14": 790286848.0, - "15": 790286848.0, - "16": 790286848.0, - "17": 790286848.0, - "18": 790286848.0, - "19": 790286848.0, - "20": 790286848.0, - "21": 790286848.0, - "22": 790286848.0, - "23": 790286848.0, - "24": 790286848.0, - "25": 790286848.0, - "26": 790286848.0, - "27": 790286848.0, - "28": 790286848.0, - "29": 790286848.0, - "30": 790286848.0, - "31": 790286848.0, - "32": 790286848.0, - "33": 790286848.0, - "34": 790286848.0, - "35": 790286848.0, - "36": 790286848.0, - "37": 790286848.0, - "38": 790286848.0, - "39": 790286848.0, - "40": 790286848.0, - "41": 790286848.0, - "42": 790286848.0, - "43": 790286848.0, - "44": 790286848.0, - "45": 790286848.0, - "46": 790286848.0, - "47": 790286848.0, - "48": 790286848.0, - "49": 790286848.0, - "50": 790286848.0, - "51": 790286848.0, - "52": 790286848.0, - "53": 790286848.0, - "54": 790286848.0, - "55": 790286848.0, - "56": 790286848.0, - "57": 790286848.0, - "58": 790286848.0, - "59": 790286848.0, - "60": 790286848.0, - "61": 790286848.0, - "62": 790286848.0, - "63": 790286848.0, - "64": 790286848.0, - "65": 790286848.0, - "66": 790286848.0, - "67": 790286848.0, - "68": 790286848.0, - "69": 790286848.0, - "70": 
790286848.0, - "71": 790286848.0, - "72": 790286848.0, - "73": 790286848.0, - "74": 790286848.0, - "75": 790286848.0, - "76": 790286848.0, - "77": 790286848.0, - "78": 790286848.0, - "79": 790286848.0, - "80": 790286848.0, - "81": 790286848.0, - "82": 790286848.0, - "83": 790286848.0, - "84": 790286848.0, - "85": 790286848.0, - "86": 790286848.0, - "87": 790286848.0, - "88": 790286848.0, - "89": 790286848.0, - "90": 790286848.0, - "91": 790286848.0, - "92": 790286848.0, - "93": 790286848.0, - "94": 790286848.0, - "95": 790286848.0, - "96": 790286848.0, - "97": 790286848.0, - "98": 790286848.0, - "99": 790286848.0, - "100": 790286848.0 + "1": 704366080.0, + "2": 794219008.0, + "3": 794219008.0, + "4": 794219008.0, + "5": 794219008.0, + "6": 794219008.0, + "7": 794219008.0, + "8": 794219008.0, + "9": 794219008.0, + "10": 794219008.0, + "11": 794219008.0, + "12": 794219008.0, + "13": 794219008.0, + "14": 794219008.0, + "15": 794219008.0, + "16": 794219008.0, + "17": 794219008.0, + "18": 794219008.0, + "19": 794219008.0, + "20": 794219008.0, + "21": 794219008.0, + "22": 794219008.0, + "23": 794219008.0, + "24": 794219008.0, + "25": 794219008.0, + "26": 794219008.0, + "27": 794219008.0, + "28": 794219008.0, + "29": 794219008.0, + "30": 794219008.0, + "31": 794219008.0, + "32": 794219008.0, + "33": 794219008.0, + "34": 794219008.0, + "35": 794219008.0, + "36": 794219008.0, + "37": 794219008.0, + "38": 794219008.0, + "39": 794219008.0, + "40": 794219008.0, + "41": 794219008.0, + "42": 794219008.0, + "43": 794219008.0, + "44": 794219008.0, + "45": 794219008.0, + "46": 794219008.0, + "47": 794219008.0, + "48": 794219008.0, + "49": 794219008.0, + "50": 794219008.0, + "51": 794219008.0, + "52": 794219008.0, + "53": 794219008.0, + "54": 794219008.0, + "55": 794219008.0, + "56": 794219008.0, + "57": 794219008.0, + "58": 794219008.0, + "59": 794219008.0, + "60": 794219008.0, + "61": 794219008.0, + "62": 794219008.0, + "63": 794219008.0, + "64": 794219008.0, + "65": 794219008.0, 
+ "66": 794219008.0, + "67": 794219008.0, + "68": 794219008.0, + "69": 794219008.0, + "70": 794219008.0, + "71": 794219008.0, + "72": 794219008.0, + "73": 794219008.0, + "74": 794219008.0, + "75": 794219008.0, + "76": 794219008.0, + "77": 794219008.0, + "78": 794219008.0, + "79": 794219008.0, + "80": 794219008.0, + "81": 794219008.0, + "82": 794219008.0, + "83": 794219008.0, + "84": 794219008.0, + "85": 794219008.0, + "86": 794219008.0, + "87": 794219008.0, + "88": 794219008.0, + "89": 794219008.0, + "90": 794219008.0, + "91": 794219008.0, + "92": 794219008.0, + "93": 794219008.0, + "94": 794219008.0, + "95": 794219008.0, + "96": 794219008.0, + "97": 794219008.0, + "98": 794219008.0, + "99": 794219008.0, + "100": 794219008.0 } }, "iteration-time": { @@ -432,106 +432,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 11.14668, - "2": 0.32935, - "3": 0.46923, - "4": 0.29529, - "5": 0.40702, - "6": 0.40156, - "7": 0.43578, - "8": 0.31407, - "9": 0.43033, - "10": 0.31604, - "11": 0.31738, - "12": 0.31563, - "13": 0.36997, - "14": 0.4371, - "15": 0.35906, - "16": 0.31709, - "17": 0.39045, - "18": 0.31331, - "19": 0.3763, - "20": 0.33238, - "21": 0.31767, - "22": 0.43702, - "23": 0.39383, - "24": 0.3148, - "25": 0.31554, - "26": 0.3135, - "27": 0.34957, - "28": 0.31621, - "29": 0.31661, - "30": 0.31507, - "31": 0.41199, - "32": 0.40737, - "33": 0.31355, - "34": 0.31358, - "35": 0.31287, - "36": 0.31491, - "37": 0.36356, - "38": 0.37435, - "39": 0.33637, - "40": 0.31406, - "41": 0.31613, - "42": 0.35153, - "43": 0.3142, - "44": 0.31623, - "45": 0.31572, - "46": 0.34532, - "47": 0.35769, - "48": 0.36855, - "49": 0.31459, - "50": 0.3144, - "51": 0.32345, - "52": 0.30594, - "53": 0.3111, - "54": 0.31377, - "55": 0.39254, - "56": 0.40899, - "57": 0.48809, - "58": 0.31709, - "59": 0.31541, - "60": 0.3139, - "61": 0.42195, - "62": 0.31636, - "63": 0.31499, - "64": 0.31608, - "65": 0.31718, - "66": 0.31606, - "67": 0.348, - "68": 0.39663, - "69": 0.31776, - "70": 
0.31679, - "71": 0.31563, - "72": 0.3148, - "73": 0.31785, - "74": 0.36067, - "75": 0.31679, - "76": 0.31667, - "77": 0.40594, - "78": 0.31863, - "79": 0.31973, - "80": 0.31848, - "81": 0.31801, - "82": 0.31661, - "83": 0.3166, - "84": 0.49879, - "85": 0.31644, - "86": 0.31582, - "87": 0.31672, - "88": 0.31561, - "89": 0.3413, - "90": 0.3984, - "91": 0.31512, - "92": 0.39228, - "93": 0.31251, - "94": 0.311, - "95": 0.31228, - "96": 0.31391, - "97": 0.31003, - "98": 0.31573, - "99": 0.3154, - "100": 0.40105 + "1": "nan", + "2": 3.24816, + "3": 0.43688, + "4": 0.27843, + "5": 0.27555, + "6": 0.27709, + "7": 0.277, + "8": 0.27722, + "9": 0.27493, + "10": 0.277, + "11": 0.27605, + "12": 0.27617, + "13": 0.27539, + "14": 0.27709, + "15": 0.27461, + "16": 0.27313, + "17": 0.27396, + "18": 0.27419, + "19": 0.2729, + "20": 0.27386, + "21": 0.27451, + "22": 0.27496, + "23": 0.27291, + "24": 0.27491, + "25": 0.2752, + "26": 0.27531, + "27": 0.27661, + "28": 0.27544, + "29": 0.27432, + "30": 0.27338, + "31": 0.2738, + "32": 0.27312, + "33": 0.2732, + "34": 0.27439, + "35": 0.2727, + "36": 0.27455, + "37": 0.27368, + "38": 0.27316, + "39": 0.29032, + "40": 0.27694, + "41": 0.27622, + "42": 0.28477, + "43": 0.27626, + "44": 0.27624, + "45": 0.27486, + "46": 0.27565, + "47": 0.2747, + "48": 0.2742, + "49": 0.2754, + "50": 0.27741, + "51": 0.30004, + "52": 0.27365, + "53": 0.27134, + "54": 0.27213, + "55": 0.27342, + "56": 0.27158, + "57": 0.27123, + "58": 0.27216, + "59": 0.27272, + "60": 0.27106, + "61": 0.2721, + "62": 0.27338, + "63": 0.2716, + "64": 0.27194, + "65": 0.27219, + "66": 0.27183, + "67": 0.2734, + "68": 0.27398, + "69": 0.27633, + "70": 0.27236, + "71": 0.27322, + "72": 0.27105, + "73": 0.27181, + "74": 0.27247, + "75": 0.27172, + "76": 0.27237, + "77": 0.2696, + "78": 0.2681, + "79": 0.26821, + "80": 0.26803, + "81": 0.27079, + "82": 0.27045, + "83": 0.27549, + "84": 0.27227, + "85": 0.27313, + "86": 0.27337, + "87": 0.27499, + "88": 0.2754, + "89": 0.2753, + 
"90": 0.2744, + "91": 0.27474, + "92": 0.27214, + "93": 0.27687, + "94": 0.27473, + "95": 0.27478, + "96": 0.27394, + "97": 0.27801, + "98": 0.27283, + "99": 0.27237, + "100": 0.27512 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgx_gb200_2nd.json new file mode 100644 index 00000000000..3c3511a921b --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgx_gb200_2nd.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 9.87398, + "52": 9.76584, + "53": 10.08272, + "54": 9.97273, + "55": 9.90736, + "56": 9.64213, + "57": 9.48856, + "58": 9.84268, + "59": 9.60111, + "60": 9.52013, + "61": 9.70058, + "62": 9.99642, + "63": 9.39067, + "64": 9.77612, + "65": 8.96637, + "66": 9.70949, + "67": 9.38771, + "68": 9.78893, + "69": 9.808, + "70": 9.74238, + "71": 9.63382, + "72": 9.59116, + "73": 9.50694, + "74": 8.94251, + "75": 9.42898, + "76": 9.0883, + "77": 
10.07153, + "78": 9.72682, + "79": 9.38722, + "80": 9.40571, + "81": 9.48701, + "82": 9.70484, + "83": 9.31557, + "84": 9.42111, + "85": 9.61463, + "86": 9.08465, + "87": 9.59904, + "88": 9.75367, + "89": 9.606, + "90": 9.83155, + "91": 9.3388, + "92": 9.36037, + "93": 9.09036, + "94": 8.83711, + "95": 9.53804, + "96": 9.53392, + "97": 9.3132, + "98": 9.67422, + "99": 8.90347, + "100": 9.415 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 2423.0, + "52": 2500.0, + "53": 2764.0, + "54": 2630.0, + "55": 2192.0, + "56": 2558.0, + "57": 2261.0, + "58": 2856.0, + "59": 2691.0, + "60": 2271.0, + "61": 2730.0, + "62": 2517.0, + "63": 2411.0, + "64": 2842.0, + "65": 2476.0, + "66": 2911.0, + "67": 2596.0, + "68": 2658.0, + "69": 2791.0, + "70": 3035.0, + "71": 2882.0, + "72": 2367.0, + "73": 2798.0, + "74": 1871.0, + "75": 2433.0, + "76": 2936.0, + "77": 3145.0, + "78": 2937.0, + "79": 2949.0, + "80": 3208.0, + "81": 3626.0, + "82": 3215.0, + "83": 2746.0, + "84": 3128.0, + "85": 3291.0, + "86": 2686.0, + "87": 3535.0, + "88": 2983.0, + "89": 3431.0, + "90": 3105.0, + "91": 2840.0, + "92": 3101.0, + "93": 2561.0, + "94": 3334.0, + "95": 3249.0, + "96": 3468.0, + "97": 3077.0, + "98": 
3515.0, + "99": 3067.0, + "100": 3131.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 261759488.0, + "52": 261759488.0, + "53": 261759488.0, + "54": 261759488.0, + "55": 261759488.0, + "56": 261759488.0, + "57": 261759488.0, + "58": 261759488.0, + "59": 261759488.0, + "60": 261759488.0, + "61": 261759488.0, + "62": 261759488.0, + "63": 261759488.0, + "64": 261759488.0, + "65": 261759488.0, + "66": 261759488.0, + "67": 261759488.0, + "68": 261759488.0, + "69": 261759488.0, + "70": 261759488.0, + "71": 261759488.0, + "72": 261759488.0, + "73": 261759488.0, + "74": 261759488.0, + "75": 261759488.0, + "76": 261759488.0, + "77": 261759488.0, + "78": 261759488.0, + "79": 261759488.0, + "80": 261759488.0, + "81": 261759488.0, + "82": 261759488.0, + "83": 261759488.0, + "84": 261759488.0, + "85": 261759488.0, + "86": 261759488.0, + "87": 261759488.0, + "88": 261759488.0, + "89": 261759488.0, + "90": 261759488.0, + "91": 261759488.0, + "92": 261759488.0, + "93": 261759488.0, + "94": 261759488.0, + "95": 261759488.0, + "96": 261759488.0, + "97": 261759488.0, + "98": 261759488.0, + "99": 261759488.0, + "100": 261759488.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, 
+ "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 795135488.0, + "52": 795136512.0, + "53": 795136512.0, + "54": 795136512.0, + "55": 795136512.0, + "56": 795136512.0, + "57": 795136512.0, + "58": 795136512.0, + "59": 795136512.0, + "60": 795136512.0, + "61": 795136512.0, + "62": 795136512.0, + "63": 795136512.0, + "64": 795136512.0, + "65": 795136512.0, + "66": 795136512.0, + "67": 795136512.0, + "68": 795136512.0, + "69": 795136512.0, + "70": 795136512.0, + "71": 795136512.0, + "72": 795136512.0, + "73": 795136512.0, + "74": 795136512.0, + "75": 795136512.0, + "76": 795136512.0, + "77": 795136512.0, + "78": 795136512.0, + "79": 795136512.0, + "80": 795136512.0, + "81": 795136512.0, + "82": 795136512.0, + "83": 795136512.0, + "84": 795136512.0, + "85": 795136512.0, + "86": 795136512.0, + "87": 795136512.0, + "88": 795136512.0, + "89": 795136512.0, + "90": 795136512.0, + "91": 795136512.0, + "92": 795136512.0, + "93": 795136512.0, + "94": 795136512.0, + "95": 795136512.0, + "96": 795136512.0, + "97": 795136512.0, + "98": 795136512.0, + "99": 795136512.0, + "100": 795136512.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": 
"nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": "nan", + "52": 3.36268, + "53": 0.28919, + "54": 0.2725, + "55": 0.27972, + "56": 0.2728, + "57": 0.27382, + "58": 0.27288, + "59": 0.27294, + "60": 0.27575, + "61": 0.27075, + "62": 0.27057, + "63": 0.27211, + "64": 0.26991, + "65": 0.27298, + "66": 0.27045, + "67": 0.27231, + "68": 0.27315, + "69": 0.26969, + "70": 0.27037, + "71": 0.27028, + "72": 0.27191, + "73": 0.2714, + "74": 0.27082, + "75": 0.2722, + "76": 0.27153, + "77": 0.27331, + "78": 0.27142, + "79": 0.27368, + "80": 0.27144, + "81": 0.26895, + "82": 0.27139, + "83": 0.26946, + "84": 0.27033, + "85": 0.2702, + "86": 0.26955, + "87": 0.2686, + "88": 0.27213, + "89": 0.2709, + "90": 0.27061, + "91": 0.27274, + "92": 0.26989, + "93": 0.27031, + "94": 0.27054, + "95": 0.269, + "96": 0.27187, + "97": 0.26915, + "98": 0.2696, + "99": 0.27075, + "100": 0.26802 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200.json index 3d9cf662b8f..12b113ac52d 100644 --- 
a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200.json @@ -325,106 +325,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 5.23658, - "2": 0.11342, - "3": 0.10424, - "4": 0.0896, - "5": 0.08891, - "6": 0.08841, - "7": 0.0882, - "8": 0.08856, - "9": 0.08635, - "10": 0.08776, - "11": 0.08701, - "12": 0.08694, - "13": 0.08552, - "14": 0.08635, - "15": 0.08742, - "16": 0.08423, - "17": 0.08309, - "18": 0.11719, - "19": 0.0929, - "20": 0.1101, - "21": 0.08669, - "22": 0.08719, - "23": 0.08582, - "24": 0.08654, - "25": 0.08603, - "26": 0.08535, - "27": 0.08439, - "28": 0.08545, - "29": 0.08496, - "30": 0.08412, - "31": 0.08316, - "32": 0.08329, - "33": 0.08342, - "34": 0.08511, - "35": 0.0834, - "36": 0.08316, - "37": 0.08223, - "38": 0.08202, - "39": 0.08221, - "40": 0.07703, - "41": 0.08264, - "42": 0.08192, - "43": 0.0814, - "44": 0.08107, - "45": 0.08098, - "46": 0.08419, - "47": 0.08114, - "48": 0.22032, - "49": 0.0833, - "50": 0.08014, - "51": 0.10352, - "52": 0.08063, - "53": 0.07904, - "54": 0.08003, - "55": 0.08622, - "56": 0.08065, - "57": 0.08879, - "58": 0.08111, - "59": 0.08093, - "60": 0.08098, - "61": 0.08226, - "62": 0.08281, - "63": 0.08189, - "64": 0.08714, - "65": 0.08455, - "66": 0.0857, - "67": 0.08236, - "68": 0.08336, - "69": 0.08227, - "70": 0.0833, - "71": 0.08157, - "72": 0.08485, - "73": 0.08177, - "74": 0.08349, - "75": 0.0828, - "76": 0.08429, - "77": 0.08256, - "78": 0.08362, - "79": 0.08272, - "80": 0.08394, - "81": 0.08197, - "82": 0.08345, - "83": 0.08164, - "84": 0.08343, - "85": 0.08257, - "86": 0.08443, - "87": 0.08437, - "88": 0.08308, - "89": 0.08326, - "90": 0.08136, - "91": 0.08197, - "92": 0.08322, - "93": 0.08598, - "94": 0.08404, - "95": 0.08296, - "96": 
0.08331, - "97": 0.08342, - "98": 0.08389, - "99": 0.0902, - "100": 0.09282 + "1": "nan", + "2": 2.48216, + "3": 0.10038, + "4": 0.08588, + "5": 0.08467, + "6": 0.08488, + "7": 0.08474, + "8": 0.08597, + "9": 0.0851, + "10": 0.08607, + "11": 0.08606, + "12": 0.08597, + "13": 0.08652, + "14": 0.08774, + "15": 0.08628, + "16": 0.0847, + "17": 0.08455, + "18": 0.13013, + "19": 0.09834, + "20": 0.09375, + "21": 0.09358, + "22": 0.09463, + "23": 0.094, + "24": 0.09339, + "25": 0.09356, + "26": 0.09394, + "27": 0.095, + "28": 0.09502, + "29": 0.09472, + "30": 0.0953, + "31": 0.09574, + "32": 0.09524, + "33": 0.09617, + "34": 0.09524, + "35": 0.09477, + "36": 0.09409, + "37": 0.09554, + "38": 0.09481, + "39": 0.09427, + "40": 0.08957, + "41": 0.0952, + "42": 0.09493, + "43": 0.09445, + "44": 0.09424, + "45": 0.09619, + "46": 0.09583, + "47": 0.09462, + "48": 0.09189, + "49": 0.09344, + "50": 0.09111, + "51": 0.09793, + "52": 0.08604, + "53": 0.08487, + "54": 0.08374, + "55": 0.0848, + "56": 0.08313, + "57": 0.08774, + "58": 0.08284, + "59": 0.08383, + "60": 0.08368, + "61": 0.08436, + "62": 0.08368, + "63": 0.08273, + "64": 0.08275, + "65": 0.0846, + "66": 0.08337, + "67": 0.08515, + "68": 0.08341, + "69": 0.08418, + "70": 0.08487, + "71": 0.08388, + "72": 0.08281, + "73": 0.08364, + "74": 0.0827, + "75": 0.08268, + "76": 0.08269, + "77": 0.08411, + "78": 0.09377, + "79": 0.08743, + "80": 0.08891, + "81": 0.08977, + "82": 0.0911, + "83": 0.09108, + "84": 0.09091, + "85": 0.09179, + "86": 0.09209, + "87": 0.09134, + "88": 0.09198, + "89": 0.09153, + "90": 0.09199, + "91": 0.09404, + "92": 0.09022, + "93": 0.09001, + "94": 0.09097, + "95": 0.09188, + "96": 0.09181, + "97": 0.09309, + "98": 0.0924, + "99": 0.09355, + "100": 0.09423 } }, "num-zeros": { diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200_2nd.json 
b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200_2nd.json new file mode 100644 index 00000000000..f9d44558b50 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200_2nd.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 10.20107, + "52": 10.10861, + "53": 10.3403, + "54": 10.23893, + "55": 10.19008, + "56": 9.96159, + "57": 9.82417, + "58": 10.10904, + "59": 9.9041, + "60": 9.82045, + "61": 9.96789, + "62": 10.19934, + "63": 9.66196, + "64": 10.00416, + "65": 9.2675, + "66": 9.92466, + "67": 9.62367, + "68": 9.98499, + "69": 9.98524, + "70": 9.92553, + "71": 9.81785, + "72": 9.77816, + "73": 9.67402, + "74": 9.16615, + "75": 9.59935, + "76": 9.2754, + "77": 10.18639, + "78": 9.86592, + "79": 9.52838, + "80": 9.55132, + "81": 9.63037, + "82": 9.82843, + "83": 9.47009, + "84": 9.5424, + "85": 9.74228, + "86": 9.20711, + "87": 9.70433, + "88": 9.86745, + "89": 9.72062, + "90": 9.9304, + "91": 9.471, + "92": 9.47539, + "93": 9.21193, + "94": 8.94879, + 
"95": 9.62951, + "96": 9.63936, + "97": 9.40708, + "98": 9.77232, + "99": 9.01139, + "100": 9.51718 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 1418.0, + "52": 1355.0, + "53": 1814.0, + "54": 1567.0, + "55": 1492.0, + "56": 1408.0, + "57": 1401.0, + "58": 1724.0, + "59": 1654.0, + "60": 1416.0, + "61": 1780.0, + "62": 1852.0, + "63": 1560.0, + "64": 1837.0, + "65": 1520.0, + "66": 1649.0, + "67": 1660.0, + "68": 1716.0, + "69": 1815.0, + "70": 2017.0, + "71": 2026.0, + "72": 1579.0, + "73": 1962.0, + "74": 1321.0, + "75": 1782.0, + "76": 1942.0, + "77": 2128.0, + "78": 2057.0, + "79": 1905.0, + "80": 2153.0, + "81": 2320.0, + "82": 2468.0, + "83": 1951.0, + "84": 2184.0, + "85": 2301.0, + "86": 1971.0, + "87": 2900.0, + "88": 2175.0, + "89": 2357.0, + "90": 2515.0, + "91": 1929.0, + "92": 2680.0, + "93": 2160.0, + "94": 2213.0, + "95": 2280.0, + "96": 2563.0, + "97": 2522.0, + "98": 2470.0, + "99": 2266.0, + "100": 2099.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": 
"nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 518880768.0, + "52": 518880768.0, + "53": 518880768.0, + "54": 518880768.0, + "55": 518880768.0, + "56": 518880768.0, + "57": 518880768.0, + "58": 518880768.0, + "59": 518880768.0, + "60": 518880768.0, + "61": 518880768.0, + "62": 518880768.0, + "63": 518880768.0, + "64": 518880768.0, + "65": 518880768.0, + "66": 518880768.0, + "67": 518880768.0, + "68": 518880768.0, + "69": 518880768.0, + "70": 518880768.0, + "71": 518880768.0, + "72": 518880768.0, + "73": 518880768.0, + "74": 518880768.0, + "75": 518880768.0, + "76": 518880768.0, + "77": 518880768.0, + "78": 518880768.0, + "79": 518880768.0, + "80": 518880768.0, + "81": 518880768.0, + "82": 518880768.0, + "83": 518880768.0, + "84": 518880768.0, + "85": 518880768.0, + "86": 518880768.0, + "87": 518880768.0, + "88": 518880768.0, + "89": 518880768.0, + "90": 518880768.0, + "91": 518880768.0, + "92": 518880768.0, + "93": 518880768.0, + "94": 518880768.0, + "95": 518880768.0, + "96": 518880768.0, + "97": 518880768.0, + "98": 518880768.0, + "99": 518880768.0, + "100": 518880768.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": 
"nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 4575260160.0, + "52": 4608815616.0, + "53": 4608815616.0, + "54": 4608815616.0, + "55": 4608815616.0, + "56": 4608815616.0, + "57": 4608815616.0, + "58": 4608815616.0, + "59": 4608815616.0, + "60": 4608815616.0, + "61": 4608815616.0, + "62": 4608815616.0, + "63": 4608815616.0, + "64": 4608815616.0, + "65": 4608815616.0, + "66": 4608815616.0, + "67": 4608815616.0, + "68": 4608815616.0, + "69": 4608815616.0, + "70": 4608815616.0, + "71": 4608815616.0, + "72": 4608815616.0, + "73": 4608815616.0, + "74": 4608815616.0, + "75": 4608815616.0, + "76": 4608815616.0, + "77": 4608815616.0, + "78": 4608815616.0, + "79": 4608815616.0, + "80": 4608815616.0, + "81": 4608815616.0, + "82": 4608815616.0, + "83": 4608815616.0, + "84": 4608815616.0, + "85": 4608815616.0, + "86": 4608815616.0, + "87": 4608815616.0, + "88": 4608815616.0, + "89": 4608815616.0, + "90": 4608815616.0, + "91": 4608815616.0, + "92": 4608815616.0, + "93": 4608815616.0, + "94": 4608815616.0, + "95": 4608815616.0, + "96": 4608815616.0, + "97": 4608815616.0, + "98": 4608815616.0, + "99": 4608815616.0, + "100": 4608815616.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + 
"21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": "nan", + "52": 2.23866, + "53": 0.10658, + "54": 0.0966, + "55": 0.09594, + "56": 0.0952, + "57": 0.09585, + "58": 0.09524, + "59": 0.09415, + "60": 0.09333, + "61": 0.09407, + "62": 0.09371, + "63": 0.09249, + "64": 0.09383, + "65": 0.09373, + "66": 0.09367, + "67": 0.09283, + "68": 0.09259, + "69": 0.09221, + "70": 0.09229, + "71": 0.09427, + "72": 0.09373, + "73": 0.09768, + "74": 0.09797, + "75": 0.09776, + "76": 0.09553, + "77": 0.09265, + "78": 0.09359, + "79": 0.09433, + "80": 0.09424, + "81": 0.09558, + "82": 0.09536, + "83": 0.09601, + "84": 0.09284, + "85": 0.09339, + "86": 0.09417, + "87": 0.09462, + "88": 0.09587, + "89": 0.09335, + "90": 0.0933, + "91": 0.09312, + "92": 0.09369, + "93": 0.0928, + "94": 0.09412, + "95": 0.09343, + "96": 0.09295, + "97": 0.09368, + "98": 0.09289, + "99": 0.09643, + "100": 0.09451 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/golden_values_dev_dgx_gb200.json index 51ebcb618e4..063c93b8168 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/golden_values_dev_dgx_gb200.json @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 7.39667, - "2": 0.15731, - "3": 0.14531, - "4": 0.13151, - "5": 0.13018, - "6": 0.12885, - "7": 0.13069, - "8": 0.13596, - "9": 0.12969, - "10": 0.12994, - "11": 
0.1314, - "12": 0.12886, - "13": 0.13009, - "14": 0.1305, - "15": 0.13493, - "16": 0.13341, - "17": 0.13216, - "18": 0.13045, - "19": 0.1359, - "20": 0.13207, - "21": 0.13248, - "22": 0.12979, - "23": 0.12948, - "24": 0.13047, - "25": 0.12963, - "26": 0.13113, - "27": 0.13172, - "28": 0.14017, - "29": 0.13059, - "30": 0.12871, - "31": 0.12957, - "32": 0.1298, - "33": 0.13011, - "34": 0.12939, - "35": 0.12965, - "36": 0.13039, - "37": 0.13099, - "38": 0.13051, - "39": 0.12932, - "40": 0.13052, - "41": 0.13052, - "42": 0.13104, - "43": 0.12938, - "44": 0.13063, - "45": 0.13204, - "46": 0.13075, - "47": 0.13071, - "48": 0.12984, - "49": 0.12965, - "50": 0.12987 + "1": "nan", + "2": 2.42188, + "3": 0.13001, + "4": 0.11595, + "5": 0.1162, + "6": 0.11616, + "7": 0.11716, + "8": 0.11796, + "9": 0.1186, + "10": 0.11774, + "11": 0.11769, + "12": 0.12298, + "13": 0.11717, + "14": 0.11738, + "15": 0.11771, + "16": 0.11772, + "17": 0.11737, + "18": 0.11751, + "19": 0.11697, + "20": 0.11822, + "21": 0.11647, + "22": 0.11724, + "23": 0.11721, + "24": 0.11677, + "25": 0.1171, + "26": 0.11775, + "27": 0.11748, + "28": 0.11705, + "29": 0.11727, + "30": 0.11693, + "31": 0.11818, + "32": 0.11738, + "33": 0.11726, + "34": 0.11675, + "35": 0.11722, + "36": 0.11753, + "37": 0.11779, + "38": 0.11683, + "39": 0.11725, + "40": 0.11779, + "41": 0.11794, + "42": 0.11724, + "43": 0.11807, + "44": 0.11744, + "45": 0.12537, + "46": 0.11752, + "47": 0.11739, + "48": 0.11765, + "49": 0.11763, + "50": 0.11812 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgx_gb200.json index 94a972ee670..aa94d697c53 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgx_gb200.json +++ 
b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgx_gb200.json @@ -325,106 +325,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 6.71621, - "2": 0.15096, - "3": 0.1401, - "4": 0.12751, - "5": 0.12582, - "6": 0.12762, - "7": 0.29588, - "8": 0.12501, - "9": 0.12257, - "10": 0.1231, - "11": 0.12179, - "12": 0.12146, - "13": 0.1218, - "14": 0.12189, - "15": 0.11937, - "16": 0.11701, - "17": 0.16358, - "18": 0.1329, - "19": 0.12356, - "20": 0.1223, - "21": 0.11887, - "22": 0.10873, - "23": 0.11776, - "24": 0.11791, - "25": 0.11708, - "26": 0.11725, - "27": 0.12727, - "28": 0.2171, - "29": 0.1145, - "30": 0.11344, - "31": 0.11497, - "32": 0.11511, - "33": 0.1157, - "34": 0.11565, - "35": 0.11684, - "36": 0.11679, - "37": 0.11675, - "38": 0.11549, - "39": 0.3291, - "40": 0.4913, - "41": 0.12148, - "42": 0.11374, - "43": 0.11395, - "44": 0.11452, - "45": 0.11465, - "46": 0.11512, - "47": 0.11552, - "48": 0.11487, - "49": 0.11358, - "50": 0.11314, - "51": 0.14003, - "52": 0.11456, - "53": 0.11604, - "54": 0.11224, - "55": 0.12526, - "56": 0.11247, - "57": 0.11315, - "58": 0.11222, - "59": 0.11353, - "60": 0.1122, - "61": 0.11312, - "62": 0.11183, - "63": 0.1147, - "64": 0.11171, - "65": 0.11298, - "66": 0.11177, - "67": 0.11322, - "68": 0.11115, - "69": 0.11243, - "70": 0.11245, - "71": 0.1128, - "72": 0.1133, - "73": 0.11263, - "74": 0.11369, - "75": 0.11191, - "76": 0.11291, - "77": 0.11243, - "78": 0.11353, - "79": 0.1277, - "80": 0.11295, - "81": 0.11234, - "82": 0.1138, - "83": 0.11202, - "84": 0.11873, - "85": 0.11198, - "86": 0.11416, - "87": 0.11434, - "88": 0.11401, - "89": 0.11423, - "90": 0.11109, - "91": 0.11252, - "92": 0.11221, - "93": 0.11285, - "94": 0.11189, - "95": 0.11269, - "96": 0.12639, - "97": 0.12758, - "98": 0.12878, - "99": 0.1295, - "100": 0.15151 + "1": "nan", + "2": 2.16002, + "3": 0.12086, + "4": 0.10679, + "5": 0.10684, + "6": 0.10764, + "7": 0.10756, + "8": 0.10782, + "9": 0.10736, 
+ "10": 0.10699, + "11": 0.10758, + "12": 0.1082, + "13": 0.10728, + "14": 0.10701, + "15": 0.10751, + "16": 0.10779, + "17": 0.1525, + "18": 0.12315, + "19": 0.11877, + "20": 0.11834, + "21": 0.11693, + "22": 0.10869, + "23": 0.11758, + "24": 0.11876, + "25": 0.1157, + "26": 0.11704, + "27": 0.11721, + "28": 0.11734, + "29": 0.11883, + "30": 0.11662, + "31": 0.11713, + "32": 0.11867, + "33": 0.11765, + "34": 0.11637, + "35": 0.1163, + "36": 0.11685, + "37": 0.11693, + "38": 0.11691, + "39": 0.11511, + "40": 0.11608, + "41": 0.11592, + "42": 0.11755, + "43": 0.11699, + "44": 0.1167, + "45": 0.11741, + "46": 0.11687, + "47": 0.11681, + "48": 0.11668, + "49": 0.11692, + "50": 0.11769, + "51": 0.13585, + "52": 0.11734, + "53": 0.11685, + "54": 0.11694, + "55": 0.11628, + "56": 0.11632, + "57": 0.11669, + "58": 0.11528, + "59": 0.11499, + "60": 0.11541, + "61": 0.11621, + "62": 0.11572, + "63": 0.11627, + "64": 0.11666, + "65": 0.11549, + "66": 0.11562, + "67": 0.11651, + "68": 0.11467, + "69": 0.11506, + "70": 0.11625, + "71": 0.11703, + "72": 0.11635, + "73": 0.11771, + "74": 0.1156, + "75": 0.11766, + "76": 0.11632, + "77": 0.11535, + "78": 0.11674, + "79": 0.11793, + "80": 0.1173, + "81": 0.11677, + "82": 0.11736, + "83": 0.11611, + "84": 0.11798, + "85": 0.11839, + "86": 0.11892, + "87": 0.11724, + "88": 0.11663, + "89": 0.11722, + "90": 0.11751, + "91": 0.11711, + "92": 0.11773, + "93": 0.11853, + "94": 0.11655, + "95": 0.11767, + "96": 0.11808, + "97": 0.11824, + "98": 0.11911, + "99": 0.11735, + "100": 0.11751 } }, "num-zeros": { diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgx_gb200_2nd.json new file mode 100644 index 00000000000..a0ffc9cfd0d --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgx_gb200_2nd.json @@ -0,0 +1,537 
@@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 10.13864, + "52": 10.03803, + "53": 10.2953, + "54": 10.19383, + "55": 10.14359, + "56": 9.8908, + "57": 9.73702, + "58": 10.05022, + "59": 9.83828, + "60": 9.74551, + "61": 9.90679, + "62": 10.16216, + "63": 9.59842, + "64": 9.95194, + "65": 9.18904, + "66": 9.87164, + "67": 9.56047, + "68": 9.94233, + "69": 9.94285, + "70": 9.8854, + "71": 9.77852, + "72": 9.73861, + "73": 9.63511, + "74": 9.10351, + "75": 9.55716, + "76": 9.23197, + "77": 10.16792, + "78": 9.83943, + "79": 9.49691, + "80": 9.52327, + "81": 9.60219, + "82": 9.8054, + "83": 9.43936, + "84": 9.51953, + "85": 9.72086, + "86": 9.18604, + "87": 9.68762, + "88": 9.84868, + "89": 9.70441, + "90": 9.91638, + "91": 9.45088, + "92": 9.45495, + "93": 9.1952, + "94": 8.93245, + "95": 9.61119, + "96": 9.62586, + "97": 9.39727, + "98": 9.76341, + "99": 8.99611, + "100": 9.50318 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": 
"nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 2321.0, + "52": 2270.0, + "53": 2929.0, + "54": 2493.0, + "55": 2470.0, + "56": 2387.0, + "57": 2321.0, + "58": 2774.0, + "59": 2339.0, + "60": 2654.0, + "61": 2810.0, + "62": 2863.0, + "63": 2582.0, + "64": 2851.0, + "65": 2686.0, + "66": 2969.0, + "67": 2680.0, + "68": 2913.0, + "69": 2669.0, + "70": 2988.0, + "71": 2881.0, + "72": 2465.0, + "73": 3188.0, + "74": 2209.0, + "75": 2665.0, + "76": 3308.0, + "77": 3227.0, + "78": 3393.0, + "79": 3433.0, + "80": 3273.0, + "81": 3620.0, + "82": 3491.0, + "83": 3140.0, + "84": 3225.0, + "85": 3622.0, + "86": 3290.0, + "87": 4023.0, + "88": 3187.0, + "89": 3975.0, + "90": 3576.0, + "91": 2689.0, + "92": 3474.0, + "93": 3202.0, + "94": 3608.0, + "95": 3510.0, + "96": 3634.0, + "97": 3500.0, + "98": 3933.0, + "99": 3502.0, + "100": 3134.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": 
"nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 1044076032.0, + "52": 1044076032.0, + "53": 1044076032.0, + "54": 1044076032.0, + "55": 1044076032.0, + "56": 1044076032.0, + "57": 1044076032.0, + "58": 1044076032.0, + "59": 1044076032.0, + "60": 1044076032.0, + "61": 1044076032.0, + "62": 1044076032.0, + "63": 1044076032.0, + "64": 1044076032.0, + "65": 1044076032.0, + "66": 1044076032.0, + "67": 1044076032.0, + "68": 1044076032.0, + "69": 1044076032.0, + "70": 1044076032.0, + "71": 1044076032.0, + "72": 1044076032.0, + "73": 1044076032.0, + "74": 1044076032.0, + "75": 1044076032.0, + "76": 1044076032.0, + "77": 1044076032.0, + "78": 1044076032.0, + "79": 1044076032.0, + "80": 1044076032.0, + "81": 1044076032.0, + "82": 1044076032.0, + "83": 1044076032.0, + "84": 1044076032.0, + "85": 1044076032.0, + "86": 1044076032.0, + "87": 1044076032.0, + "88": 1044076032.0, + "89": 1044076032.0, + "90": 1044076032.0, + "91": 1044076032.0, + "92": 1044076032.0, + "93": 1044076032.0, + "94": 1044076032.0, + "95": 1044076032.0, + "96": 1044076032.0, + "97": 1044076032.0, + "98": 1044076032.0, + "99": 1044076032.0, + "100": 1044076032.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": 
"nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 3546769920.0, + "52": 3546770944.0, + "53": 3546770944.0, + "54": 3546770944.0, + "55": 3546770944.0, + "56": 3546770944.0, + "57": 3546770944.0, + "58": 3546770944.0, + "59": 3546770944.0, + "60": 3546770944.0, + "61": 3546770944.0, + "62": 3546770944.0, + "63": 3546770944.0, + "64": 3546770944.0, + "65": 3546770944.0, + "66": 3546770944.0, + "67": 3546770944.0, + "68": 3546770944.0, + "69": 3546770944.0, + "70": 3546770944.0, + "71": 3546770944.0, + "72": 3546770944.0, + "73": 3546770944.0, + "74": 3546770944.0, + "75": 3546770944.0, + "76": 3546770944.0, + "77": 3546770944.0, + "78": 3546770944.0, + "79": 3546770944.0, + "80": 3546770944.0, + "81": 3546770944.0, + "82": 3546770944.0, + "83": 3546770944.0, + "84": 3546770944.0, + "85": 3546770944.0, + "86": 3546770944.0, + "87": 3546770944.0, + "88": 3546770944.0, + "89": 3546770944.0, + "90": 3546770944.0, + "91": 3546770944.0, + "92": 3546770944.0, + "93": 3546770944.0, + "94": 3546770944.0, + "95": 3546770944.0, + "96": 3546770944.0, + "97": 3546770944.0, + "98": 3546770944.0, + "99": 3546770944.0, + "100": 3546770944.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + 
"45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": "nan", + "52": 2.47868, + "53": 0.1279, + "54": 0.11872, + "55": 0.11801, + "56": 0.11659, + "57": 0.11822, + "58": 0.11942, + "59": 0.11781, + "60": 0.11931, + "61": 0.11745, + "62": 0.11823, + "63": 0.11864, + "64": 0.11936, + "65": 0.11981, + "66": 0.11828, + "67": 0.11814, + "68": 0.11767, + "69": 0.11723, + "70": 0.11851, + "71": 0.11778, + "72": 0.1171, + "73": 0.11843, + "74": 0.11879, + "75": 0.11904, + "76": 0.11937, + "77": 0.11851, + "78": 0.11863, + "79": 0.11746, + "80": 0.11801, + "81": 0.11841, + "82": 0.1189, + "83": 0.11865, + "84": 0.11762, + "85": 0.11705, + "86": 0.11788, + "87": 0.11804, + "88": 0.11814, + "89": 0.11967, + "90": 0.11938, + "91": 0.11768, + "92": 0.1176, + "93": 0.1189, + "94": 0.1188, + "95": 0.11753, + "96": 0.1179, + "97": 0.12698, + "98": 0.13715, + "99": 0.1402, + "100": 0.13853 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4/golden_values_dev_dgx_gb200.json index ba4bf2c3eaf..78a164b057a 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4/golden_values_dev_dgx_gb200.json @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 7.95704, - "2": 0.23282, - "3": 0.22573, - "4": 0.14406, - "5": 0.14176, - "6": 0.14066, - "7": 0.14191, - "8": 0.13977, - "9": 0.23575, - "10": 0.14253, - "11": 0.14269, - "12": 0.24047, - "13": 0.18824, - "14": 0.18624, - "15": 0.30512, - "16": 0.14193, - "17": 0.2268, - "18": 0.14073, - "19": 0.23385, - "20": 0.20206, - "21": 0.1413, - "22": 0.13909, - "23": 0.35016, - "24": 0.14315, - "25": 0.22043, - "26": 0.14108, - "27": 0.14032, - "28": 0.14199, - "29": 0.38987, - "30": 0.14061, - "31": 0.14114, - "32": 0.14198, - 
"33": 0.21726, - "34": 0.14506, - "35": 0.14599, - "36": 0.14386, - "37": 0.14357, - "38": 0.22005, - "39": 0.14191, - "40": 0.14088, - "41": 0.23965, - "42": 0.14104, - "43": 0.21167, - "44": 0.13993, - "45": 0.2299, - "46": 0.24126, - "47": 0.14128, - "48": 0.14024, - "49": 0.22136, - "50": 0.14147 + "1": "nan", + "2": 2.22981, + "3": 0.15128, + "4": 0.13923, + "5": 0.13787, + "6": 0.13801, + "7": 0.13851, + "8": 0.13805, + "9": 0.13877, + "10": 0.14054, + "11": 0.14025, + "12": 0.13996, + "13": 0.13989, + "14": 0.13978, + "15": 0.14117, + "16": 0.14293, + "17": 0.14179, + "18": 0.14229, + "19": 0.14245, + "20": 0.14412, + "21": 0.14397, + "22": 0.1442, + "23": 0.14329, + "24": 0.14358, + "25": 0.14351, + "26": 0.14424, + "27": 0.14406, + "28": 0.1431, + "29": 0.14373, + "30": 0.14433, + "31": 0.14377, + "32": 0.14346, + "33": 0.14433, + "34": 0.14352, + "35": 0.1446, + "36": 0.14442, + "37": 0.14373, + "38": 0.14265, + "39": 0.14371, + "40": 0.14411, + "41": 0.14415, + "42": 0.14342, + "43": 0.14536, + "44": 0.14415, + "45": 0.14252, + "46": 0.14463, + "47": 0.1438, + "48": 0.14396, + "49": 0.14369, + "50": 0.14335 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgx_gb200.json index 51e39254e9a..1a14c45ef7f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgx_gb200.json @@ -325,106 +325,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 7.92082, - "2": 0.15881, - "3": 0.1483, - "4": 0.13026, - "5": 0.247, - "6": 0.1291, - "7": 0.24882, - "8": 0.12936, - "9": 0.24806, - "10": 0.21162, - "11": 0.12953, - "12": 0.22212, - "13": 0.12944, - "14": 0.12909, - "15": 0.23074, - "16": 0.1288, - 
"17": 0.28155, - "18": 0.14734, - "19": 0.13796, - "20": 0.13594, - "21": 0.23092, - "22": 0.21716, - "23": 0.13639, - "24": 0.1359, - "25": 0.2221, - "26": 0.16419, - "27": 0.14414, - "28": 0.21146, - "29": 0.13469, - "30": 0.22441, - "31": 0.13661, - "32": 0.13647, - "33": 0.13579, - "34": 0.13549, - "35": 0.13504, - "36": 0.13513, - "37": 0.13527, - "38": 0.19634, - "39": 0.3711, - "40": 0.1353, - "41": 0.13666, - "42": 0.21568, - "43": 0.13653, - "44": 0.13523, - "45": 0.13504, - "46": 0.13584, - "47": 0.13676, - "48": 0.13449, - "49": 0.22259, - "50": 0.14061, - "51": 0.34203, - "52": 0.44673, - "53": 0.30462, - "54": 0.34485, - "55": 0.36971, - "56": 0.37478, - "57": 0.3581, - "58": 0.46665, - "59": 0.47512, - "60": 0.38197, - "61": 0.40684, - "62": 0.48548, - "63": 0.32955, - "64": 0.28002, - "65": 0.1858, - "66": 0.1488, - "67": 0.21555, - "68": 0.17819, - "69": 0.24009, - "70": 0.18827, - "71": 0.17896, - "72": 0.18197, - "73": 0.13026, - "74": 0.21407, - "75": 0.13008, - "76": 0.12912, - "77": 0.12908, - "78": 0.13051, - "79": 0.12938, - "80": 0.13039, - "81": 0.1314, - "82": 0.40745, - "83": 0.12931, - "84": 0.13085, - "85": 0.13025, - "86": 0.13101, - "87": 0.12901, - "88": 0.12981, - "89": 0.12874, - "90": 0.12891, - "91": 0.13086, - "92": 0.19117, - "93": 0.1298, - "94": 0.13035, - "95": 0.12884, - "96": 0.12875, - "97": 0.13072, - "98": 0.14893, - "99": 0.13089, - "100": 0.13044 + "1": "nan", + "2": 2.57843, + "3": 0.14384, + "4": 0.12903, + "5": 0.12836, + "6": 0.12816, + "7": 0.12916, + "8": 0.12856, + "9": 0.12834, + "10": 0.12806, + "11": 0.12799, + "12": 0.1275, + "13": 0.12711, + "14": 0.12751, + "15": 0.129, + "16": 0.12848, + "17": 0.16638, + "18": 0.14237, + "19": 0.1354, + "20": 0.13451, + "21": 0.13484, + "22": 0.1296, + "23": 0.134, + "24": 0.13542, + "25": 0.13555, + "26": 0.13391, + "27": 0.13338, + "28": 0.13321, + "29": 0.13407, + "30": 0.13362, + "31": 0.13566, + "32": 0.13345, + "33": 0.13445, + "34": 0.13432, + "35": 0.13463, + 
"36": 0.1333, + "37": 0.13493, + "38": 0.1341, + "39": 0.13366, + "40": 0.14828, + "41": 0.15021, + "42": 0.14974, + "43": 0.15118, + "44": 0.15264, + "45": 0.15167, + "46": 0.15228, + "47": 0.15164, + "48": 0.15268, + "49": 0.15149, + "50": 0.15349, + "51": 0.18359, + "52": 0.15225, + "53": 0.14909, + "54": 0.1498, + "55": 0.14962, + "56": 0.14941, + "57": 0.14896, + "58": 0.14931, + "59": 0.149, + "60": 0.14965, + "61": 0.15012, + "62": 0.15003, + "63": 0.14869, + "64": 0.14833, + "65": 0.14957, + "66": 0.14978, + "67": 0.14987, + "68": 0.15037, + "69": 0.15122, + "70": 0.15108, + "71": 0.14994, + "72": 0.1507, + "73": 0.15127, + "74": 0.15073, + "75": 0.14986, + "76": 0.15012, + "77": 0.15071, + "78": 0.15091, + "79": 0.15129, + "80": 0.15073, + "81": 0.15107, + "82": 0.15091, + "83": 0.15083, + "84": 0.15069, + "85": 0.15272, + "86": 0.1517, + "87": 0.15055, + "88": 0.15217, + "89": 0.15281, + "90": 0.14893, + "91": 0.16006, + "92": 0.15632, + "93": 0.15975, + "94": 0.1591, + "95": 0.15873, + "96": 0.15918, + "97": 0.15958, + "98": 0.15854, + "99": 0.15737, + "100": 0.15785 } }, "num-zeros": { diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgx_gb200_2nd.json new file mode 100644 index 00000000000..8d115483589 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgx_gb200_2nd.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", 
+ "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 10.18063, + "52": 10.07636, + "53": 10.32773, + "54": 10.23662, + "55": 10.17779, + "56": 9.93459, + "57": 9.79047, + "58": 10.09308, + "59": 9.88561, + "60": 9.79776, + "61": 9.94517, + "62": 10.19094, + "63": 9.64683, + "64": 9.98455, + "65": 9.23395, + "66": 9.90453, + "67": 9.59582, + "68": 9.97649, + "69": 9.97495, + "70": 9.91345, + "71": 9.81704, + "72": 9.7724, + "73": 9.6613, + "74": 9.13276, + "75": 9.5758, + "76": 9.25498, + "77": 10.18582, + "78": 9.86011, + "79": 9.51637, + "80": 9.54101, + "81": 9.61959, + "82": 9.8199, + "83": 9.45715, + "84": 9.53646, + "85": 9.73396, + "86": 9.19313, + "87": 9.70118, + "88": 9.85742, + "89": 9.71286, + "90": 9.92642, + "91": 9.46223, + "92": 9.46428, + "93": 9.20456, + "94": 8.93882, + "95": 9.61804, + "96": 9.62982, + "97": 9.40186, + "98": 9.76277, + "99": 9.00132, + "100": 9.50913 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + 
"45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 2281.0, + "52": 2534.0, + "53": 3604.0, + "54": 2989.0, + "55": 2624.0, + "56": 2547.0, + "57": 2287.0, + "58": 3322.0, + "59": 2730.0, + "60": 2919.0, + "61": 3007.0, + "62": 3131.0, + "63": 3226.0, + "64": 3219.0, + "65": 2422.0, + "66": 3741.0, + "67": 2805.0, + "68": 3215.0, + "69": 2871.0, + "70": 3597.0, + "71": 3045.0, + "72": 2952.0, + "73": 3559.0, + "74": 2232.0, + "75": 2889.0, + "76": 3802.0, + "77": 3635.0, + "78": 3762.0, + "79": 4000.0, + "80": 3383.0, + "81": 4629.0, + "82": 3435.0, + "83": 3254.0, + "84": 3786.0, + "85": 3895.0, + "86": 3338.0, + "87": 4169.0, + "88": 3498.0, + "89": 4065.0, + "90": 3825.0, + "91": 3040.0, + "92": 4399.0, + "93": 3899.0, + "94": 4449.0, + "95": 4017.0, + "96": 3820.0, + "97": 4268.0, + "98": 5094.0, + "99": 3940.0, + "100": 3369.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 853400064.0, + "52": 853400064.0, + "53": 853400064.0, + "54": 853400064.0, + "55": 853400064.0, + "56": 853400064.0, + "57": 853400064.0, + "58": 853400064.0, + "59": 853400064.0, + "60": 853400064.0, + "61": 853400064.0, + "62": 853400064.0, + "63": 
853400064.0, + "64": 853400064.0, + "65": 853400064.0, + "66": 853400064.0, + "67": 853400064.0, + "68": 853400064.0, + "69": 853400064.0, + "70": 853400064.0, + "71": 853400064.0, + "72": 853400064.0, + "73": 853400064.0, + "74": 853400064.0, + "75": 853400064.0, + "76": 853400064.0, + "77": 853400064.0, + "78": 853400064.0, + "79": 853400064.0, + "80": 853400064.0, + "81": 853400064.0, + "82": 853400064.0, + "83": 853400064.0, + "84": 853400064.0, + "85": 853400064.0, + "86": 853400064.0, + "87": 853400064.0, + "88": 853400064.0, + "89": 853400064.0, + "90": 853400064.0, + "91": 853400064.0, + "92": 853400064.0, + "93": 853400064.0, + "94": 853400064.0, + "95": 853400064.0, + "96": 853400064.0, + "97": 853400064.0, + "98": 853400064.0, + "99": 853400064.0, + "100": 853400064.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 2677025280.0, + "52": 2677026304.0, + "53": 2677026304.0, + "54": 2677026304.0, + "55": 2677026304.0, + "56": 2677026304.0, + "57": 2677026304.0, + "58": 2677026304.0, + "59": 2677026304.0, + "60": 2677026304.0, + "61": 2677026304.0, + "62": 2677026304.0, + "63": 2677026304.0, + "64": 2677026304.0, + "65": 2677026304.0, + "66": 2677026304.0, + 
"67": 2677026304.0, + "68": 2677026304.0, + "69": 2677026304.0, + "70": 2677026304.0, + "71": 2677026304.0, + "72": 2677026304.0, + "73": 2677026304.0, + "74": 2677026304.0, + "75": 2677026304.0, + "76": 2677026304.0, + "77": 2677026304.0, + "78": 2677026304.0, + "79": 2677026304.0, + "80": 2677026304.0, + "81": 2677026304.0, + "82": 2677026304.0, + "83": 2677026304.0, + "84": 2677026304.0, + "85": 2677026304.0, + "86": 2677026304.0, + "87": 2677026304.0, + "88": 2677026304.0, + "89": 2677026304.0, + "90": 2677026304.0, + "91": 2677026304.0, + "92": 2677026304.0, + "93": 2677026304.0, + "94": 2677026304.0, + "95": 2677026304.0, + "96": 2677026304.0, + "97": 2677026304.0, + "98": 2677026304.0, + "99": 2677026304.0, + "100": 2677026304.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": "nan", + "52": 2.52584, + "53": 0.20392, + "54": 0.16414, + "55": 0.13847, + "56": 0.13338, + "57": 0.13357, + "58": 0.13375, + "59": 0.13394, + "60": 0.1334, + "61": 0.1341, + "62": 0.13391, + "63": 0.13459, + "64": 0.13308, + "65": 0.13494, + "66": 0.13405, + "67": 0.13432, + "68": 0.13481, + "69": 0.13446, + "70": 0.13476, + "71": 0.13398, + "72": 0.13448, + "73": 0.13508, + "74": 0.13535, + 
"75": 0.13502, + "76": 0.13588, + "77": 0.13483, + "78": 0.13626, + "79": 0.13542, + "80": 0.13571, + "81": 0.13587, + "82": 0.13461, + "83": 0.13533, + "84": 0.13399, + "85": 0.13532, + "86": 0.13468, + "87": 0.13492, + "88": 0.13529, + "89": 0.13664, + "90": 0.13526, + "91": 0.13377, + "92": 0.13367, + "93": 0.13265, + "94": 0.13355, + "95": 0.13376, + "96": 0.13303, + "97": 0.13448, + "98": 0.13371, + "99": 0.13395, + "100": 0.1334 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_dev_dgx_gb200.json index 49586883019..0322a9120bd 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_dev_dgx_gb200.json @@ -325,106 +325,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 4.10681, - "2": 0.30693, - "3": 0.30146, - "4": 0.29106, - "5": 0.29089, - "6": 0.29112, - "7": 0.29159, - "8": 0.29115, - "9": 0.29086, - "10": 0.63125, - "11": 0.2989, - "12": 0.29641, - "13": 0.29201, - "14": 0.29541, - "15": 0.29003, - "16": 0.36384, - "17": 0.29423, - "18": 0.30498, - "19": 0.30687, - "20": 0.30695, - "21": 0.30562, - "22": 0.29047, - "23": 0.30755, - "24": 0.30627, - "25": 0.65941, - "26": 0.30667, - "27": 0.31536, - "28": 0.30722, - "29": 0.30542, - "30": 0.30564, - "31": 0.3045, - "32": 0.30472, - "33": 0.30551, - "34": 0.30423, - "35": 0.3045, - "36": 0.30479, - "37": 0.30596, - "38": 0.30404, - "39": 0.30411, - "40": 0.30491, - "41": 0.3071, - "42": 0.30318, - "43": 0.30217, - "44": 0.30293, - "45": 0.3041, - "46": 0.30338, - "47": 0.3038, - "48": 0.30224, - "49": 0.30264, - "50": 0.3024, - "51": 0.36516, - "52": 0.42479, - "53": 0.43225, - "54": 0.37389, - "55": 0.34351, - "56": 0.66697, - "57": 0.30412, - "58": 0.30714, - 
"59": 0.31209, - "60": 0.33472, - "61": 0.36046, - "62": 0.39323, - "63": 0.4363, - "64": 0.46158, - "65": 0.43859, - "66": 0.3596, - "67": 0.34843, - "68": 0.69171, - "69": 0.35185, - "70": 0.34317, - "71": 0.34189, - "72": 0.3408, - "73": 0.34132, - "74": 0.33999, - "75": 0.33341, - "76": 0.339, - "77": 0.34005, - "78": 0.33524, - "79": 0.65413, - "80": 0.3407, - "81": 0.33061, - "82": 0.33345, - "83": 0.3333, - "84": 0.33362, - "85": 0.33251, - "86": 0.3337, - "87": 0.33386, - "88": 0.6509, - "89": 0.33263, - "90": 0.32972, - "91": 0.32543, - "92": 0.32519, - "93": 0.32484, - "94": 0.32156, - "95": 0.32526, - "96": 0.32111, - "97": 0.32404, - "98": 0.31936, - "99": 0.31881, - "100": 0.31797 + "1": "nan", + "2": 2.03297, + "3": 0.32699, + "4": 0.31597, + "5": 0.31552, + "6": 0.3147, + "7": 0.31402, + "8": 0.31397, + "9": 0.31445, + "10": 0.31986, + "11": 0.31186, + "12": 0.30888, + "13": 0.3063, + "14": 0.30631, + "15": 0.30635, + "16": 0.38476, + "17": 0.31287, + "18": 0.32251, + "19": 0.32317, + "20": 0.31607, + "21": 0.31688, + "22": 0.30059, + "23": 0.31719, + "24": 0.3226, + "25": 0.31854, + "26": 0.31658, + "27": 0.31847, + "28": 0.3176, + "29": 0.31877, + "30": 0.31787, + "31": 0.31758, + "32": 0.31563, + "33": 0.31399, + "34": 0.31522, + "35": 0.31891, + "36": 0.3187, + "37": 0.32382, + "38": 0.32042, + "39": 0.32237, + "40": 0.32145, + "41": 0.31906, + "42": 0.31768, + "43": 0.32041, + "44": 0.31937, + "45": 0.32089, + "46": 0.31845, + "47": 0.31938, + "48": 0.31552, + "49": 0.31733, + "50": 0.3196, + "51": 0.31383, + "52": 0.31412, + "53": 0.31249, + "54": 0.3127, + "55": 0.31762, + "56": 0.31919, + "57": 0.31929, + "58": 0.32119, + "59": 0.31667, + "60": 0.32308, + "61": 0.31738, + "62": 0.32278, + "63": 0.31714, + "64": 0.31073, + "65": 0.30929, + "66": 0.30856, + "67": 0.31027, + "68": 0.3103, + "69": 0.31124, + "70": 0.30886, + "71": 0.30892, + "72": 0.31237, + "73": 0.31647, + "74": 0.31733, + "75": 0.31764, + "76": 0.3165, + "77": 0.31656, + "78": 
0.3176, + "79": 0.31747, + "80": 0.3171, + "81": 0.31656, + "82": 0.3168, + "83": 0.31697, + "84": 0.3181, + "85": 0.31755, + "86": 0.31749, + "87": 0.31765, + "88": 0.31775, + "89": 0.31806, + "90": 0.31417, + "91": 0.64575, + "92": 0.3228, + "93": 0.3237, + "94": 0.32187, + "95": 0.32154, + "96": 0.32116, + "97": 0.33046, + "98": 0.35266, + "99": 0.32136, + "100": 0.32174 } }, "num-zeros": { diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_dev_dgx_gb200_2nd.json new file mode 100644 index 00000000000..121b3fe11b7 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_dev_dgx_gb200_2nd.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 10.23426, + "52": 10.13488, + "53": 10.35279, + "54": 10.26189, + "55": 10.20983, + "56": 9.99599, + "57": 9.87962, + "58": 10.13391, + "59": 9.92304, + "60": 9.85379, + "61": 9.97314, + "62": 10.211, + "63": 9.70514, + "64": 10.01457, + "65": 9.30759, + "66": 9.9366, + "67": 9.63221, + "68": 9.98219, + "69": 9.98048, + "70": 
9.92986, + "71": 9.81575, + "72": 9.79602, + "73": 9.69104, + "74": 9.20049, + "75": 9.61228, + "76": 9.28906, + "77": 10.19068, + "78": 9.86601, + "79": 9.53855, + "80": 9.5578, + "81": 9.63332, + "82": 9.82853, + "83": 9.47188, + "84": 9.54101, + "85": 9.74266, + "86": 9.2142, + "87": 9.7016, + "88": 9.86604, + "89": 9.72339, + "90": 9.92767, + "91": 9.47045, + "92": 9.46809, + "93": 9.21217, + "94": 8.94887, + "95": 9.62787, + "96": 9.6406, + "97": 9.40839, + "98": 9.77147, + "99": 9.00853, + "100": 9.51225 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 2080.0, + "52": 2200.0, + "53": 3412.0, + "54": 2641.0, + "55": 2229.0, + "56": 2244.0, + "57": 2057.0, + "58": 3223.0, + "59": 2431.0, + "60": 2650.0, + "61": 2712.0, + "62": 2995.0, + "63": 2816.0, + "64": 2860.0, + "65": 2015.0, + "66": 3176.0, + "67": 2529.0, + "68": 3108.0, + "69": 2873.0, + "70": 3540.0, + "71": 2904.0, + "72": 2693.0, + "73": 3253.0, + "74": 1981.0, + "75": 2780.0, + "76": 3465.0, + "77": 3649.0, + "78": 3593.0, + "79": 3981.0, + "80": 3458.0, + "81": 5181.0, + "82": 3334.0, + "83": 2956.0, + "84": 3527.0, + "85": 3711.0, + "86": 3209.0, + "87": 4133.0, + "88": 3443.0, + "89": 4295.0, + "90": 3801.0, + 
"91": 2958.0, + "92": 4311.0, + "93": 3544.0, + "94": 4264.0, + "95": 4042.0, + "96": 3849.0, + "97": 3974.0, + "98": 4971.0, + "99": 4071.0, + "100": 3363.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 414416384.0, + "52": 414416384.0, + "53": 414416384.0, + "54": 414416384.0, + "55": 414416384.0, + "56": 414416384.0, + "57": 414416384.0, + "58": 414416384.0, + "59": 414416384.0, + "60": 414416384.0, + "61": 414416384.0, + "62": 414416384.0, + "63": 414416384.0, + "64": 414416384.0, + "65": 414416384.0, + "66": 414416384.0, + "67": 414416384.0, + "68": 414416384.0, + "69": 414416384.0, + "70": 414416384.0, + "71": 414416384.0, + "72": 414416384.0, + "73": 414416384.0, + "74": 414416384.0, + "75": 414416384.0, + "76": 414416384.0, + "77": 414416384.0, + "78": 414416384.0, + "79": 414416384.0, + "80": 414416384.0, + "81": 414416384.0, + "82": 414416384.0, + "83": 414416384.0, + "84": 414416384.0, + "85": 414416384.0, + "86": 414416384.0, + "87": 414416384.0, + "88": 414416384.0, + "89": 414416384.0, + "90": 414416384.0, + "91": 414416384.0, + "92": 414416384.0, + "93": 414416384.0, + "94": 414416384.0, + "95": 414416384.0, + "96": 414416384.0, + "97": 414416384.0, + 
"98": 414416384.0, + "99": 414416384.0, + "100": 414416384.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 1595256320.0, + "52": 1595257344.0, + "53": 1595257344.0, + "54": 1595257344.0, + "55": 1595257344.0, + "56": 1595257344.0, + "57": 1595257344.0, + "58": 1595257344.0, + "59": 1595257344.0, + "60": 1595257344.0, + "61": 1595257344.0, + "62": 1595257344.0, + "63": 1595257344.0, + "64": 1595257344.0, + "65": 1595257344.0, + "66": 1595257344.0, + "67": 1595257344.0, + "68": 1595257344.0, + "69": 1595257344.0, + "70": 1595257344.0, + "71": 1595257344.0, + "72": 1595257344.0, + "73": 1595257344.0, + "74": 1595257344.0, + "75": 1595257344.0, + "76": 1595257344.0, + "77": 1595257344.0, + "78": 1595257344.0, + "79": 1595257344.0, + "80": 1595257344.0, + "81": 1595257344.0, + "82": 1595257344.0, + "83": 1595257344.0, + "84": 1595257344.0, + "85": 1595257344.0, + "86": 1595257344.0, + "87": 1595257344.0, + "88": 1595257344.0, + "89": 1595257344.0, + "90": 1595257344.0, + "91": 1595257344.0, + "92": 1595257344.0, + "93": 1595257344.0, + "94": 1595257344.0, + "95": 1595257344.0, + "96": 1595257344.0, + "97": 1595257344.0, + "98": 1595257344.0, + "99": 1595257344.0, + 
"100": 1595257344.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": "nan", + "52": 1.90434, + "53": 0.34839, + "54": 0.46608, + "55": 0.31197, + "56": 0.30786, + "57": 0.30592, + "58": 0.30764, + "59": 0.30985, + "60": 0.30933, + "61": 0.30847, + "62": 0.30963, + "63": 0.31127, + "64": 0.30722, + "65": 0.31035, + "66": 0.31029, + "67": 0.30982, + "68": 0.30966, + "69": 0.30823, + "70": 0.30879, + "71": 0.3091, + "72": 0.30555, + "73": 0.30938, + "74": 0.3098, + "75": 0.31215, + "76": 0.30896, + "77": 0.31066, + "78": 0.31001, + "79": 0.30673, + "80": 0.30699, + "81": 0.30771, + "82": 0.3072, + "83": 0.30839, + "84": 0.30892, + "85": 0.30911, + "86": 0.30528, + "87": 0.30757, + "88": 0.30812, + "89": 0.3083, + "90": 0.30825, + "91": 0.30638, + "92": 0.30467, + "93": 0.30582, + "94": 0.30847, + "95": 0.30633, + "96": 0.30614, + "97": 0.30644, + "98": 0.30185, + "99": 0.30221, + "100": 0.30191 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_gb200.json index 
3def3c8618f..b1ec74d48d3 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_gb200.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_gb200.json @@ -325,106 +325,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 4.43718, - "2": 0.32141, - "3": 0.34228, - "4": 0.3338, - "5": 0.33423, - "6": 0.33597, - "7": 0.33749, - "8": 0.33831, - "9": 0.34047, - "10": 0.33938, - "11": 0.3381, - "12": 0.34241, - "13": 0.35311, - "14": 0.35495, - "15": 0.33902, - "16": 0.42658, - "17": 0.3452, - "18": 0.35813, - "19": 0.35538, - "20": 0.36232, - "21": 0.36626, - "22": 0.3555, - "23": 0.36916, - "24": 0.3744, - "25": 0.37348, - "26": 0.36915, - "27": 0.37147, - "28": 0.36445, - "29": 0.36069, - "30": 0.35961, - "31": 0.35274, - "32": 0.35514, - "33": 0.35563, - "34": 0.35744, - "35": 0.35843, - "36": 0.35512, - "37": 0.35839, - "38": 0.35761, - "39": 0.35765, - "40": 0.62747, - "41": 0.35467, - "42": 0.35928, - "43": 0.35301, - "44": 0.35215, - "45": 0.35947, - "46": 0.35676, - "47": 0.65816, - "48": 0.35624, - "49": 0.35833, - "50": 0.35593, - "51": 0.38053, - "52": 0.74045, - "53": 0.36063, - "54": 0.36054, - "55": 0.363, - "56": 0.36264, - "57": 0.36262, - "58": 0.36213, - "59": 0.36223, - "60": 0.35979, - "61": 0.36002, - "62": 0.36456, - "63": 0.36092, - "64": 0.36222, - "65": 0.36214, - "66": 0.36393, - "67": 0.36348, - "68": 0.36404, - "69": 0.36256, - "70": 0.36106, - "71": 0.36265, - "72": 0.36127, - "73": 0.37126, - "74": 0.3637, - "75": 0.36407, - "76": 0.36415, - "77": 0.36331, - "78": 0.3641, - "79": 0.36546, - "80": 0.36427, - "81": 0.35664, - "82": 0.36196, - "83": 0.36259, - "84": 0.36282, - "85": 0.36131, - "86": 0.35889, - "87": 0.36236, - "88": 0.35979, - "89": 0.36186, - "90": 0.36471, - "91": 0.36565, - "92": 0.36403, - "93": 0.365, - "94": 0.36272, - "95": 0.36119, - "96": 0.36129, - "97": 0.36262, - "98": 0.36263, - "99": 0.36514, - 
"100": 0.36392 + "1": "nan", + "2": 2.06149, + "3": 0.29865, + "4": 0.28631, + "5": 0.28736, + "6": 0.28526, + "7": 0.29059, + "8": 0.28917, + "9": 0.28615, + "10": 0.29015, + "11": 0.28709, + "12": 0.28726, + "13": 0.28539, + "14": 0.28815, + "15": 0.28809, + "16": 0.36186, + "17": 0.29264, + "18": 0.30232, + "19": 0.30149, + "20": 0.3025, + "21": 0.30312, + "22": 0.28815, + "23": 0.30003, + "24": 0.30279, + "25": 0.30076, + "26": 0.31198, + "27": 0.30137, + "28": 0.30174, + "29": 0.30053, + "30": 0.2998, + "31": 0.30129, + "32": 0.30266, + "33": 0.30179, + "34": 0.30183, + "35": 0.30504, + "36": 0.30132, + "37": 0.30289, + "38": 0.30052, + "39": 0.29983, + "40": 0.29924, + "41": 0.30028, + "42": 0.29816, + "43": 0.30081, + "44": 0.3016, + "45": 0.30595, + "46": 0.30403, + "47": 0.30454, + "48": 0.30318, + "49": 0.30105, + "50": 0.30174, + "51": 0.3175, + "52": 0.29652, + "53": 0.29581, + "54": 0.29555, + "55": 0.29696, + "56": 0.29449, + "57": 0.64732, + "58": 0.30369, + "59": 0.30167, + "60": 0.30075, + "61": 0.29981, + "62": 0.30078, + "63": 0.31092, + "64": 0.30144, + "65": 0.29891, + "66": 0.2987, + "67": 0.29963, + "68": 0.30439, + "69": 0.29787, + "70": 0.3036, + "71": 0.30595, + "72": 0.29733, + "73": 0.29745, + "74": 0.30071, + "75": 0.29706, + "76": 0.2969, + "77": 0.29903, + "78": 0.29958, + "79": 0.29754, + "80": 0.30059, + "81": 0.29879, + "82": 0.30486, + "83": 0.29801, + "84": 0.29892, + "85": 0.2996, + "86": 0.29869, + "87": 0.30043, + "88": 0.29951, + "89": 0.29614, + "90": 0.29973, + "91": 0.30029, + "92": 0.29926, + "93": 0.29973, + "94": 0.29969, + "95": 0.30108, + "96": 0.29798, + "97": 0.29923, + "98": 0.29982, + "99": 0.29854, + "100": 0.3007 } }, "num-zeros": { diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_gb200_2nd.json new file mode 100644 index 
00000000000..ae8f01b3327 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_gb200_2nd.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 10.23426, + "52": 10.13488, + "53": 10.35279, + "54": 10.26189, + "55": 10.20983, + "56": 9.99599, + "57": 9.87962, + "58": 10.13391, + "59": 9.92304, + "60": 9.85379, + "61": 9.97314, + "62": 10.211, + "63": 9.70514, + "64": 10.01457, + "65": 9.30759, + "66": 9.9366, + "67": 9.63221, + "68": 9.98219, + "69": 9.98048, + "70": 9.92986, + "71": 9.81575, + "72": 9.79602, + "73": 9.69104, + "74": 9.20049, + "75": 9.61228, + "76": 9.28906, + "77": 10.19068, + "78": 9.86601, + "79": 9.53855, + "80": 9.5578, + "81": 9.63332, + "82": 9.82853, + "83": 9.47188, + "84": 9.54101, + "85": 9.74266, + "86": 9.2142, + "87": 9.7016, + "88": 9.86604, + "89": 9.72339, + "90": 9.92767, + "91": 9.47045, + "92": 9.46809, + "93": 9.21217, + "94": 8.94887, + "95": 9.62787, + "96": 9.6406, + "97": 9.40839, + "98": 9.77147, + "99": 9.00853, + "100": 9.51225 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": 
"nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 2080.0, + "52": 2200.0, + "53": 3412.0, + "54": 2641.0, + "55": 2229.0, + "56": 2244.0, + "57": 2057.0, + "58": 3223.0, + "59": 2431.0, + "60": 2650.0, + "61": 2712.0, + "62": 2995.0, + "63": 2816.0, + "64": 2860.0, + "65": 2015.0, + "66": 3176.0, + "67": 2529.0, + "68": 3108.0, + "69": 2873.0, + "70": 3540.0, + "71": 2904.0, + "72": 2693.0, + "73": 3253.0, + "74": 1981.0, + "75": 2780.0, + "76": 3465.0, + "77": 3649.0, + "78": 3593.0, + "79": 3981.0, + "80": 3458.0, + "81": 5181.0, + "82": 3334.0, + "83": 2956.0, + "84": 3527.0, + "85": 3711.0, + "86": 3209.0, + "87": 4133.0, + "88": 3443.0, + "89": 4295.0, + "90": 3801.0, + "91": 2958.0, + "92": 4311.0, + "93": 3544.0, + "94": 4264.0, + "95": 4042.0, + "96": 3849.0, + "97": 3974.0, + "98": 4971.0, + "99": 4071.0, + "100": 3363.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + 
"27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 415464960.0, + "52": 415464960.0, + "53": 415464960.0, + "54": 415464960.0, + "55": 415464960.0, + "56": 415464960.0, + "57": 415464960.0, + "58": 415464960.0, + "59": 415464960.0, + "60": 415464960.0, + "61": 415464960.0, + "62": 415464960.0, + "63": 415464960.0, + "64": 415464960.0, + "65": 415464960.0, + "66": 415464960.0, + "67": 415464960.0, + "68": 415464960.0, + "69": 415464960.0, + "70": 415464960.0, + "71": 415464960.0, + "72": 415464960.0, + "73": 415464960.0, + "74": 415464960.0, + "75": 415464960.0, + "76": 415464960.0, + "77": 415464960.0, + "78": 415464960.0, + "79": 415464960.0, + "80": 415464960.0, + "81": 415464960.0, + "82": 415464960.0, + "83": 415464960.0, + "84": 415464960.0, + "85": 415464960.0, + "86": 415464960.0, + "87": 415464960.0, + "88": 415464960.0, + "89": 415464960.0, + "90": 415464960.0, + "91": 415464960.0, + "92": 415464960.0, + "93": 415464960.0, + "94": 415464960.0, + "95": 415464960.0, + "96": 415464960.0, + "97": 415464960.0, + "98": 415464960.0, + "99": 415464960.0, + "100": 415464960.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + 
"33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": 1596304896.0, + "52": 1596305920.0, + "53": 1596305920.0, + "54": 1596305920.0, + "55": 1596305920.0, + "56": 1596305920.0, + "57": 1596305920.0, + "58": 1596305920.0, + "59": 1596305920.0, + "60": 1596305920.0, + "61": 1596305920.0, + "62": 1596305920.0, + "63": 1596305920.0, + "64": 1596305920.0, + "65": 1596305920.0, + "66": 1596305920.0, + "67": 1596305920.0, + "68": 1596305920.0, + "69": 1596305920.0, + "70": 1596305920.0, + "71": 1596305920.0, + "72": 1596305920.0, + "73": 1596305920.0, + "74": 1596305920.0, + "75": 1596305920.0, + "76": 1596305920.0, + "77": 1596305920.0, + "78": 1596305920.0, + "79": 1596305920.0, + "80": 1596305920.0, + "81": 1596305920.0, + "82": 1596305920.0, + "83": 1596305920.0, + "84": 1596305920.0, + "85": 1596305920.0, + "86": 1596305920.0, + "87": 1596305920.0, + "88": 1596305920.0, + "89": 1596305920.0, + "90": 1596305920.0, + "91": 1596305920.0, + "92": 1596305920.0, + "93": 1596305920.0, + "94": 1596305920.0, + "95": 1596305920.0, + "96": 1596305920.0, + "97": 1596305920.0, + "98": 1596305920.0, + "99": 1596305920.0, + "100": 1596305920.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": "nan", + "19": "nan", + "20": "nan", + "21": "nan", + "22": "nan", + "23": "nan", + "24": "nan", + "25": "nan", + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": 
"nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan", + "51": "nan", + "52": 1.8615, + "53": 0.31355, + "54": 0.3044, + "55": 0.30701, + "56": 0.30736, + "57": 0.31773, + "58": 0.30739, + "59": 0.30021, + "60": 0.30271, + "61": 0.31062, + "62": 0.30743, + "63": 0.30357, + "64": 0.30729, + "65": 0.3028, + "66": 0.30068, + "67": 0.30021, + "68": 0.30051, + "69": 0.30257, + "70": 0.30291, + "71": 0.30183, + "72": 0.30259, + "73": 0.30032, + "74": 0.3007, + "75": 0.30192, + "76": 0.30188, + "77": 0.30125, + "78": 0.30028, + "79": 0.3024, + "80": 0.3115, + "81": 0.3014, + "82": 0.3023, + "83": 0.30861, + "84": 0.30129, + "85": 0.30185, + "86": 0.29936, + "87": 0.30094, + "88": 0.3001, + "89": 0.2993, + "90": 0.2987, + "91": 0.30006, + "92": 0.30091, + "93": 0.30097, + "94": 0.29909, + "95": 0.30113, + "96": 0.29925, + "97": 0.29979, + "98": 0.30241, + "99": 0.30073, + "100": 0.30251 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgx_gb200.json new file mode 100644 index 00000000000..5c353a70683 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgx_gb200.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.81386, + "2": 10.8312, + "3": 10.80976, + "4": 10.82429, + "5": 10.84553, + "6": 10.85179, + "7": 10.83654, + "8": 10.83644, + "9": 10.84704, + "10": 10.78251, + "11": 10.85041, + "12": 10.84824, + "13": 10.86042, + "14": 10.86568, + "15": 10.81163, + "16": 10.79013, + "17": 10.76156, + "18": 10.78107, 
+ "19": 10.78131, + "20": 10.70284, + "21": 10.67616, + "22": 10.51867, + "23": 10.70469, + "24": 10.57188, + "25": 10.51628, + "26": 10.58103, + "27": 10.59482, + "28": 10.56661, + "29": 10.58588, + "30": 10.33634, + "31": 10.08426, + "32": 10.4506, + "33": 10.4457, + "34": 10.19772, + "35": 10.25794, + "36": 10.21991, + "37": 10.34564, + "38": 10.18704, + "39": 10.39388, + "40": 10.08233, + "41": 10.13235, + "42": 10.21151, + "43": 9.83045, + "44": 9.94704, + "45": 9.84037, + "46": 9.81454, + "47": 10.12979, + "48": 9.85142, + "49": 9.52861, + "50": 9.91131 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4720.0, + "2": 4914.0, + "3": 4776.0, + "4": 4671.0, + "5": 5518.0, + "6": 5824.0, + "7": 4980.0, + "8": 4796.0, + "9": 5167.0, + "10": 4355.0, + "11": 5673.0, + "12": 5426.0, + "13": 5583.0, + "14": 5744.0, + "15": 5108.0, + "16": 5421.0, + "17": 5053.0, + "18": 5299.0, + "19": 5132.0, + "20": 4876.0, + "21": 5310.0, + "22": 4667.0, + "23": 5540.0, + "24": 5085.0, + "25": 4723.0, + "26": 5278.0, + "27": 5336.0, + "28": 5707.0, + "29": 6154.0, + "30": 5376.0, + "31": 4689.0, + "32": 5934.0, + "33": 6223.0, + "34": 5379.0, + "35": 5828.0, + "36": 5708.0, + "37": 6494.0, + "38": 6186.0, + "39": 6680.0, + "40": 6110.0, + "41": 6110.0, + "42": 6339.0, + "43": 5869.0, + "44": 5905.0, + "45": 6036.0, + "46": 5862.0, + "47": 6757.0, + "48": 6445.0, + "49": 6445.0, + "50": 6776.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1144114688.0, + "2": 1144112128.0, + "3": 1144114688.0, + "4": 1144114176.0, + "5": 1144115200.0, + "6": 1144113664.0, + "7": 1144114688.0, + "8": 1144112128.0, + "9": 1144112640.0, + "10": 1144114688.0, + "11": 1144114176.0, + "12": 1144115200.0, + "13": 1144113664.0, + "14": 1144116224.0, + "15": 1144114688.0, + "16": 1144113664.0, + "17": 1144111616.0, + "18": 1144113664.0, + "19": 1144114176.0, + "20": 1144114688.0, + 
"21": 1144112128.0, + "22": 1144114176.0, + "23": 1144112640.0, + "24": 1144111616.0, + "25": 1144115712.0, + "26": 1144116736.0, + "27": 1144114688.0, + "28": 1144112128.0, + "29": 1144112640.0, + "30": 1144114176.0, + "31": 1144110080.0, + "32": 1144113152.0, + "33": 1144113664.0, + "34": 1144113664.0, + "35": 1144111104.0, + "36": 1144113664.0, + "37": 1144115200.0, + "38": 1144114176.0, + "39": 1144112128.0, + "40": 1144112128.0, + "41": 1144110080.0, + "42": 1144111616.0, + "43": 1144108544.0, + "44": 1144111616.0, + "45": 1144114688.0, + "46": 1144113664.0, + "47": 1144112128.0, + "48": 1144111616.0, + "49": 1144113664.0, + "50": 1144111616.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1592113152.0, + "2": 2049260032.0, + "3": 2049305088.0, + "4": 2049981952.0, + "5": 2049981952.0, + "6": 2049981952.0, + "7": 2049981952.0, + "8": 2049981952.0, + "9": 2049981952.0, + "10": 2050618880.0, + "11": 2050618880.0, + "12": 2050618880.0, + "13": 2050618880.0, + "14": 2050618880.0, + "15": 2050618880.0, + "16": 2050618880.0, + "17": 2050618880.0, + "18": 2050618880.0, + "19": 2050618880.0, + "20": 2050618880.0, + "21": 2050618880.0, + "22": 2050618880.0, + "23": 2050618880.0, + "24": 2050618880.0, + "25": 2050618880.0, + "26": 2050618880.0, + "27": 2050618880.0, + "28": 2050618880.0, + "29": 2050618880.0, + "30": 2050618880.0, + "31": 2050618880.0, + "32": 2050618880.0, + "33": 2050618880.0, + "34": 2050618880.0, + "35": 2050618880.0, + "36": 2050618880.0, + "37": 2050618880.0, + "38": 2050618880.0, + "39": 2050618880.0, + "40": 2050618880.0, + "41": 2050618880.0, + "42": 2050618880.0, + "43": 2050618880.0, + "44": 2050618880.0, + "45": 2050618880.0, + "46": 2050618880.0, + "47": 2050618880.0, + "48": 2050618880.0, + "49": 2050618880.0, + "50": 2050618880.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": "nan", + "2": 7.71822, + 
"3": 0.41543, + "4": 0.48385, + "5": 0.43238, + "6": 0.4005, + "7": 0.37702, + "8": 0.34877, + "9": 0.34747, + "10": 0.33405, + "11": 0.35433, + "12": 0.4949, + "13": 0.78498, + "14": 0.34468, + "15": 0.34552, + "16": 0.34055, + "17": 0.34051, + "18": 0.33811, + "19": 0.34319, + "20": 0.33811, + "21": 0.34085, + "22": 0.35398, + "23": 0.33545, + "24": 0.3393, + "25": 0.34161, + "26": 0.33744, + "27": 0.33573, + "28": 0.33954, + "29": 0.33344, + "30": 0.33741, + "31": 0.34691, + "32": 0.33581, + "33": 0.3395, + "34": 0.34333, + "35": 0.3424, + "36": 0.34673, + "37": 0.33697, + "38": 0.33705, + "39": 0.33394, + "40": 0.33964, + "41": 0.34276, + "42": 0.3401, + "43": 0.34688, + "44": 0.3413, + "45": 0.33867, + "46": 0.34719, + "47": 0.34606, + "48": 0.35149, + "49": 0.34219, + "50": 0.33349 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgx_gb200.json new file mode 100644 index 00000000000..51442760296 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgx_gb200.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.81386, + "2": 10.8312, + "3": 10.80993, + "4": 10.82435, + "5": 10.84542, + "6": 10.85174, + "7": 10.8364, + "8": 10.83623, + "9": 10.84765, + "10": 10.7827, + "11": 10.85034, + "12": 10.84795, + "13": 10.86022, + "14": 10.86553, + "15": 10.81126, + "16": 10.78981, + "17": 10.76199, + "18": 10.78188, + "19": 10.78067, + "20": 10.7031, + "21": 10.67657, + "22": 10.51865, + "23": 10.70519, + "24": 10.57167, + "25": 10.51611, + "26": 10.58127, + "27": 10.59422, + "28": 10.56658, + "29": 10.58518, + "30": 10.33581, + "31": 10.08412, 
+ "32": 10.45077, + "33": 10.4461, + "34": 10.19766, + "35": 10.2585, + "36": 10.21965, + "37": 10.34543, + "38": 10.18739, + "39": 10.39385, + "40": 10.0823, + "41": 10.13221, + "42": 10.21174, + "43": 9.83034, + "44": 9.9469, + "45": 9.84028, + "46": 9.81421, + "47": 10.12976, + "48": 9.85137, + "49": 9.52825, + "50": 9.91126 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4660.0, + "2": 4808.0, + "3": 4883.0, + "4": 4655.0, + "5": 5526.0, + "6": 5649.0, + "7": 4873.0, + "8": 4733.0, + "9": 5165.0, + "10": 4367.0, + "11": 5583.0, + "12": 5440.0, + "13": 5659.0, + "14": 5572.0, + "15": 5101.0, + "16": 5484.0, + "17": 5115.0, + "18": 5200.0, + "19": 5406.0, + "20": 4960.0, + "21": 5420.0, + "22": 4791.0, + "23": 5566.0, + "24": 5019.0, + "25": 4679.0, + "26": 5246.0, + "27": 5433.0, + "28": 5907.0, + "29": 6065.0, + "30": 5409.0, + "31": 4827.0, + "32": 5809.0, + "33": 6243.0, + "34": 5520.0, + "35": 5592.0, + "36": 5754.0, + "37": 6732.0, + "38": 6330.0, + "39": 6779.0, + "40": 6198.0, + "41": 6001.0, + "42": 6274.0, + "43": 5876.0, + "44": 6046.0, + "45": 6084.0, + "46": 5925.0, + "47": 6772.0, + "48": 6415.0, + "49": 6494.0, + "50": 6648.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1144114688.0, + "2": 1144112128.0, + "3": 1144114688.0, + "4": 1144115200.0, + "5": 1144115200.0, + "6": 1144113664.0, + "7": 1144114688.0, + "8": 1144113152.0, + "9": 1144112640.0, + "10": 1144114688.0, + "11": 1144114176.0, + "12": 1144115200.0, + "13": 1144114176.0, + "14": 1144115712.0, + "15": 1144115712.0, + "16": 1144113664.0, + "17": 1144111616.0, + "18": 1144113664.0, + "19": 1144114688.0, + "20": 1144113664.0, + "21": 1144113152.0, + "22": 1144114176.0, + "23": 1144113664.0, + "24": 1144111616.0, + "25": 1144115712.0, + "26": 1144116224.0, + "27": 1144114688.0, + "28": 1144112128.0, + "29": 1144113152.0, + "30": 1144114176.0, + "31": 
1144109568.0, + "32": 1144113152.0, + "33": 1144114176.0, + "34": 1144113664.0, + "35": 1144111104.0, + "36": 1144113664.0, + "37": 1144115200.0, + "38": 1144114688.0, + "39": 1144112128.0, + "40": 1144111616.0, + "41": 1144110080.0, + "42": 1144113152.0, + "43": 1144109568.0, + "44": 1144111616.0, + "45": 1144115200.0, + "46": 1144112640.0, + "47": 1144112128.0, + "48": 1144111616.0, + "49": 1144113664.0, + "50": 1144111616.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1592113152.0, + "2": 2049344000.0, + "3": 2049344000.0, + "4": 2049852928.0, + "5": 2050200576.0, + "6": 2050200576.0, + "7": 2050200576.0, + "8": 2050200576.0, + "9": 2050200576.0, + "10": 2050613760.0, + "11": 2050613760.0, + "12": 2050613760.0, + "13": 2050613760.0, + "14": 2050613760.0, + "15": 2050613760.0, + "16": 2050613760.0, + "17": 2050613760.0, + "18": 2050613760.0, + "19": 2050613760.0, + "20": 2050613760.0, + "21": 2050613760.0, + "22": 2050613760.0, + "23": 2050613760.0, + "24": 2050613760.0, + "25": 2050613760.0, + "26": 2050613760.0, + "27": 2050613760.0, + "28": 2050613760.0, + "29": 2050613760.0, + "30": 2050613760.0, + "31": 2050613760.0, + "32": 2050613760.0, + "33": 2050613760.0, + "34": 2050613760.0, + "35": 2050613760.0, + "36": 2050613760.0, + "37": 2050613760.0, + "38": 2050613760.0, + "39": 2050613760.0, + "40": 2050613760.0, + "41": 2050613760.0, + "42": 2050613760.0, + "43": 2050613760.0, + "44": 2050613760.0, + "45": 2050613760.0, + "46": 2050613760.0, + "47": 2050613760.0, + "48": 2050613760.0, + "49": 2050613760.0, + "50": 2050613760.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": "nan", + "2": 6.272, + "3": 0.37646, + "4": 0.38139, + "5": 0.36006, + "6": 0.35438, + "7": 0.37532, + "8": 0.34896, + "9": 0.34666, + "10": 0.34575, + "11": 0.35016, + "12": 0.34334, + "13": 0.34313, + "14": 0.34121, + "15": 0.34333, + "16": 0.33917, + 
"17": 0.34414, + "18": 0.34158, + "19": 0.33904, + "20": 0.34192, + "21": 0.34305, + "22": 0.35491, + "23": 0.34584, + "24": 0.34162, + "25": 0.34733, + "26": 0.34153, + "27": 0.34246, + "28": 0.34, + "29": 0.33893, + "30": 0.34315, + "31": 0.3468, + "32": 0.34193, + "33": 0.33765, + "34": 0.34671, + "35": 0.33955, + "36": 0.34134, + "37": 0.33879, + "38": 0.34103, + "39": 0.33784, + "40": 0.33992, + "41": 0.3506, + "42": 0.33836, + "43": 0.34282, + "44": 0.33978, + "45": 0.339, + "46": 0.34898, + "47": 0.34512, + "48": 0.35552, + "49": 0.34616, + "50": 0.33258 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_gb200.json new file mode 100644 index 00000000000..b804ba57a90 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_gb200.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.81772, + "2": 10.82538, + "3": 10.82315, + "4": 10.7893, + "5": 10.84414, + "6": 10.85629, + "7": 10.82859, + "8": 10.8301, + "9": 10.84207, + "10": 10.78601, + "11": 10.85733, + "12": 10.84663, + "13": 10.86562, + "14": 10.86914, + "15": 10.81232, + "16": 10.80865, + "17": 10.77965, + "18": 10.80508, + "19": 10.79288, + "20": 10.74264, + "21": 10.72495, + "22": 10.58933, + "23": 10.73854, + "24": 10.63021, + "25": 10.58736, + "26": 10.63591, + "27": 10.66781, + "28": 10.64617, + "29": 10.65842, + "30": 10.44627, + "31": 10.21299, + "32": 10.53989, + "33": 10.52788, + "34": 10.30656, + "35": 10.35429, + "36": 10.31411, + "37": 10.43313, + "38": 10.29142, + "39": 10.47282, + "40": 10.18336, + 
"41": 10.24081, + "42": 10.30294, + "43": 9.95174, + "44": 10.05781, + "45": 9.9572, + "46": 9.93655, + "47": 10.22836, + "48": 9.95329, + "49": 9.6607, + "50": 9.99855, + "51": 9.94973, + "52": 9.84349, + "53": 10.14413, + "54": 10.04737, + "55": 9.98385, + "56": 9.71898, + "57": 9.5883, + "58": 9.92285, + "59": 9.67628, + "60": 9.60379, + "61": 9.78734, + "62": 10.06656, + "63": 9.47521, + "64": 9.85036, + "65": 9.03212, + "66": 9.78289, + "67": 9.44253, + "68": 9.85795, + "69": 9.85298, + "70": 9.7992, + "71": 9.6974, + "72": 9.66103, + "73": 9.56335, + "74": 9.05976, + "75": 9.50058, + "76": 9.18716, + "77": 10.12117, + "78": 9.78252, + "79": 9.44971, + "80": 9.47021, + "81": 9.54374, + "82": 9.75396, + "83": 9.39966, + "84": 9.46977, + "85": 9.67727, + "86": 9.13918, + "87": 9.64053, + "88": 9.81152, + "89": 9.6769, + "90": 9.8722, + "91": 9.41711, + "92": 9.42414, + "93": 9.1643, + "94": 8.903, + "95": 9.57911, + "96": 9.5909, + "97": 9.35398, + "98": 9.73253, + "99": 8.96675, + "100": 9.46267 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 31327.0, + "2": 32688.0, + "3": 32823.0, + "4": 31506.0, + "5": 36384.0, + "6": 37608.0, + "7": 35129.0, + "8": 31069.0, + "9": 34564.0, + "10": 29747.0, + "11": 38781.0, + "12": 35732.0, + "13": 37194.0, + "14": 38338.0, + "15": 35011.0, + "16": 36251.0, + "17": 34823.0, + "18": 35457.0, + "19": 35578.0, + "20": 32707.0, + "21": 33645.0, + "22": 30136.0, + "23": 38476.0, + "24": 32292.0, + "25": 30916.0, + "26": 34382.0, + "27": 36106.0, + "28": 37666.0, + "29": 38421.0, + "30": 33015.0, + "31": 30489.0, + "32": 36688.0, + "33": 38306.0, + "34": 33425.0, + "35": 34129.0, + "36": 35506.0, + "37": 38441.0, + "38": 35394.0, + "39": 38939.0, + "40": 36115.0, + "41": 36452.0, + "42": 37245.0, + "43": 34000.0, + "44": 33879.0, + "45": 36293.0, + "46": 37265.0, + "47": 40947.0, + "48": 36423.0, + "49": 35090.0, + "50": 40022.0, + "51": 37599.0, + "52": 36874.0, + "53": 
42329.0, + "54": 40750.0, + "55": 37208.0, + "56": 39947.0, + "57": 36219.0, + "58": 42369.0, + "59": 39714.0, + "60": 39697.0, + "61": 40288.0, + "62": 44682.0, + "63": 37743.0, + "64": 43466.0, + "65": 40862.0, + "66": 45025.0, + "67": 40213.0, + "68": 40169.0, + "69": 40885.0, + "70": 45480.0, + "71": 41411.0, + "72": 40544.0, + "73": 45712.0, + "74": 34875.0, + "75": 39109.0, + "76": 45477.0, + "77": 45742.0, + "78": 47634.0, + "79": 48400.0, + "80": 46578.0, + "81": 50032.0, + "82": 49469.0, + "83": 45158.0, + "84": 45794.0, + "85": 49099.0, + "86": 45075.0, + "87": 49153.0, + "88": 47648.0, + "89": 49368.0, + "90": 49965.0, + "91": 44550.0, + "92": 46072.0, + "93": 46606.0, + "94": 47182.0, + "95": 47865.0, + "96": 50348.0, + "97": 46303.0, + "98": 49697.0, + "99": 48948.0, + "100": 44134.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 964398080.0, + "2": 964398592.0, + "3": 964397056.0, + "4": 964396032.0, + "5": 964397056.0, + "6": 964397568.0, + "7": 964397056.0, + "8": 964396544.0, + "9": 964396544.0, + "10": 964396032.0, + "11": 964397056.0, + "12": 964393984.0, + "13": 964396544.0, + "14": 964399104.0, + "15": 964396544.0, + "16": 964397568.0, + "17": 964399104.0, + "18": 964397568.0, + "19": 964397056.0, + "20": 964398080.0, + "21": 964397056.0, + "22": 964394496.0, + "23": 964396032.0, + "24": 964395520.0, + "25": 964395008.0, + "26": 964396032.0, + "27": 964397568.0, + "28": 964396032.0, + "29": 964398080.0, + "30": 964397056.0, + "31": 964390912.0, + "32": 965370880.0, + "33": 964398080.0, + "34": 964396544.0, + "35": 964395008.0, + "36": 964395008.0, + "37": 964396032.0, + "38": 964397056.0, + "39": 964397056.0, + "40": 964398080.0, + "41": 964390400.0, + "42": 964396032.0, + "43": 964393472.0, + "44": 964394496.0, + "45": 964396032.0, + "46": 964390912.0, + "47": 964396032.0, + "48": 964389888.0, + "49": 964392960.0, + "50": 964396032.0, + "51": 964395008.0, + "52": 
964391936.0, + "53": 964392960.0, + "54": 964390912.0, + "55": 964390400.0, + "56": 964393984.0, + "57": 964384768.0, + "58": 964389888.0, + "59": 964388352.0, + "60": 964390912.0, + "61": 964396032.0, + "62": 964393472.0, + "63": 964391424.0, + "64": 964388864.0, + "65": 964380672.0, + "66": 964391936.0, + "67": 964391936.0, + "68": 964396032.0, + "69": 964390400.0, + "70": 964392448.0, + "71": 964392448.0, + "72": 964388352.0, + "73": 964390912.0, + "74": 964385792.0, + "75": 964396032.0, + "76": 964396544.0, + "77": 964395008.0, + "78": 964386816.0, + "79": 964391936.0, + "80": 964388864.0, + "81": 964390400.0, + "82": 964391936.0, + "83": 964390912.0, + "84": 964388352.0, + "85": 964391424.0, + "86": 964390912.0, + "87": 964393984.0, + "88": 964390400.0, + "89": 964391424.0, + "90": 964391936.0, + "91": 964391424.0, + "92": 964391424.0, + "93": 964391936.0, + "94": 964392448.0, + "95": 964391936.0, + "96": 964388352.0, + "97": 964390400.0, + "98": 964392448.0, + "99": 964390912.0, + "100": 964389888.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2512260096.0, + "2": 2777459200.0, + "3": 2777459200.0, + "4": 2777459200.0, + "5": 2777459200.0, + "6": 2778216960.0, + "7": 2778216960.0, + "8": 2778216960.0, + "9": 2778216960.0, + "10": 2778216960.0, + "11": 2778216960.0, + "12": 2778216960.0, + "13": 2778216960.0, + "14": 2778216960.0, + "15": 2778216960.0, + "16": 2778216960.0, + "17": 2778216960.0, + "18": 2778216960.0, + "19": 2778216960.0, + "20": 2778216960.0, + "21": 2778216960.0, + "22": 2778216960.0, + "23": 2778216960.0, + "24": 2778216960.0, + "25": 2778216960.0, + "26": 2778216960.0, + "27": 2778216960.0, + "28": 2778216960.0, + "29": 2778216960.0, + "30": 2778216960.0, + "31": 2778216960.0, + "32": 2778216960.0, + "33": 2778216960.0, + "34": 2778216960.0, + "35": 2778216960.0, + "36": 2778216960.0, + "37": 2778216960.0, + "38": 2778216960.0, + "39": 2778216960.0, + "40": 
2778216960.0, + "41": 2778216960.0, + "42": 2778216960.0, + "43": 2778216960.0, + "44": 2778216960.0, + "45": 2778216960.0, + "46": 2778216960.0, + "47": 2778216960.0, + "48": 2778216960.0, + "49": 2778216960.0, + "50": 2778216960.0, + "51": 2778216960.0, + "52": 2778216960.0, + "53": 2778216960.0, + "54": 2778216960.0, + "55": 2778216960.0, + "56": 2778216960.0, + "57": 2778216960.0, + "58": 2778216960.0, + "59": 2778216960.0, + "60": 2778216960.0, + "61": 2778216960.0, + "62": 2778216960.0, + "63": 2778216960.0, + "64": 2778216960.0, + "65": 2778216960.0, + "66": 2778216960.0, + "67": 2778216960.0, + "68": 2778216960.0, + "69": 2778216960.0, + "70": 2778216960.0, + "71": 2778216960.0, + "72": 2778216960.0, + "73": 2778216960.0, + "74": 2778216960.0, + "75": 2778216960.0, + "76": 2778216960.0, + "77": 2778216960.0, + "78": 2778216960.0, + "79": 2778216960.0, + "80": 2778216960.0, + "81": 2778216960.0, + "82": 2778216960.0, + "83": 2778216960.0, + "84": 2778216960.0, + "85": 2778216960.0, + "86": 2778216960.0, + "87": 2778216960.0, + "88": 2778216960.0, + "89": 2778216960.0, + "90": 2778216960.0, + "91": 2778216960.0, + "92": 2778216960.0, + "93": 2778216960.0, + "94": 2778216960.0, + "95": 2778216960.0, + "96": 2778216960.0, + "97": 2778216960.0, + "98": 2778216960.0, + "99": 2778216960.0, + "100": 2778216960.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": 6.33917, + "3": 0.2744, + "4": 0.25641, + "5": 0.24878, + "6": 0.24537, + "7": 0.2438, + "8": 0.23878, + "9": 0.24702, + "10": 0.23746, + "11": 0.23512, + "12": 0.22956, + "13": 0.2347, + "14": 0.23605, + "15": 0.24017, + "16": 0.23204, + "17": 0.2327, + "18": 0.23371, + "19": 0.23278, + "20": 0.23324, + "21": 0.2375, + "22": 0.2357, + "23": 0.23341, + "24": 0.23508, + "25": 0.23292, + "26": 0.23763, + "27": 0.23487, + "28": 0.23071, + "29": 0.23154, + "30": 0.23464, + "31": 0.23829, + "32": 0.22989, + "33": 0.23328, + "34": 0.23409, 
+ "35": 0.23024, + "36": 0.23774, + "37": 0.23416, + "38": 0.23657, + "39": 0.23087, + "40": 0.23163, + "41": 0.23724, + "42": 0.23245, + "43": 0.23545, + "44": 0.23041, + "45": 0.23512, + "46": 0.23935, + "47": 0.23571, + "48": 0.2329, + "49": 0.25544, + "50": 0.22697, + "51": 0.27515, + "52": 0.69001, + "53": 0.24129, + "54": 0.23155, + "55": 0.24045, + "56": 0.24512, + "57": 0.24802, + "58": 0.23433, + "59": 0.3274, + "60": 0.23221, + "61": 0.23713, + "62": 0.24042, + "63": 0.25806, + "64": 0.2355, + "65": 0.27386, + "66": 0.68273, + "67": 0.30343, + "68": 0.26428, + "69": 0.25274, + "70": 0.24031, + "71": 0.25644, + "72": 0.24947, + "73": 0.2737, + "74": 0.26515, + "75": 0.25101, + "76": 0.27258, + "77": 0.65643, + "78": 0.25055, + "79": 0.26819, + "80": 0.24291, + "81": 0.24807, + "82": 0.24385, + "83": 0.24932, + "84": 0.24366, + "85": 0.25449, + "86": 0.28807, + "87": 0.25052, + "88": 0.25388, + "89": 0.24876, + "90": 0.24712, + "91": 0.27209, + "92": 0.25942, + "93": 0.26516, + "94": 0.27795, + "95": 0.25093, + "96": 0.58451, + "97": 0.26354, + "98": 0.24591, + "99": 0.2477, + "100": 0.24515 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_gb200.json new file mode 100644 index 00000000000..3dd007cc9ec --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_gb200.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.7915, + "2": 10.8072, + "3": 10.79214, + "4": 10.77571, + "5": 10.82495, + "6": 10.83191, + "7": 10.82037, + "8": 10.81565, + "9": 10.81976, + "10": 10.7695, + "11": 10.8454, + "12": 10.83063, + 
"13": 10.83651, + "14": 10.85696, + "15": 10.80631, + "16": 10.78763, + "17": 10.75856, + "18": 10.79234, + "19": 10.78331, + "20": 10.73181, + "21": 10.71017, + "22": 10.57574, + "23": 10.71599, + "24": 10.62049, + "25": 10.58266, + "26": 10.61764, + "27": 10.65105, + "28": 10.63303, + "29": 10.63022, + "30": 10.44254, + "31": 10.20049, + "32": 10.52014, + "33": 10.50814, + "34": 10.29535, + "35": 10.33643, + "36": 10.30247, + "37": 10.41766, + "38": 10.28067, + "39": 10.46149, + "40": 10.18213, + "41": 10.21349, + "42": 10.28426, + "43": 9.9557, + "44": 10.05793, + "45": 9.9574, + "46": 9.93571, + "47": 10.22719, + "48": 9.96561, + "49": 9.66581, + "50": 10.00922, + "51": 9.94826, + "52": 9.84653, + "53": 10.14876, + "54": 10.03737, + "55": 9.97454, + "56": 9.71384, + "57": 9.5955, + "58": 9.92044, + "59": 9.67604, + "60": 9.61264, + "61": 9.79194, + "62": 10.05699, + "63": 9.47838, + "64": 9.84479, + "65": 9.03861, + "66": 9.78386, + "67": 9.43595, + "68": 9.85188, + "69": 9.84445, + "70": 9.79288, + "71": 9.69163, + "72": 9.64893, + "73": 9.55502, + "74": 9.04736, + "75": 9.49186, + "76": 9.17766, + "77": 10.11289, + "78": 9.7687, + "79": 9.43966, + "80": 9.45416, + "81": 9.53142, + "82": 9.7541, + "83": 9.38201, + "84": 9.46121, + "85": 9.66928, + "86": 9.13531, + "87": 9.63413, + "88": 9.8011, + "89": 9.66658, + "90": 9.86173, + "91": 9.39963, + "92": 9.41066, + "93": 9.14665, + "94": 8.8869, + "95": 9.56959, + "96": 9.57609, + "97": 9.34309, + "98": 9.72749, + "99": 8.96222, + "100": 9.44903 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 31217.0, + "2": 33106.0, + "3": 33596.0, + "4": 31946.0, + "5": 36783.0, + "6": 37252.0, + "7": 35314.0, + "8": 31970.0, + "9": 34937.0, + "10": 29900.0, + "11": 38039.0, + "12": 34886.0, + "13": 37108.0, + "14": 37755.0, + "15": 35069.0, + "16": 36687.0, + "17": 34887.0, + "18": 35219.0, + "19": 35710.0, + "20": 32682.0, + "21": 33456.0, + "22": 30216.0, + "23": 
37780.0, + "24": 32298.0, + "25": 30789.0, + "26": 34549.0, + "27": 35611.0, + "28": 36806.0, + "29": 37955.0, + "30": 32950.0, + "31": 30468.0, + "32": 36291.0, + "33": 37916.0, + "34": 32820.0, + "35": 34371.0, + "36": 34957.0, + "37": 38282.0, + "38": 35878.0, + "39": 38974.0, + "40": 36048.0, + "41": 35988.0, + "42": 37320.0, + "43": 33909.0, + "44": 33889.0, + "45": 35577.0, + "46": 37076.0, + "47": 40966.0, + "48": 35327.0, + "49": 34682.0, + "50": 39871.0, + "51": 36802.0, + "52": 36445.0, + "53": 41968.0, + "54": 40797.0, + "55": 36920.0, + "56": 40345.0, + "57": 36961.0, + "58": 41622.0, + "59": 37988.0, + "60": 40534.0, + "61": 40456.0, + "62": 43543.0, + "63": 37438.0, + "64": 42659.0, + "65": 39924.0, + "66": 44122.0, + "67": 40136.0, + "68": 40005.0, + "69": 41675.0, + "70": 45011.0, + "71": 40746.0, + "72": 41647.0, + "73": 44080.0, + "74": 35412.0, + "75": 39478.0, + "76": 46254.0, + "77": 44764.0, + "78": 47985.0, + "79": 48646.0, + "80": 46686.0, + "81": 50102.0, + "82": 50188.0, + "83": 44717.0, + "84": 46114.0, + "85": 49347.0, + "86": 45770.0, + "87": 49671.0, + "88": 46449.0, + "89": 49666.0, + "90": 51087.0, + "91": 45827.0, + "92": 48163.0, + "93": 46547.0, + "94": 47562.0, + "95": 48540.0, + "96": 50182.0, + "97": 46055.0, + "98": 50271.0, + "99": 48494.0, + "100": 45373.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 892874752.0, + "2": 892866560.0, + "3": 892869120.0, + "4": 892876800.0, + "5": 892869120.0, + "6": 892870656.0, + "7": 892874240.0, + "8": 892868608.0, + "9": 892869632.0, + "10": 892868608.0, + "11": 892869632.0, + "12": 892867072.0, + "13": 892872192.0, + "14": 892873216.0, + "15": 892870656.0, + "16": 892868608.0, + "17": 892879360.0, + "18": 892867072.0, + "19": 892870656.0, + "20": 892867072.0, + "21": 892871168.0, + "22": 892874752.0, + "23": 892877824.0, + "24": 892869120.0, + "25": 892877312.0, + "26": 892873216.0, + "27": 892865024.0, + "28": 
892870144.0, + "29": 892869632.0, + "30": 892871680.0, + "31": 892881920.0, + "32": 892874752.0, + "33": 892870144.0, + "34": 892872192.0, + "35": 892874240.0, + "36": 892869632.0, + "37": 892868096.0, + "38": 892867072.0, + "39": 892871168.0, + "40": 892869120.0, + "41": 892873728.0, + "42": 892868608.0, + "43": 892871168.0, + "44": 892871680.0, + "45": 892869632.0, + "46": 892876800.0, + "47": 892869632.0, + "48": 892875264.0, + "49": 892872704.0, + "50": 892869120.0, + "51": 892872192.0, + "52": 892875776.0, + "53": 892868096.0, + "54": 892872192.0, + "55": 892867072.0, + "56": 892865024.0, + "57": 892876288.0, + "58": 892869120.0, + "59": 892871680.0, + "60": 892869120.0, + "61": 892869120.0, + "62": 892869632.0, + "63": 892870656.0, + "64": 892865536.0, + "65": 892872192.0, + "66": 892864512.0, + "67": 892862464.0, + "68": 892867584.0, + "69": 892861952.0, + "70": 892867072.0, + "71": 892870656.0, + "72": 892862464.0, + "73": 892861440.0, + "74": 892849664.0, + "75": 892868096.0, + "76": 892869632.0, + "77": 892868096.0, + "78": 892859392.0, + "79": 892865024.0, + "80": 892855296.0, + "81": 892856320.0, + "82": 892860416.0, + "83": 892869632.0, + "84": 892852736.0, + "85": 892871680.0, + "86": 892861952.0, + "87": 892869120.0, + "88": 892869632.0, + "89": 892859392.0, + "90": 892867072.0, + "91": 892865536.0, + "92": 892865536.0, + "93": 892861440.0, + "94": 892860928.0, + "95": 892869120.0, + "96": 892866560.0, + "97": 892856320.0, + "98": 892869120.0, + "99": 892864512.0, + "100": 892864000.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1948027904.0, + "2": 2183897088.0, + "3": 2184431104.0, + "4": 2190589952.0, + "5": 2190589952.0, + "6": 2190589952.0, + "7": 2190589952.0, + "8": 2190589952.0, + "9": 2190589952.0, + "10": 2190589952.0, + "11": 2190589952.0, + "12": 2190589952.0, + "13": 2190589952.0, + "14": 2190589952.0, + "15": 2190589952.0, + "16": 2190589952.0, + "17": 
2194543104.0, + "18": 2194543104.0, + "19": 2194543104.0, + "20": 2194543104.0, + "21": 2194543104.0, + "22": 2194543104.0, + "23": 2194543104.0, + "24": 2194543104.0, + "25": 2194543104.0, + "26": 2194543104.0, + "27": 2194543104.0, + "28": 2194543104.0, + "29": 2194543104.0, + "30": 2194543104.0, + "31": 2195852288.0, + "32": 2195852288.0, + "33": 2195852288.0, + "34": 2195852288.0, + "35": 2195852288.0, + "36": 2195852288.0, + "37": 2195852288.0, + "38": 2195852288.0, + "39": 2195852288.0, + "40": 2195852288.0, + "41": 2195852288.0, + "42": 2195852288.0, + "43": 2195852288.0, + "44": 2195852288.0, + "45": 2195852288.0, + "46": 2195852288.0, + "47": 2195852288.0, + "48": 2195852288.0, + "49": 2195852288.0, + "50": 2195852288.0, + "51": 2195852288.0, + "52": 2195852288.0, + "53": 2195852288.0, + "54": 2195852288.0, + "55": 2195852288.0, + "56": 2195852288.0, + "57": 2195852288.0, + "58": 2195852288.0, + "59": 2195852288.0, + "60": 2195852288.0, + "61": 2195852288.0, + "62": 2195852288.0, + "63": 2195852288.0, + "64": 2195852288.0, + "65": 2195852288.0, + "66": 2195852288.0, + "67": 2195852288.0, + "68": 2195852288.0, + "69": 2195852288.0, + "70": 2195852288.0, + "71": 2195852288.0, + "72": 2195852288.0, + "73": 2195852288.0, + "74": 2195852288.0, + "75": 2195852288.0, + "76": 2195852288.0, + "77": 2195852288.0, + "78": 2195852288.0, + "79": 2195852288.0, + "80": 2195852288.0, + "81": 2195852288.0, + "82": 2195852288.0, + "83": 2195852288.0, + "84": 2195852288.0, + "85": 2195852288.0, + "86": 2195852288.0, + "87": 2195852288.0, + "88": 2195852288.0, + "89": 2195852288.0, + "90": 2195852288.0, + "91": 2195852288.0, + "92": 2195852288.0, + "93": 2195852288.0, + "94": 2195852288.0, + "95": 2195852288.0, + "96": 2195852288.0, + "97": 2195852288.0, + "98": 2195852288.0, + "99": 2195852288.0, + "100": 2195852288.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": 3.86481, + "3": 0.3588, + "4": 
0.35276, + "5": 0.33575, + "6": 0.3344, + "7": 0.3406, + "8": 0.33551, + "9": 0.33157, + "10": 0.32814, + "11": 0.32882, + "12": 0.3298, + "13": 0.32887, + "14": 0.32898, + "15": 0.33409, + "16": 0.32679, + "17": 0.34317, + "18": 0.33153, + "19": 0.32828, + "20": 0.33077, + "21": 0.32713, + "22": 0.32603, + "23": 0.32819, + "24": 0.33158, + "25": 0.32832, + "26": 0.32593, + "27": 0.33086, + "28": 0.32481, + "29": 0.32607, + "30": 0.33032, + "31": 0.33561, + "32": 0.33149, + "33": 0.32643, + "34": 0.34262, + "35": 0.32889, + "36": 0.32749, + "37": 0.32097, + "38": 0.33036, + "39": 0.69454, + "40": 0.33723, + "41": 0.3284, + "42": 0.32735, + "43": 0.33334, + "44": 0.3333, + "45": 0.33315, + "46": 0.33505, + "47": 0.32976, + "48": 0.32918, + "49": 0.34661, + "50": 0.32681, + "51": 0.3427, + "52": 0.3299, + "53": 0.32454, + "54": 0.3251, + "55": 0.32968, + "56": 0.34696, + "57": 0.33819, + "58": 0.32649, + "59": 0.3341, + "60": 0.33324, + "61": 0.33925, + "62": 0.33532, + "63": 0.34334, + "64": 0.34963, + "65": 0.38392, + "66": 0.33805, + "67": 0.3728, + "68": 0.33745, + "69": 0.33504, + "70": 0.33581, + "71": 0.35385, + "72": 0.34934, + "73": 0.34952, + "74": 0.35756, + "75": 0.35105, + "76": 0.34933, + "77": 0.33518, + "78": 0.34556, + "79": 0.34603, + "80": 0.36355, + "81": 0.34186, + "82": 0.34271, + "83": 0.39765, + "84": 0.36927, + "85": 0.33938, + "86": 0.35142, + "87": 0.34329, + "88": 0.33135, + "89": 0.34535, + "90": 0.33856, + "91": 0.3522, + "92": 0.33934, + "93": 0.38169, + "94": 0.36358, + "95": 0.33846, + "96": 0.33554, + "97": 0.34438, + "98": 0.32586, + "99": 0.43185, + "100": 0.33974 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances/golden_values_dev_dgx_gb200.json 
b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances/golden_values_dev_dgx_gb200.json new file mode 100644 index 00000000000..31167be6de5 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances/golden_values_dev_dgx_gb200.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.7915, + "2": 10.8072, + "3": 10.79214, + "4": 10.77571, + "5": 10.82495, + "6": 10.83193, + "7": 10.82077, + "8": 10.81496, + "9": 10.81973, + "10": 10.7692, + "11": 10.84519, + "12": 10.83101, + "13": 10.83652, + "14": 10.85771, + "15": 10.80581, + "16": 10.78733, + "17": 10.75844, + "18": 10.79297, + "19": 10.78295, + "20": 10.73199, + "21": 10.70953, + "22": 10.57675, + "23": 10.71651, + "24": 10.61983, + "25": 10.58207, + "26": 10.61694, + "27": 10.6509, + "28": 10.63261, + "29": 10.63024, + "30": 10.4432, + "31": 10.19983, + "32": 10.52048, + "33": 10.5079, + "34": 10.29565, + "35": 10.33536, + "36": 10.30278, + "37": 10.41788, + "38": 10.28121, + "39": 10.46185, + "40": 10.18169, + "41": 10.21391, + "42": 10.28457, + "43": 9.95538, + "44": 10.05751, + "45": 9.95713, + "46": 9.93528, + "47": 10.22675, + "48": 9.96521, + "49": 9.66603, + "50": 10.009, + "51": 9.94789, + "52": 9.84665, + "53": 10.14887, + "54": 10.03772, + "55": 9.97445, + "56": 9.71378, + "57": 9.59509, + "58": 9.92081, + "59": 9.67609, + "60": 9.61253, + "61": 9.79221, + "62": 10.05653, + "63": 9.47849, + "64": 9.84455, + "65": 9.03889, + "66": 9.78399, + "67": 9.43609, + "68": 9.85203, + "69": 9.84438, + "70": 9.7933, + "71": 9.69163, + "72": 9.64909, + "73": 9.55528, + "74": 9.04743, + "75": 9.49185, + "76": 9.178, + "77": 10.11275, + "78": 9.76838, + "79": 9.4398, + "80": 9.45421, + "81": 9.53191, + "82": 9.75402, + "83": 9.38186, + "84": 9.46162, + "85": 9.66959, + "86": 9.1349, 
+ "87": 9.6343, + "88": 9.80083, + "89": 9.66682, + "90": 9.86175, + "91": 9.39987, + "92": 9.41063, + "93": 9.14654, + "94": 8.88648, + "95": 9.56986, + "96": 9.57642, + "97": 9.34305, + "98": 9.72786, + "99": 8.96203, + "100": 9.44942 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 31217.0, + "2": 33106.0, + "3": 33596.0, + "4": 31946.0, + "5": 36783.0, + "6": 37377.0, + "7": 35362.0, + "8": 31711.0, + "9": 34749.0, + "10": 29758.0, + "11": 38348.0, + "12": 35446.0, + "13": 37087.0, + "14": 37869.0, + "15": 35242.0, + "16": 36520.0, + "17": 35190.0, + "18": 35191.0, + "19": 35614.0, + "20": 32571.0, + "21": 33220.0, + "22": 30518.0, + "23": 37619.0, + "24": 32547.0, + "25": 30591.0, + "26": 34546.0, + "27": 35275.0, + "28": 36936.0, + "29": 37531.0, + "30": 33354.0, + "31": 30754.0, + "32": 36331.0, + "33": 38273.0, + "34": 32645.0, + "35": 34237.0, + "36": 35092.0, + "37": 37931.0, + "38": 35480.0, + "39": 39175.0, + "40": 36296.0, + "41": 35902.0, + "42": 37609.0, + "43": 33748.0, + "44": 34027.0, + "45": 35215.0, + "46": 37108.0, + "47": 41056.0, + "48": 35765.0, + "49": 35087.0, + "50": 39734.0, + "51": 36712.0, + "52": 36176.0, + "53": 41774.0, + "54": 40447.0, + "55": 37071.0, + "56": 39975.0, + "57": 36828.0, + "58": 41815.0, + "59": 37962.0, + "60": 40415.0, + "61": 39921.0, + "62": 43840.0, + "63": 37890.0, + "64": 42699.0, + "65": 40347.0, + "66": 44159.0, + "67": 40057.0, + "68": 39563.0, + "69": 42246.0, + "70": 44867.0, + "71": 40910.0, + "72": 40982.0, + "73": 44363.0, + "74": 35672.0, + "75": 39602.0, + "76": 46157.0, + "77": 44919.0, + "78": 48134.0, + "79": 48666.0, + "80": 46770.0, + "81": 50144.0, + "82": 49680.0, + "83": 44991.0, + "84": 45912.0, + "85": 49371.0, + "86": 45600.0, + "87": 49292.0, + "88": 46411.0, + "89": 49710.0, + "90": 51008.0, + "91": 45796.0, + "92": 47991.0, + "93": 46847.0, + "94": 47360.0, + "95": 48680.0, + "96": 50369.0, + "97": 46162.0, + "98": 49921.0, + 
"99": 48235.0, + "100": 45390.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1254511616.0, + "2": 1254503424.0, + "3": 1254505984.0, + "4": 1254513664.0, + "5": 1254505984.0, + "6": 1254507520.0, + "7": 1254511104.0, + "8": 1254505472.0, + "9": 1254505984.0, + "10": 1254506496.0, + "11": 1254507520.0, + "12": 1254503936.0, + "13": 1254509568.0, + "14": 1254510080.0, + "15": 1254506496.0, + "16": 1254505984.0, + "17": 1254516224.0, + "18": 1254503424.0, + "19": 1254506496.0, + "20": 1254504960.0, + "21": 1254508032.0, + "22": 1254510592.0, + "23": 1254512640.0, + "24": 1254505472.0, + "25": 1254513664.0, + "26": 1254512128.0, + "27": 1254501888.0, + "28": 1254509056.0, + "29": 1254508032.0, + "30": 1254509056.0, + "31": 1254519296.0, + "32": 1254512128.0, + "33": 1254507008.0, + "34": 1254509056.0, + "35": 1254510080.0, + "36": 1254507008.0, + "37": 1254504448.0, + "38": 1254505472.0, + "39": 1254508032.0, + "40": 1254505984.0, + "41": 1254512128.0, + "42": 1254504960.0, + "43": 1254507008.0, + "44": 1254508032.0, + "45": 1254506496.0, + "46": 1254513664.0, + "47": 1254507008.0, + "48": 1254511616.0, + "49": 1254508032.0, + "50": 1254506496.0, + "51": 1254508032.0, + "52": 1254513152.0, + "53": 1254505984.0, + "54": 1254508544.0, + "55": 1254503936.0, + "56": 1254502912.0, + "57": 1254515200.0, + "58": 1254503936.0, + "59": 1254508544.0, + "60": 1254503936.0, + "61": 1254507008.0, + "62": 1254508032.0, + "63": 1254507520.0, + "64": 1254502400.0, + "65": 1254509568.0, + "66": 1254501376.0, + "67": 1254499328.0, + "68": 1254503936.0, + "69": 1254499328.0, + "70": 1254502912.0, + "71": 1254507520.0, + "72": 1254499328.0, + "73": 1254497280.0, + "74": 1254486016.0, + "75": 1254504960.0, + "76": 1254507008.0, + "77": 1254504448.0, + "78": 1254496256.0, + "79": 1254500864.0, + "80": 1254491648.0, + "81": 1254493696.0, + "82": 1254497280.0, + "83": 1254505984.0, + "84": 1254489600.0, + "85": 
1254505984.0, + "86": 1254500352.0, + "87": 1254505472.0, + "88": 1254506496.0, + "89": 1254498304.0, + "90": 1254504448.0, + "91": 1254501888.0, + "92": 1254501888.0, + "93": 1254499328.0, + "94": 1254494720.0, + "95": 1254504960.0, + "96": 1254503424.0, + "97": 1254492672.0, + "98": 1254505984.0, + "99": 1254499328.0, + "100": 1254501888.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2066381824.0, + "2": 2543623168.0, + "3": 2544637440.0, + "4": 2550310912.0, + "5": 2550310912.0, + "6": 2550310912.0, + "7": 2550310912.0, + "8": 2550310912.0, + "9": 2550310912.0, + "10": 2550310912.0, + "11": 2550310912.0, + "12": 2550310912.0, + "13": 2550310912.0, + "14": 2550310912.0, + "15": 2550310912.0, + "16": 2550310912.0, + "17": 2554848768.0, + "18": 2554848768.0, + "19": 2554848768.0, + "20": 2554848768.0, + "21": 2554848768.0, + "22": 2554848768.0, + "23": 2554848768.0, + "24": 2554848768.0, + "25": 2554848768.0, + "26": 2554848768.0, + "27": 2554848768.0, + "28": 2554848768.0, + "29": 2554848768.0, + "30": 2554848768.0, + "31": 2556047872.0, + "32": 2556047872.0, + "33": 2556047872.0, + "34": 2556047872.0, + "35": 2556047872.0, + "36": 2556047872.0, + "37": 2556047872.0, + "38": 2556047872.0, + "39": 2556047872.0, + "40": 2556047872.0, + "41": 2556047872.0, + "42": 2556047872.0, + "43": 2556047872.0, + "44": 2556047872.0, + "45": 2556047872.0, + "46": 2556047872.0, + "47": 2556047872.0, + "48": 2556047872.0, + "49": 2556047872.0, + "50": 2556047872.0, + "51": 2556047872.0, + "52": 2556047872.0, + "53": 2556047872.0, + "54": 2556047872.0, + "55": 2556047872.0, + "56": 2556047872.0, + "57": 2556047872.0, + "58": 2556047872.0, + "59": 2556047872.0, + "60": 2556047872.0, + "61": 2556047872.0, + "62": 2556047872.0, + "63": 2556047872.0, + "64": 2556047872.0, + "65": 2556047872.0, + "66": 2556047872.0, + "67": 2556047872.0, + "68": 2556047872.0, + "69": 2556047872.0, + "70": 2556047872.0, + "71": 
2556047872.0, + "72": 2556047872.0, + "73": 2556047872.0, + "74": 2556047872.0, + "75": 2556047872.0, + "76": 2556047872.0, + "77": 2556047872.0, + "78": 2556047872.0, + "79": 2556047872.0, + "80": 2556047872.0, + "81": 2556047872.0, + "82": 2556047872.0, + "83": 2556047872.0, + "84": 2556047872.0, + "85": 2556047872.0, + "86": 2556047872.0, + "87": 2556047872.0, + "88": 2556047872.0, + "89": 2556047872.0, + "90": 2556047872.0, + "91": 2556047872.0, + "92": 2556047872.0, + "93": 2556047872.0, + "94": 2556047872.0, + "95": 2556047872.0, + "96": 2556047872.0, + "97": 2556047872.0, + "98": 2556047872.0, + "99": 2556047872.0, + "100": 2556047872.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": 4.80614, + "3": 0.71249, + "4": 0.40839, + "5": 0.39358, + "6": 0.3911, + "7": 0.39032, + "8": 0.38318, + "9": 0.70382, + "10": 0.34707, + "11": 0.34403, + "12": 0.34043, + "13": 0.33959, + "14": 0.33461, + "15": 0.34767, + "16": 0.33495, + "17": 0.34839, + "18": 0.33673, + "19": 0.33335, + "20": 0.33161, + "21": 0.32643, + "22": 0.33565, + "23": 0.33625, + "24": 0.33009, + "25": 0.33065, + "26": 0.33344, + "27": 0.33552, + "28": 0.33047, + "29": 0.33011, + "30": 0.33358, + "31": 0.34631, + "32": 0.33536, + "33": 0.33271, + "34": 0.33949, + "35": 0.33073, + "36": 0.32877, + "37": 0.32806, + "38": 0.33111, + "39": 0.33408, + "40": 0.33428, + "41": 0.34927, + "42": 1.47745, + "43": 0.48012, + "44": 0.33077, + "45": 0.33262, + "46": 0.34066, + "47": 0.33152, + "48": 0.33512, + "49": 0.34429, + "50": 0.33697, + "51": 0.34656, + "52": 0.337, + "53": 0.33133, + "54": 0.33172, + "55": 0.33188, + "56": 0.35163, + "57": 0.34162, + "58": 0.33258, + "59": 0.7122, + "60": 0.33979, + "61": 0.33569, + "62": 0.33523, + "63": 0.33864, + "64": 0.34776, + "65": 0.37658, + "66": 0.3377, + "67": 0.36916, + "68": 0.3452, + "69": 0.33854, + "70": 0.34023, + "71": 0.3544, + "72": 0.34395, + "73": 0.3567, + "74": 0.35025, + 
"75": 0.35164, + "76": 0.35012, + "77": 0.3364, + "78": 0.34491, + "79": 0.34789, + "80": 0.35388, + "81": 0.34075, + "82": 0.34743, + "83": 0.34211, + "84": 0.34722, + "85": 0.33956, + "86": 0.35402, + "87": 0.34301, + "88": 0.34056, + "89": 0.35764, + "90": 0.33476, + "91": 0.3539, + "92": 0.34448, + "93": 0.34895, + "94": 0.3624, + "95": 0.34001, + "96": 0.3382, + "97": 0.35217, + "98": 0.33252, + "99": 0.34909, + "100": 0.34966 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/golden_values_dev_dgx_gb200.json new file mode 100644 index 00000000000..ce3d79128b1 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/golden_values_dev_dgx_gb200.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.80509, + "2": 10.82386, + "3": 10.80196, + "4": 10.79424, + "5": 10.8277, + "6": 10.84005, + "7": 10.8365, + "8": 10.82828, + "9": 10.83477, + "10": 10.77496, + "11": 10.85204, + "12": 10.83903, + "13": 10.85207, + "14": 10.85914, + "15": 10.81681, + "16": 10.79456, + "17": 10.77491, + "18": 10.80399, + "19": 10.79956, + "20": 10.73801, + "21": 10.72487, + "22": 10.59177, + "23": 10.73098, + "24": 10.6406, + "25": 10.59018, + "26": 10.63555, + "27": 10.66245, + "28": 10.6472, + "29": 10.64163, + "30": 10.4518, + "31": 10.22249, + "32": 10.52995, + "33": 10.51998, + "34": 10.31247, + "35": 10.34796, + "36": 10.31677, + "37": 10.42804, + "38": 10.29194, + "39": 10.46881, + "40": 10.19257, + "41": 10.23159, + "42": 10.29766, + "43": 9.97363, + "44": 10.07169, + "45": 9.97015, + "46": 9.94713, + "47": 10.23179, + "48": 9.97593, + "49": 9.67748, + "50": 10.0144 + } + }, + "num-zeros": { + 
"start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 31182.0, + "2": 33013.0, + "3": 33646.0, + "4": 32202.0, + "5": 36913.0, + "6": 37554.0, + "7": 35184.0, + "8": 32207.0, + "9": 34523.0, + "10": 29945.0, + "11": 38237.0, + "12": 35346.0, + "13": 37426.0, + "14": 38358.0, + "15": 35140.0, + "16": 36293.0, + "17": 35645.0, + "18": 35117.0, + "19": 35648.0, + "20": 32896.0, + "21": 33511.0, + "22": 30704.0, + "23": 38149.0, + "24": 32677.0, + "25": 31055.0, + "26": 34700.0, + "27": 35410.0, + "28": 37268.0, + "29": 37953.0, + "30": 33210.0, + "31": 30482.0, + "32": 36908.0, + "33": 38308.0, + "34": 33125.0, + "35": 34341.0, + "36": 34925.0, + "37": 38767.0, + "38": 35780.0, + "39": 38955.0, + "40": 36485.0, + "41": 36015.0, + "42": 37638.0, + "43": 33689.0, + "44": 33688.0, + "45": 35448.0, + "46": 36810.0, + "47": 40858.0, + "48": 35696.0, + "49": 34729.0, + "50": 39077.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1027095040.0, + "2": 1027093504.0, + "3": 1027094528.0, + "4": 1027095040.0, + "5": 1027091968.0, + "6": 1027091968.0, + "7": 1027098112.0, + "8": 1027097600.0, + "9": 1027094528.0, + "10": 1027094016.0, + "11": 1027098624.0, + "12": 1027094528.0, + "13": 1027092480.0, + "14": 1027095040.0, + "15": 1027095040.0, + "16": 1027091456.0, + "17": 1027101184.0, + "18": 1027096064.0, + "19": 1027093504.0, + "20": 1027093504.0, + "21": 1027097088.0, + "22": 1027100160.0, + "23": 1027100160.0, + "24": 1027095552.0, + "25": 1027097088.0, + "26": 1027098112.0, + "27": 1027091456.0, + "28": 1027090944.0, + "29": 1027091968.0, + "30": 1027099648.0, + "31": 1027109888.0, + "32": 1027095552.0, + "33": 1027090944.0, + "34": 1027098112.0, + "35": 1027103744.0, + "36": 1027098112.0, + "37": 1027092480.0, + "38": 1027091456.0, + "39": 1027095040.0, + "40": 1027095040.0, + "41": 1027100160.0, + "42": 1027091968.0, + "43": 1027098624.0, + "44": 1027098624.0, + "45": 
1027096064.0, + "46": 1027104256.0, + "47": 1027093504.0, + "48": 1027101184.0, + "49": 1027096064.0, + "50": 1027095552.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3059586560.0, + "2": 3299159040.0, + "3": 3299482112.0, + "4": 3302137344.0, + "5": 3302137344.0, + "6": 3302137344.0, + "7": 3303535104.0, + "8": 3303535104.0, + "9": 3303535104.0, + "10": 3303535104.0, + "11": 3303535104.0, + "12": 3303535104.0, + "13": 3303535104.0, + "14": 3303535104.0, + "15": 3303535104.0, + "16": 3303535104.0, + "17": 3306910208.0, + "18": 3306910208.0, + "19": 3306910208.0, + "20": 3306910208.0, + "21": 3306910208.0, + "22": 3306910208.0, + "23": 3306910208.0, + "24": 3306910208.0, + "25": 3306910208.0, + "26": 3306910208.0, + "27": 3306910208.0, + "28": 3306910208.0, + "29": 3306910208.0, + "30": 3306910208.0, + "31": 3312495616.0, + "32": 3312495616.0, + "33": 3312495616.0, + "34": 3312495616.0, + "35": 3312495616.0, + "36": 3312495616.0, + "37": 3312495616.0, + "38": 3312495616.0, + "39": 3312495616.0, + "40": 3312495616.0, + "41": 3312495616.0, + "42": 3312495616.0, + "43": 3312495616.0, + "44": 3312495616.0, + "45": 3312495616.0, + "46": 3312495616.0, + "47": 3312495616.0, + "48": 3312495616.0, + "49": 3312495616.0, + "50": 3312495616.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": "nan", + "2": 3.82645, + "3": 0.34371, + "4": 0.32704, + "5": 0.31536, + "6": 0.32001, + "7": 0.31919, + "8": 0.31719, + "9": 0.31876, + "10": 0.31015, + "11": 0.31546, + "12": 0.31198, + "13": 0.31518, + "14": 0.40567, + "15": 0.31856, + "16": 0.30868, + "17": 0.31352, + "18": 0.31536, + "19": 0.31164, + "20": 0.31286, + "21": 0.35519, + "22": 0.30985, + "23": 0.31256, + "24": 0.31727, + "25": 0.36651, + "26": 0.47287, + "27": 0.57438, + "28": 0.3575, + "29": 0.71431, + "30": 0.31163, + "31": 0.31877, + "32": 0.34436, + "33": 0.51773, + "34": 0.32292, + 
"35": 0.31651, + "36": 0.34162, + "37": 0.31339, + "38": 0.30524, + "39": 0.63856, + "40": 0.31883, + "41": 0.31475, + "42": 0.67365, + "43": 0.33393, + "44": 0.31389, + "45": 0.65089, + "46": 0.6524, + "47": 0.3061, + "48": 0.30487, + "49": 0.3295, + "50": 0.30784 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/golden_values_dev_dgx_gb200.json new file mode 100644 index 00000000000..f62929eef31 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/golden_values_dev_dgx_gb200.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.80509, + "2": 10.82386, + "3": 10.80196, + "4": 10.79424, + "5": 10.8277, + "6": 10.84005, + "7": 10.8365, + "8": 10.82828, + "9": 10.83477, + "10": 10.77496, + "11": 10.85204, + "12": 10.83903, + "13": 10.85207, + "14": 10.85914, + "15": 10.81681, + "16": 10.79456, + "17": 10.77491, + "18": 10.80399, + "19": 10.79956, + "20": 10.73801, + "21": 10.72487, + "22": 10.59177, + "23": 10.73098, + "24": 10.6406, + "25": 10.59018, + "26": 10.63555, + "27": 10.66245, + "28": 10.6472, + "29": 10.64163, + "30": 10.4518, + "31": 10.22249, + "32": 10.52995, + "33": 10.51998, + "34": 10.31247, + "35": 10.34796, + "36": 10.31677, + "37": 10.42804, + "38": 10.29194, + "39": 10.46881, + "40": 10.19257, + "41": 10.23159, + "42": 10.29766, + "43": 9.97363, + "44": 10.07169, + "45": 9.97015, + "46": 9.94713, + "47": 10.23179, + "48": 9.97593, + "49": 9.67748, + "50": 10.0144 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 31182.0, + "2": 33013.0, + "3": 
33646.0, + "4": 32202.0, + "5": 36913.0, + "6": 37554.0, + "7": 35184.0, + "8": 32207.0, + "9": 34523.0, + "10": 29945.0, + "11": 38237.0, + "12": 35346.0, + "13": 37426.0, + "14": 38358.0, + "15": 35140.0, + "16": 36293.0, + "17": 35645.0, + "18": 35117.0, + "19": 35648.0, + "20": 32896.0, + "21": 33511.0, + "22": 30704.0, + "23": 38149.0, + "24": 32677.0, + "25": 31055.0, + "26": 34700.0, + "27": 35410.0, + "28": 37268.0, + "29": 37953.0, + "30": 33210.0, + "31": 30482.0, + "32": 36908.0, + "33": 38308.0, + "34": 33125.0, + "35": 34341.0, + "36": 34925.0, + "37": 38767.0, + "38": 35780.0, + "39": 38955.0, + "40": 36485.0, + "41": 36015.0, + "42": 37638.0, + "43": 33689.0, + "44": 33688.0, + "45": 35448.0, + "46": 36810.0, + "47": 40858.0, + "48": 35696.0, + "49": 34729.0, + "50": 39077.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1027095040.0, + "2": 1027093504.0, + "3": 1027094528.0, + "4": 1027095040.0, + "5": 1027091968.0, + "6": 1027091968.0, + "7": 1027098112.0, + "8": 1027097600.0, + "9": 1027094528.0, + "10": 1027094016.0, + "11": 1027098624.0, + "12": 1027094528.0, + "13": 1027092480.0, + "14": 1027095040.0, + "15": 1027095040.0, + "16": 1027091456.0, + "17": 1027101184.0, + "18": 1027096064.0, + "19": 1027093504.0, + "20": 1027093504.0, + "21": 1027097088.0, + "22": 1027100160.0, + "23": 1027100160.0, + "24": 1027095552.0, + "25": 1027097088.0, + "26": 1027098112.0, + "27": 1027091456.0, + "28": 1027090944.0, + "29": 1027091968.0, + "30": 1027099648.0, + "31": 1027109888.0, + "32": 1027095552.0, + "33": 1027090944.0, + "34": 1027098112.0, + "35": 1027103744.0, + "36": 1027098112.0, + "37": 1027092480.0, + "38": 1027091456.0, + "39": 1027095040.0, + "40": 1027095040.0, + "41": 1027100160.0, + "42": 1027091968.0, + "43": 1027098624.0, + "44": 1027098624.0, + "45": 1027096064.0, + "46": 1027104256.0, + "47": 1027093504.0, + "48": 1027101184.0, + "49": 1027096064.0, + "50": 1027095552.0 
+ } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3059586560.0, + "2": 3299159040.0, + "3": 3299482112.0, + "4": 3302137344.0, + "5": 3302137344.0, + "6": 3302137344.0, + "7": 3303535104.0, + "8": 3303535104.0, + "9": 3303535104.0, + "10": 3303535104.0, + "11": 3303535104.0, + "12": 3303535104.0, + "13": 3303535104.0, + "14": 3303535104.0, + "15": 3303535104.0, + "16": 3303535104.0, + "17": 3306910208.0, + "18": 3306910208.0, + "19": 3306910208.0, + "20": 3306910208.0, + "21": 3306910208.0, + "22": 3306910208.0, + "23": 3306910208.0, + "24": 3306910208.0, + "25": 3306910208.0, + "26": 3306910208.0, + "27": 3306910208.0, + "28": 3306910208.0, + "29": 3306910208.0, + "30": 3306910208.0, + "31": 3312495616.0, + "32": 3312495616.0, + "33": 3312495616.0, + "34": 3312495616.0, + "35": 3312495616.0, + "36": 3312495616.0, + "37": 3312495616.0, + "38": 3312495616.0, + "39": 3312495616.0, + "40": 3312495616.0, + "41": 3312495616.0, + "42": 3312495616.0, + "43": 3312495616.0, + "44": 3312495616.0, + "45": 3312495616.0, + "46": 3312495616.0, + "47": 3312495616.0, + "48": 3312495616.0, + "49": 3312495616.0, + "50": 3312495616.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": "nan", + "2": 3.6307, + "3": 0.3854, + "4": 0.38116, + "5": 0.36866, + "6": 0.36756, + "7": 0.37196, + "8": 0.37096, + "9": 0.36719, + "10": 0.36516, + "11": 0.36882, + "12": 0.37126, + "13": 0.36294, + "14": 0.36799, + "15": 0.3669, + "16": 0.36835, + "17": 0.37548, + "18": 0.37236, + "19": 0.36274, + "20": 0.36388, + "21": 0.36581, + "22": 0.3703, + "23": 0.36921, + "24": 0.35712, + "25": 0.36049, + "26": 0.36512, + "27": 0.36657, + "28": 0.36074, + "29": 0.41887, + "30": 0.45698, + "31": 0.54747, + "32": 0.4695, + "33": 0.67157, + "34": 0.4186, + "35": 0.39703, + "36": 0.40139, + "37": 0.39345, + "38": 0.38789, + "39": 1.0807, + "40": 0.42023, + "41": 0.3945, + "42": 
0.39312, + "43": 0.41319, + "44": 0.40657, + "45": 0.4003, + "46": 0.3986, + "47": 0.38501, + "48": 0.38618, + "49": 0.38586, + "50": 0.38297 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4/golden_values_dev_dgx_gb200.json new file mode 100644 index 00000000000..682fa44a64d --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4/golden_values_dev_dgx_gb200.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.82137, + "2": 10.8271, + "3": 10.81279, + "4": 10.80424, + "5": 10.84481, + "6": 10.85159, + "7": 10.82705, + "8": 10.83127, + "9": 10.8396, + "10": 10.79638, + "11": 10.85834, + "12": 10.8443, + "13": 10.8625, + "14": 10.86559, + "15": 10.8001, + "16": 10.78718, + "17": 10.7639, + "18": 10.78578, + "19": 10.78836, + "20": 10.71249, + "21": 10.68241, + "22": 10.54353, + "23": 10.69825, + "24": 10.58633, + "25": 10.52721, + "26": 10.58871, + "27": 10.60408, + "28": 10.57696, + "29": 10.57897, + "30": 10.36401, + "31": 10.10796, + "32": 10.44854, + "33": 10.4401, + "34": 10.20252, + "35": 10.25069, + "36": 10.21055, + "37": 10.32849, + "38": 10.17511, + "39": 10.38336, + "40": 10.05674, + "41": 10.10841, + "42": 10.18865, + "43": 9.80582, + "44": 9.91887, + "45": 9.79924, + "46": 9.78948, + "47": 10.11342, + "48": 9.82499, + "49": 9.49844, + "50": 9.87311 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 27059.0, + "2": 29311.0, + "3": 28886.0, + "4": 27768.0, + "5": 32694.0, + "6": 33260.0, + "7": 31409.0, + "8": 27342.0, + "9": 30401.0, + "10": 25524.0, + "11": 33805.0, + "12": 31146.0, + "13": 33161.0, + "14": 33991.0, + "15": 31160.0, + "16": 32445.0, + "17": 
30974.0, + "18": 31151.0, + "19": 31742.0, + "20": 28624.0, + "21": 29115.0, + "22": 26827.0, + "23": 34472.0, + "24": 29096.0, + "25": 27239.0, + "26": 30910.0, + "27": 31915.0, + "28": 33968.0, + "29": 36017.0, + "30": 30702.0, + "31": 27384.0, + "32": 33681.0, + "33": 35476.0, + "34": 30160.0, + "35": 31419.0, + "36": 32568.0, + "37": 36189.0, + "38": 33607.0, + "39": 37731.0, + "40": 34463.0, + "41": 33229.0, + "42": 35616.0, + "43": 32361.0, + "44": 31908.0, + "45": 33571.0, + "46": 33618.0, + "47": 38873.0, + "48": 35034.0, + "49": 34407.0, + "50": 37669.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1559412224.0, + "2": 1558616064.0, + "3": 1558495744.0, + "4": 1559267328.0, + "5": 1558842880.0, + "6": 1559098368.0, + "7": 1558495744.0, + "8": 1558546432.0, + "9": 1558495744.0, + "10": 1558546432.0, + "11": 1558597120.0, + "12": 1558546432.0, + "13": 1558597120.0, + "14": 1558546432.0, + "15": 1558904320.0, + "16": 1558647808.0, + "17": 1558597120.0, + "18": 1558889472.0, + "19": 1558597120.0, + "20": 1559229440.0, + "21": 1558597120.0, + "22": 1558758400.0, + "23": 1559698944.0, + "24": 1559078912.0, + "25": 1559052800.0, + "26": 1558647808.0, + "27": 1559382528.0, + "28": 1558749184.0, + "29": 1558830592.0, + "30": 1558749184.0, + "31": 1558915584.0, + "32": 1559541760.0, + "33": 1558698496.0, + "34": 1558749184.0, + "35": 1559422464.0, + "36": 1558863872.0, + "37": 1558799872.0, + "38": 1558749184.0, + "39": 1559397888.0, + "40": 1559002112.0, + "41": 1558799872.0, + "42": 1558850560.0, + "43": 1559724544.0, + "44": 1558850560.0, + "45": 1558901248.0, + "46": 1559175168.0, + "47": 1558901248.0, + "48": 1558850560.0, + "49": 1558901248.0, + "50": 1559632896.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3495116800.0, + "2": 4054579712.0, + "3": 4062724096.0, + "4": 4062724096.0, + "5": 4062724096.0, + "6": 
4062724096.0, + "7": 4070930432.0, + "8": 4070930432.0, + "9": 4073446400.0, + "10": 4073446400.0, + "11": 4073446400.0, + "12": 4073446400.0, + "13": 4073446400.0, + "14": 4075493888.0, + "15": 4075493888.0, + "16": 4075493888.0, + "17": 4075493888.0, + "18": 4075493888.0, + "19": 4075493888.0, + "20": 4075493888.0, + "21": 4075493888.0, + "22": 4079303168.0, + "23": 4096666624.0, + "24": 4096666624.0, + "25": 4096666624.0, + "26": 4096666624.0, + "27": 4096666624.0, + "28": 4096666624.0, + "29": 4096666624.0, + "30": 4096666624.0, + "31": 4105302016.0, + "32": 4105302016.0, + "33": 4105302016.0, + "34": 4105302016.0, + "35": 4105302016.0, + "36": 4105302016.0, + "37": 4105302016.0, + "38": 4105302016.0, + "39": 4105302016.0, + "40": 4105302016.0, + "41": 4105302016.0, + "42": 4105302016.0, + "43": 4105302016.0, + "44": 4105302016.0, + "45": 4105302016.0, + "46": 4105302016.0, + "47": 4105302016.0, + "48": 4105302016.0, + "49": 4105302016.0, + "50": 4105302016.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": "nan", + "2": 6.04776, + "3": 0.43191, + "4": 0.39355, + "5": 0.39556, + "6": 0.39818, + "7": 0.39915, + "8": 0.39139, + "9": 0.41074, + "10": 0.45245, + "11": 0.45849, + "12": 0.46806, + "13": 0.46943, + "14": 0.47411, + "15": 0.48525, + "16": 0.47939, + "17": 0.47872, + "18": 0.4715, + "19": 0.4792, + "20": 0.46531, + "21": 0.46809, + "22": 0.46348, + "23": 0.47875, + "24": 0.83175, + "25": 0.50009, + "26": 0.4884, + "27": 0.82926, + "28": 0.50184, + "29": 0.50509, + "30": 0.49725, + "31": 0.50602, + "32": 0.84607, + "33": 0.50581, + "34": 0.49849, + "35": 0.50057, + "36": 0.5007, + "37": 0.50598, + "38": 0.50147, + "39": 0.51593, + "40": 0.51491, + "41": 0.50337, + "42": 0.48945, + "43": 0.49729, + "44": 0.49341, + "45": 0.4898, + "46": 0.49624, + "47": 0.51146, + "48": 0.49582, + "49": 0.49624, + "50": 0.49469 + } + } +} \ No newline at end of file diff --git 
a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_gb200.json new file mode 100644 index 00000000000..113a491b0ba --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_gb200.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.81199, + "2": 10.82649, + "3": 10.81384, + "4": 10.79509, + "5": 10.83534, + "6": 10.84275, + "7": 10.83571, + "8": 10.83439, + "9": 10.83696, + "10": 10.78957, + "11": 10.85974, + "12": 10.84264, + "13": 10.84986, + "14": 10.86378, + "15": 10.80482, + "16": 10.79204, + "17": 10.7636, + "18": 10.78823, + "19": 10.78841, + "20": 10.70796, + "21": 10.68628, + "22": 10.53299, + "23": 10.691, + "24": 10.58061, + "25": 10.5289, + "26": 10.57723, + "27": 10.58971, + "28": 10.5643, + "29": 10.56693, + "30": 10.35124, + "31": 10.09414, + "32": 10.43287, + "33": 10.43231, + "34": 10.19673, + "35": 10.23457, + "36": 10.19059, + "37": 10.31658, + "38": 10.16469, + "39": 10.37482, + "40": 10.05031, + "41": 10.10005, + "42": 10.1774, + "43": 9.79407, + "44": 9.91934, + "45": 9.7932, + "46": 9.78104, + "47": 10.10607, + "48": 9.8118, + "49": 9.48096, + "50": 9.86752, + "51": 9.8069, + "52": 9.70296, + "53": 10.03508, + "54": 9.92052, + "55": 9.84588, + "56": 9.58072, + "57": 9.43445, + "58": 9.79856, + "59": 9.54419, + "60": 9.45288, + "61": 9.65801, + "62": 9.95366, + "63": 9.34015, + "64": 9.73433, + "65": 8.90213, + "66": 9.6667, + "67": 9.33687, + "68": 9.7563, + "69": 9.77598, + "70": 9.70281, + "71": 9.60206, + "72": 9.543, + "73": 9.4557, + "74": 8.87804, + "75": 9.37677, + "76": 9.03816, + "77": 10.03912, + "78": 9.69714, + "79": 9.35195, + "80": 
9.37278, + "81": 9.45649, + "82": 9.6802, + "83": 9.27723, + "84": 9.39341, + "85": 9.58928, + "86": 9.05151, + "87": 9.57623, + "88": 9.72869, + "89": 9.57637, + "90": 9.80884, + "91": 9.30719, + "92": 9.33823, + "93": 9.05712, + "94": 8.80375, + "95": 9.5091, + "96": 9.50777, + "97": 9.27751, + "98": 9.65271, + "99": 8.87009, + "100": 9.38142 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 26865.0, + "2": 29306.0, + "3": 29361.0, + "4": 28339.0, + "5": 32501.0, + "6": 33051.0, + "7": 31429.0, + "8": 27274.0, + "9": 30849.0, + "10": 25253.0, + "11": 34123.0, + "12": 30710.0, + "13": 33513.0, + "14": 33611.0, + "15": 31132.0, + "16": 32283.0, + "17": 31523.0, + "18": 30937.0, + "19": 31324.0, + "20": 28686.0, + "21": 29644.0, + "22": 27366.0, + "23": 34392.0, + "24": 29052.0, + "25": 27947.0, + "26": 31335.0, + "27": 31669.0, + "28": 33909.0, + "29": 35204.0, + "30": 30468.0, + "31": 27904.0, + "32": 33358.0, + "33": 35896.0, + "34": 30365.0, + "35": 31692.0, + "36": 32966.0, + "37": 35992.0, + "38": 33308.0, + "39": 38061.0, + "40": 34579.0, + "41": 33534.0, + "42": 36447.0, + "43": 32600.0, + "44": 32178.0, + "45": 34034.0, + "46": 34910.0, + "47": 39009.0, + "48": 34943.0, + "49": 34977.0, + "50": 38519.0, + "51": 36877.0, + "52": 36443.0, + "53": 43145.0, + "54": 41676.0, + "55": 38684.0, + "56": 41454.0, + "57": 35771.0, + "58": 41538.0, + "59": 39697.0, + "60": 56137.0, + "61": 59394.0, + "62": 2137056.0, + "63": 36401.0, + "64": 50930.0, + "65": 43788.0, + "66": 2139459.0, + "67": 2137025.0, + "68": 2137005.0, + "69": 2139555.0, + "70": 2140268.0, + "71": 2138613.0, + "72": 2139093.0, + "73": 2141321.0, + "74": 2137048.0, + "75": 2136852.0, + "76": 2140757.0, + "77": 2140654.0, + "78": 2141929.0, + "79": 2142543.0, + "80": 2142157.0, + "81": 2145547.0, + "82": 2144670.0, + "83": 2140858.0, + "84": 2140984.0, + "85": 2145921.0, + "86": 149825.0, + "87": 2144700.0, + "88": 2142479.0, + "89": 2140988.0, 
+ "90": 2144684.0, + "91": 2143848.0, + "92": 2142027.0, + "93": 2139531.0, + "94": 2145775.0, + "95": 2143141.0, + "96": 2146259.0, + "97": 2140268.0, + "98": 2143316.0, + "99": 2144369.0, + "100": 2143057.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 787591680.0, + "2": 787578880.0, + "3": 787593728.0, + "4": 787568128.0, + "5": 787563008.0, + "6": 787585536.0, + "7": 787578368.0, + "8": 787582976.0, + "9": 787581952.0, + "10": 787592192.0, + "11": 787569152.0, + "12": 787570688.0, + "13": 787579392.0, + "14": 787582976.0, + "15": 787565568.0, + "16": 787572224.0, + "17": 787566592.0, + "18": 787547648.0, + "19": 787566592.0, + "20": 787537408.0, + "21": 787540992.0, + "22": 787540480.0, + "23": 787548672.0, + "24": 787542016.0, + "25": 787534336.0, + "26": 787548672.0, + "27": 787509760.0, + "28": 787504640.0, + "29": 787499520.0, + "30": 787494912.0, + "31": 787510784.0, + "32": 787501056.0, + "33": 787482624.0, + "34": 787486208.0, + "35": 787483136.0, + "36": 787482624.0, + "37": 787460608.0, + "38": 787457536.0, + "39": 787461632.0, + "40": 787457536.0, + "41": 787466752.0, + "42": 787432448.0, + "43": 787450368.0, + "44": 787436032.0, + "45": 787411456.0, + "46": 787460608.0, + "47": 787412992.0, + "48": 787440128.0, + "49": 787409920.0, + "50": 787396096.0, + "51": 787388416.0, + "52": 787415040.0, + "53": 787377664.0, + "54": 787403264.0, + "55": 787375104.0, + "56": 787362304.0, + "57": 787405824.0, + "58": 787356160.0, + "59": 787378688.0, + "60": 787380224.0, + "61": 787337216.0, + "62": 787331584.0, + "63": 787368960.0, + "64": 787339264.0, + "65": 787403776.0, + "66": 787330048.0, + "67": 787337728.0, + "68": 787324416.0, + "69": 787335680.0, + "70": 787328512.0, + "71": 787331584.0, + "72": 787341312.0, + "73": 787353088.0, + "74": 787366400.0, + "75": 787342848.0, + "76": 787344384.0, + "77": 787345920.0, + "78": 787371520.0, + "79": 787366400.0, + "80": 787390464.0, + "81": 
787385344.0, + "82": 787395584.0, + "83": 787403776.0, + "84": 787397632.0, + "85": 787398144.0, + "86": 787411968.0, + "87": 787389952.0, + "88": 787387904.0, + "89": 787400704.0, + "90": 787379712.0, + "91": 787401216.0, + "92": 787399168.0, + "93": 787391488.0, + "94": 787392000.0, + "95": 787398656.0, + "96": 787395584.0, + "97": 787403776.0, + "98": 787396608.0, + "99": 787406848.0, + "100": 787410432.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2662647296.0, + "2": 2662647296.0, + "3": 2665052672.0, + "4": 2665052672.0, + "5": 2665052672.0, + "6": 2665052672.0, + "7": 2665052672.0, + "8": 2665052672.0, + "9": 2665052672.0, + "10": 2665052672.0, + "11": 2665052672.0, + "12": 2665052672.0, + "13": 2665052672.0, + "14": 2665052672.0, + "15": 2665052672.0, + "16": 2665052672.0, + "17": 2665052672.0, + "18": 2665052672.0, + "19": 2665052672.0, + "20": 2665052672.0, + "21": 2665052672.0, + "22": 2665052672.0, + "23": 2665052672.0, + "24": 2665052672.0, + "25": 2665052672.0, + "26": 2665052672.0, + "27": 2665052672.0, + "28": 2665052672.0, + "29": 2665052672.0, + "30": 2665052672.0, + "31": 2665052672.0, + "32": 2665052672.0, + "33": 2665052672.0, + "34": 2665052672.0, + "35": 2665052672.0, + "36": 2665052672.0, + "37": 2665052672.0, + "38": 2665052672.0, + "39": 2665052672.0, + "40": 2665052672.0, + "41": 2665052672.0, + "42": 2665052672.0, + "43": 2665052672.0, + "44": 2665052672.0, + "45": 2665052672.0, + "46": 2665052672.0, + "47": 2665052672.0, + "48": 2665052672.0, + "49": 2665052672.0, + "50": 2665052672.0, + "51": 2665052672.0, + "52": 2665052672.0, + "53": 2665052672.0, + "54": 2665052672.0, + "55": 2665052672.0, + "56": 2665052672.0, + "57": 2665052672.0, + "58": 2665052672.0, + "59": 2665052672.0, + "60": 2665052672.0, + "61": 2665052672.0, + "62": 2665052672.0, + "63": 2665052672.0, + "64": 2665052672.0, + "65": 2665052672.0, + "66": 2665052672.0, + "67": 2665052672.0, + "68": 
2665052672.0, + "69": 2665052672.0, + "70": 2665052672.0, + "71": 2665052672.0, + "72": 2665052672.0, + "73": 2665052672.0, + "74": 2665052672.0, + "75": 2665052672.0, + "76": 2665052672.0, + "77": 2665052672.0, + "78": 2665052672.0, + "79": 2665052672.0, + "80": 2665052672.0, + "81": 2665052672.0, + "82": 2665052672.0, + "83": 2665052672.0, + "84": 2665052672.0, + "85": 2665052672.0, + "86": 2665052672.0, + "87": 2665052672.0, + "88": 2665052672.0, + "89": 2665052672.0, + "90": 2665052672.0, + "91": 2665052672.0, + "92": 2665052672.0, + "93": 2665052672.0, + "94": 2665052672.0, + "95": 2665052672.0, + "96": 2665052672.0, + "97": 2665052672.0, + "98": 2665052672.0, + "99": 2665052672.0, + "100": 2665052672.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": 4.78579, + "3": 0.53829, + "4": 0.5501, + "5": 0.52877, + "6": 0.53341, + "7": 0.53101, + "8": 0.52594, + "9": 0.52656, + "10": 0.52721, + "11": 0.51907, + "12": 0.52113, + "13": 0.52417, + "14": 0.52392, + "15": 0.53475, + "16": 0.52116, + "17": 0.52656, + "18": 0.52034, + "19": 0.52016, + "20": 0.52199, + "21": 0.53183, + "22": 0.53661, + "23": 0.54084, + "24": 0.52495, + "25": 0.53128, + "26": 0.52735, + "27": 0.54335, + "28": 0.52654, + "29": 0.53834, + "30": 0.53606, + "31": 0.53938, + "32": 0.53598, + "33": 0.53326, + "34": 0.54444, + "35": 0.53164, + "36": 0.5404, + "37": 0.54568, + "38": 0.54552, + "39": 0.5366, + "40": 0.54027, + "41": 0.53525, + "42": 0.55075, + "43": 0.53886, + "44": 0.53665, + "45": 0.55089, + "46": 0.5331, + "47": 0.54482, + "48": 0.53151, + "49": 0.53493, + "50": 0.53302, + "51": 0.52424, + "52": 0.52434, + "53": 0.51687, + "54": 0.52816, + "55": 0.53022, + "56": 0.53577, + "57": 0.53245, + "58": 0.53568, + "59": 0.54753, + "60": 0.53813, + "61": 0.53815, + "62": 0.5366, + "63": 0.54423, + "64": 0.5344, + "65": 0.53864, + "66": 0.54089, + "67": 0.53579, + "68": 0.54777, + "69": 0.54032, + "70": 0.54348, + 
"71": 0.5411, + "72": 0.54019, + "73": 0.53851, + "74": 0.54021, + "75": 0.53784, + "76": 0.53954, + "77": 0.54237, + "78": 0.53049, + "79": 0.57915, + "80": 0.57307, + "81": 0.56876, + "82": 0.56781, + "83": 0.56481, + "84": 0.55385, + "85": 0.56577, + "86": 0.569, + "87": 0.5621, + "88": 0.56698, + "89": 0.55835, + "90": 0.85395, + "91": 0.56888, + "92": 0.55621, + "93": 0.57143, + "94": 0.5584, + "95": 0.56204, + "96": 0.5656, + "97": 0.5491, + "98": 0.56348, + "99": 0.5607, + "100": 0.56258 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_gb200.json new file mode 100644 index 00000000000..bfea64b8438 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_gb200.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.79193, + "2": 10.81245, + "3": 10.79181, + "4": 10.78209, + "5": 10.82295, + "6": 10.83309, + "7": 10.81351, + "8": 10.81215, + "9": 10.81457, + "10": 10.76068, + "11": 10.84185, + "12": 10.82404, + "13": 10.83895, + "14": 10.84433, + "15": 10.79974, + "16": 10.78654, + "17": 10.76789, + "18": 10.77495, + "19": 10.77669, + "20": 10.71893, + "21": 10.69691, + "22": 10.5691, + "23": 10.7131, + "24": 10.59975, + "25": 10.56123, + "26": 10.60735, + "27": 10.63093, + "28": 10.6064, + "29": 10.61213, + "30": 10.39823, + "31": 10.16422, + "32": 10.49019, + "33": 10.48385, + "34": 10.26645, + "35": 10.31743, + "36": 10.28264, + "37": 10.39002, + "38": 10.25116, + "39": 10.43811, + "40": 10.1403, + "41": 10.19191, + "42": 10.25886, + "43": 9.91588, + "44": 10.02837, + "45": 9.91815, + "46": 9.89353, + "47": 10.20144, + "48": 9.92509, + "49": 9.62973, + "50": 9.97857 + } + }, + "num-zeros": { + 
"start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 5192.0, + "2": 5510.0, + "3": 5508.0, + "4": 5240.0, + "5": 6136.0, + "6": 6180.0, + "7": 5549.0, + "8": 5242.0, + "9": 5717.0, + "10": 4818.0, + "11": 6299.0, + "12": 5746.0, + "13": 6110.0, + "14": 6165.0, + "15": 5683.0, + "16": 5805.0, + "17": 5758.0, + "18": 5546.0, + "19": 5787.0, + "20": 5231.0, + "21": 5741.0, + "22": 5126.0, + "23": 6019.0, + "24": 5410.0, + "25": 5100.0, + "26": 5630.0, + "27": 5627.0, + "28": 6146.0, + "29": 6174.0, + "30": 5570.0, + "31": 4768.0, + "32": 5926.0, + "33": 6348.0, + "34": 5389.0, + "35": 5856.0, + "36": 5741.0, + "37": 6611.0, + "38": 6262.0, + "39": 6971.0, + "40": 6094.0, + "41": 6227.0, + "42": 6622.0, + "43": 5761.0, + "44": 5929.0, + "45": 5769.0, + "46": 6141.0, + "47": 6909.0, + "48": 6650.0, + "49": 6100.0, + "50": 6753.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 627716608.0, + "2": 627719168.0, + "3": 627717632.0, + "4": 627719680.0, + "5": 627717120.0, + "6": 627717120.0, + "7": 627719680.0, + "8": 627716608.0, + "9": 627718144.0, + "10": 627718144.0, + "11": 627717632.0, + "12": 627718144.0, + "13": 627719168.0, + "14": 627718144.0, + "15": 627722240.0, + "16": 627718144.0, + "17": 627720704.0, + "18": 627719680.0, + "19": 627719168.0, + "20": 627718144.0, + "21": 627718656.0, + "22": 627723264.0, + "23": 627720192.0, + "24": 627719680.0, + "25": 627718144.0, + "26": 627719168.0, + "27": 627719168.0, + "28": 627718144.0, + "29": 627718144.0, + "30": 627719168.0, + "31": 627719168.0, + "32": 627719168.0, + "33": 627717632.0, + "34": 627719680.0, + "35": 627721216.0, + "36": 627717120.0, + "37": 627719168.0, + "38": 627721216.0, + "39": 627719168.0, + "40": 627718656.0, + "41": 627718144.0, + "42": 627717632.0, + "43": 627717120.0, + "44": 627718656.0, + "45": 627717632.0, + "46": 627717120.0, + "47": 627719168.0, + "48": 627718144.0, + "49": 627716608.0, + "50": 
627716096.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 879803392.0, + "2": 1114769920.0, + "3": 1114769920.0, + "4": 1116293632.0, + "5": 1116293632.0, + "6": 1116293632.0, + "7": 1116293632.0, + "8": 1116293632.0, + "9": 1116293632.0, + "10": 1116293632.0, + "11": 1116293632.0, + "12": 1116293632.0, + "13": 1116293632.0, + "14": 1116293632.0, + "15": 1116293632.0, + "16": 1116293632.0, + "17": 1116293632.0, + "18": 1116293632.0, + "19": 1116293632.0, + "20": 1116293632.0, + "21": 1116293632.0, + "22": 1116293632.0, + "23": 1116293632.0, + "24": 1116293632.0, + "25": 1116293632.0, + "26": 1116293632.0, + "27": 1116293632.0, + "28": 1116293632.0, + "29": 1116293632.0, + "30": 1116293632.0, + "31": 1116293632.0, + "32": 1116293632.0, + "33": 1116293632.0, + "34": 1116293632.0, + "35": 1116293632.0, + "36": 1116293632.0, + "37": 1116293632.0, + "38": 1116293632.0, + "39": 1116293632.0, + "40": 1116293632.0, + "41": 1116293632.0, + "42": 1116293632.0, + "43": 1116293632.0, + "44": 1116293632.0, + "45": 1116293632.0, + "46": 1116293632.0, + "47": 1116293632.0, + "48": 1116293632.0, + "49": 1116293632.0, + "50": 1116293632.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": "nan", + "2": 7.71846, + "3": 0.76188, + "4": 0.74577, + "5": 0.73403, + "6": 0.73193, + "7": 0.73107, + "8": 0.72199, + "9": 0.726, + "10": 0.71891, + "11": 0.72723, + "12": 0.71504, + "13": 0.71448, + "14": 0.71551, + "15": 0.71936, + "16": 0.71512, + "17": 0.73948, + "18": 0.83787, + "19": 0.94178, + "20": 0.98096, + "21": 0.71399, + "22": 0.87302, + "23": 0.71359, + "24": 0.7104, + "25": 0.70807, + "26": 0.71636, + "27": 0.70864, + "28": 0.72237, + "29": 0.7163, + "30": 0.7153, + "31": 0.71793, + "32": 0.70846, + "33": 0.7079, + "34": 0.71058, + "35": 0.71492, + "36": 0.72031, + "37": 0.71537, + "38": 0.70333, + "39": 0.70449, + "40": 0.71725, + "41": 0.72322, + 
"42": 0.7105, + "43": 0.70421, + "44": 0.70441, + "45": 0.70449, + "46": 0.7091, + "47": 0.70989, + "48": 0.70781, + "49": 0.71985, + "50": 0.70534 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgx_gb200.json new file mode 100644 index 00000000000..2bcdb30bc50 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgx_gb200.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.79193, + "2": 10.81245, + "3": 10.79181, + "4": 10.78209, + "5": 10.82295, + "6": 10.83309, + "7": 10.81351, + "8": 10.81215, + "9": 10.81457, + "10": 10.76068, + "11": 10.84185, + "12": 10.82404, + "13": 10.83895, + "14": 10.84433, + "15": 10.79974, + "16": 10.78654, + "17": 10.76789, + "18": 10.77495, + "19": 10.77669, + "20": 10.71893, + "21": 10.69691, + "22": 10.5691, + "23": 10.7131, + "24": 10.59975, + "25": 10.56123, + "26": 10.60735, + "27": 10.63093, + "28": 10.6064, + "29": 10.61213, + "30": 10.39823, + "31": 10.16422, + "32": 10.49019, + "33": 10.48385, + "34": 10.26645, + "35": 10.31743, + "36": 10.28264, + "37": 10.39002, + "38": 10.25116, + "39": 10.43811, + "40": 10.1403, + "41": 10.19191, + "42": 10.25886, + "43": 9.91588, + "44": 10.02837, + "45": 9.91815, + "46": 9.89353, + "47": 10.20144, + "48": 9.92509, + "49": 9.62973, + "50": 9.97857 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 5192.0, + "2": 5510.0, + "3": 5508.0, + "4": 5240.0, + "5": 6136.0, + "6": 6180.0, + "7": 5549.0, + "8": 5242.0, + "9": 5717.0, + "10": 4818.0, + "11": 6299.0, + "12": 5746.0, + "13": 6110.0, + "14": 6165.0, + "15": 5683.0, + "16": 5805.0, + "17": 
5758.0, + "18": 5546.0, + "19": 5787.0, + "20": 5231.0, + "21": 5741.0, + "22": 5126.0, + "23": 6019.0, + "24": 5410.0, + "25": 5100.0, + "26": 5630.0, + "27": 5627.0, + "28": 6146.0, + "29": 6174.0, + "30": 5570.0, + "31": 4768.0, + "32": 5926.0, + "33": 6348.0, + "34": 5389.0, + "35": 5856.0, + "36": 5741.0, + "37": 6611.0, + "38": 6262.0, + "39": 6971.0, + "40": 6094.0, + "41": 6227.0, + "42": 6622.0, + "43": 5761.0, + "44": 5929.0, + "45": 5769.0, + "46": 6141.0, + "47": 6909.0, + "48": 6650.0, + "49": 6100.0, + "50": 6753.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 627716608.0, + "2": 627719168.0, + "3": 627717632.0, + "4": 627719680.0, + "5": 627717120.0, + "6": 627717120.0, + "7": 627719680.0, + "8": 627716608.0, + "9": 627718144.0, + "10": 627718144.0, + "11": 627717632.0, + "12": 627718144.0, + "13": 627719168.0, + "14": 627718144.0, + "15": 627722240.0, + "16": 627718144.0, + "17": 627720704.0, + "18": 627719680.0, + "19": 627719168.0, + "20": 627718144.0, + "21": 627718656.0, + "22": 627723264.0, + "23": 627720192.0, + "24": 627719680.0, + "25": 627718144.0, + "26": 627719168.0, + "27": 627719168.0, + "28": 627718144.0, + "29": 627718144.0, + "30": 627719168.0, + "31": 627719168.0, + "32": 627719168.0, + "33": 627717632.0, + "34": 627719680.0, + "35": 627721216.0, + "36": 627717120.0, + "37": 627719168.0, + "38": 627721216.0, + "39": 627719168.0, + "40": 627718656.0, + "41": 627718144.0, + "42": 627717632.0, + "43": 627717120.0, + "44": 627718656.0, + "45": 627717632.0, + "46": 627717120.0, + "47": 627719168.0, + "48": 627718144.0, + "49": 627716608.0, + "50": 627716096.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 879803392.0, + "2": 1114769920.0, + "3": 1114769920.0, + "4": 1116293632.0, + "5": 1116293632.0, + "6": 1116293632.0, + "7": 1116293632.0, + "8": 1116293632.0, + "9": 1116293632.0, + "10": 
1116293632.0, + "11": 1116293632.0, + "12": 1116293632.0, + "13": 1116293632.0, + "14": 1116293632.0, + "15": 1116293632.0, + "16": 1116293632.0, + "17": 1116293632.0, + "18": 1116293632.0, + "19": 1116293632.0, + "20": 1116293632.0, + "21": 1116293632.0, + "22": 1116293632.0, + "23": 1116293632.0, + "24": 1116293632.0, + "25": 1116293632.0, + "26": 1116293632.0, + "27": 1116293632.0, + "28": 1116293632.0, + "29": 1116293632.0, + "30": 1116293632.0, + "31": 1116293632.0, + "32": 1116293632.0, + "33": 1116293632.0, + "34": 1116293632.0, + "35": 1116293632.0, + "36": 1116293632.0, + "37": 1116293632.0, + "38": 1116293632.0, + "39": 1116293632.0, + "40": 1116293632.0, + "41": 1116293632.0, + "42": 1116293632.0, + "43": 1116293632.0, + "44": 1116293632.0, + "45": 1116293632.0, + "46": 1116293632.0, + "47": 1116293632.0, + "48": 1116293632.0, + "49": 1116293632.0, + "50": 1116293632.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": "nan", + "2": 7.52257, + "3": 0.74502, + "4": 0.74089, + "5": 0.73009, + "6": 0.73041, + "7": 0.73704, + "8": 0.71933, + "9": 0.72466, + "10": 1.0546, + "11": 0.71525, + "12": 0.71298, + "13": 0.71412, + "14": 0.71521, + "15": 0.71883, + "16": 0.71464, + "17": 0.72192, + "18": 1.32991, + "19": 0.92083, + "20": 0.72233, + "21": 0.71533, + "22": 0.7144, + "23": 0.71011, + "24": 0.71396, + "25": 0.70984, + "26": 0.7111, + "27": 0.71496, + "28": 0.71187, + "29": 0.71729, + "30": 0.72095, + "31": 0.71436, + "32": 0.70963, + "33": 0.71384, + "34": 0.71534, + "35": 0.7148, + "36": 0.71389, + "37": 0.71097, + "38": 0.71244, + "39": 0.7048, + "40": 0.715, + "41": 1.08196, + "42": 0.71129, + "43": 0.73716, + "44": 0.72639, + "45": 0.71182, + "46": 0.71576, + "47": 0.72917, + "48": 0.72017, + "49": 0.72166, + "50": 0.70656 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgx_gb200.json 
b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgx_gb200.json new file mode 100644 index 00000000000..9a3140994d3 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgx_gb200.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.80801, + "2": 10.80912, + "3": 10.81804, + "4": 10.77924, + "5": 10.83867, + "6": 10.8473, + "7": 10.80366, + "8": 10.81171, + "9": 10.80948, + "10": 10.77389, + "11": 10.85853, + "12": 10.83206, + "13": 10.84778, + "14": 10.86351, + "15": 10.77822, + "16": 10.78665, + "17": 10.74969, + "18": 10.78174, + "19": 10.77893, + "20": 10.71133, + "21": 10.68188, + "22": 10.53221, + "23": 10.70751, + "24": 10.58301, + "25": 10.53686, + "26": 10.59662, + "27": 10.62332, + "28": 10.58807, + "29": 10.61089, + "30": 10.39372, + "31": 10.1118, + "32": 10.4835, + "33": 10.48693, + "34": 10.23859, + "35": 10.29466, + "36": 10.25749, + "37": 10.38723, + "38": 10.24326, + "39": 10.43603, + "40": 10.12881, + "41": 10.18559, + "42": 10.25677, + "43": 9.8808, + "44": 10.00863, + "45": 9.89409, + "46": 9.85423, + "47": 10.1998, + "48": 9.90437, + "49": 9.58703, + "50": 9.96891 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 5550.0, + "2": 5910.0, + "3": 5966.0, + "4": 5860.0, + "5": 6749.0, + "6": 6869.0, + "7": 6129.0, + "8": 5905.0, + "9": 6154.0, + "10": 5266.0, + "11": 6792.0, + "12": 6324.0, + "13": 6845.0, + "14": 6862.0, + "15": 6306.0, + "16": 6524.0, + "17": 6567.0, + "18": 6194.0, + "19": 6515.0, + "20": 5979.0, + "21": 6327.0, + "22": 5748.0, + "23": 6749.0, + "24": 5978.0, + "25": 5661.0, + "26": 6206.0, + "27": 6307.0, + "28": 7003.0, + "29": 7124.0, + "30": 6390.0, + "31": 5578.0, + "32": 6783.0, + "33": 7031.0, + "34": 6306.0, + "35": 6516.0, + "36": 6614.0, + "37": 7690.0, + "38": 7193.0, 
+ "39": 7850.0, + "40": 7170.0, + "41": 6880.0, + "42": 7329.0, + "43": 6669.0, + "44": 6616.0, + "45": 6700.0, + "46": 7080.0, + "47": 7661.0, + "48": 7259.0, + "49": 7083.0, + "50": 7418.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 458211840.0, + "2": 458214400.0, + "3": 458210304.0, + "4": 458211840.0, + "5": 458212864.0, + "6": 458211840.0, + "7": 458213376.0, + "8": 458212352.0, + "9": 458213888.0, + "10": 458214400.0, + "11": 458212864.0, + "12": 458211840.0, + "13": 458214400.0, + "14": 458213888.0, + "15": 458214912.0, + "16": 458210816.0, + "17": 458213888.0, + "18": 458212864.0, + "19": 458214400.0, + "20": 458212352.0, + "21": 458214912.0, + "22": 458217472.0, + "23": 458213888.0, + "24": 458215424.0, + "25": 458212864.0, + "26": 458211328.0, + "27": 458213888.0, + "28": 458212864.0, + "29": 458213376.0, + "30": 458211840.0, + "31": 458214400.0, + "32": 458213888.0, + "33": 458213376.0, + "34": 458214400.0, + "35": 458213888.0, + "36": 458213888.0, + "37": 458212352.0, + "38": 458211328.0, + "39": 458212352.0, + "40": 458214912.0, + "41": 458212864.0, + "42": 458214912.0, + "43": 458215936.0, + "44": 458213376.0, + "45": 458212352.0, + "46": 458214400.0, + "47": 458214400.0, + "48": 458214400.0, + "49": 458212864.0, + "50": 458212352.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1027753472.0, + "2": 1191298048.0, + "3": 1191298048.0, + "4": 1192127488.0, + "5": 1192127488.0, + "6": 1192127488.0, + "7": 1192127488.0, + "8": 1192127488.0, + "9": 1192127488.0, + "10": 1192127488.0, + "11": 1192546816.0, + "12": 1192546816.0, + "13": 1193283584.0, + "14": 1193283584.0, + "15": 1193283584.0, + "16": 1193283584.0, + "17": 1193283584.0, + "18": 1193283584.0, + "19": 1193283584.0, + "20": 1193283584.0, + "21": 1193283584.0, + "22": 1193556992.0, + "23": 1193556992.0, + "24": 1193556992.0, + "25": 1193556992.0, + 
"26": 1193556992.0, + "27": 1193556992.0, + "28": 1193556992.0, + "29": 1193556992.0, + "30": 1193556992.0, + "31": 1193556992.0, + "32": 1193556992.0, + "33": 1193556992.0, + "34": 1193556992.0, + "35": 1193556992.0, + "36": 1193556992.0, + "37": 1193556992.0, + "38": 1193556992.0, + "39": 1193556992.0, + "40": 1193556992.0, + "41": 1193556992.0, + "42": 1193556992.0, + "43": 1193556992.0, + "44": 1193556992.0, + "45": 1193556992.0, + "46": 1193556992.0, + "47": 1193556992.0, + "48": 1193556992.0, + "49": 1193556992.0, + "50": 1193556992.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": "nan", + "2": 5.55037, + "3": 0.71803, + "4": 1.02899, + "5": 0.69274, + "6": 0.69497, + "7": 0.70712, + "8": 0.70476, + "9": 0.71137, + "10": 0.69978, + "11": 0.69201, + "12": 0.68876, + "13": 0.68954, + "14": 0.69496, + "15": 0.70166, + "16": 0.6946, + "17": 0.69269, + "18": 0.69041, + "19": 0.69219, + "20": 0.69191, + "21": 0.68931, + "22": 0.69642, + "23": 0.7087, + "24": 0.71205, + "25": 0.75075, + "26": 0.71466, + "27": 0.79945, + "28": 0.68459, + "29": 0.69018, + "30": 0.68977, + "31": 0.69421, + "32": 0.68991, + "33": 0.70331, + "34": 0.70581, + "35": 0.69718, + "36": 0.69748, + "37": 0.69248, + "38": 0.69828, + "39": 0.68816, + "40": 0.69315, + "41": 0.69476, + "42": 0.69711, + "43": 0.70588, + "44": 0.69538, + "45": 0.69598, + "46": 0.70429, + "47": 0.69137, + "48": 0.69183, + "49": 0.70009, + "50": 0.69388 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgx_gb200.json new file mode 100644 index 00000000000..883b2c99518 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgx_gb200.json @@ 
-0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.80801, + "2": 10.80912, + "3": 10.81804, + "4": 10.77924, + "5": 10.83867, + "6": 10.8473, + "7": 10.80366, + "8": 10.81171, + "9": 10.80948, + "10": 10.77389, + "11": 10.85853, + "12": 10.83206, + "13": 10.84778, + "14": 10.86351, + "15": 10.77822, + "16": 10.78665, + "17": 10.74969, + "18": 10.78174, + "19": 10.77893, + "20": 10.71133, + "21": 10.68188, + "22": 10.53221, + "23": 10.70751, + "24": 10.58301, + "25": 10.53686, + "26": 10.59662, + "27": 10.62332, + "28": 10.58807, + "29": 10.61089, + "30": 10.39372, + "31": 10.1118, + "32": 10.4835, + "33": 10.48693, + "34": 10.23859, + "35": 10.29466, + "36": 10.25749, + "37": 10.38723, + "38": 10.24326, + "39": 10.43603, + "40": 10.12881, + "41": 10.18559, + "42": 10.25677, + "43": 9.8808, + "44": 10.00863, + "45": 9.89409, + "46": 9.85423, + "47": 10.1998, + "48": 9.90437, + "49": 9.58703, + "50": 9.96891 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 5550.0, + "2": 5910.0, + "3": 5966.0, + "4": 5860.0, + "5": 6749.0, + "6": 6869.0, + "7": 6129.0, + "8": 5905.0, + "9": 6154.0, + "10": 5266.0, + "11": 6792.0, + "12": 6324.0, + "13": 6845.0, + "14": 6862.0, + "15": 6306.0, + "16": 6524.0, + "17": 6567.0, + "18": 6194.0, + "19": 6515.0, + "20": 5979.0, + "21": 6327.0, + "22": 5748.0, + "23": 6749.0, + "24": 5978.0, + "25": 5661.0, + "26": 6206.0, + "27": 6307.0, + "28": 7003.0, + "29": 7124.0, + "30": 6390.0, + "31": 5578.0, + "32": 6783.0, + "33": 7031.0, + "34": 6306.0, + "35": 6516.0, + "36": 6614.0, + "37": 7690.0, + "38": 7193.0, + "39": 7850.0, + "40": 7170.0, + "41": 6880.0, + "42": 7329.0, + "43": 6669.0, + "44": 6616.0, + "45": 6700.0, + "46": 7080.0, + "47": 7661.0, + "48": 7259.0, + "49": 7083.0, + "50": 7418.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 
458211840.0, + "2": 458214400.0, + "3": 458210304.0, + "4": 458211840.0, + "5": 458212864.0, + "6": 458211840.0, + "7": 458213376.0, + "8": 458212352.0, + "9": 458213888.0, + "10": 458214400.0, + "11": 458212864.0, + "12": 458211840.0, + "13": 458214400.0, + "14": 458213888.0, + "15": 458214912.0, + "16": 458210816.0, + "17": 458213888.0, + "18": 458212864.0, + "19": 458214400.0, + "20": 458212352.0, + "21": 458214912.0, + "22": 458217472.0, + "23": 458213888.0, + "24": 458215424.0, + "25": 458212864.0, + "26": 458211328.0, + "27": 458213888.0, + "28": 458212864.0, + "29": 458213376.0, + "30": 458211840.0, + "31": 458214400.0, + "32": 458213888.0, + "33": 458213376.0, + "34": 458214400.0, + "35": 458213888.0, + "36": 458213888.0, + "37": 458212352.0, + "38": 458211328.0, + "39": 458212352.0, + "40": 458214912.0, + "41": 458212864.0, + "42": 458214912.0, + "43": 458215936.0, + "44": 458213376.0, + "45": 458212352.0, + "46": 458214400.0, + "47": 458214400.0, + "48": 458214400.0, + "49": 458212864.0, + "50": 458212352.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1027753472.0, + "2": 1191298048.0, + "3": 1191298048.0, + "4": 1192127488.0, + "5": 1192127488.0, + "6": 1192127488.0, + "7": 1192127488.0, + "8": 1192127488.0, + "9": 1192127488.0, + "10": 1192127488.0, + "11": 1192546816.0, + "12": 1192546816.0, + "13": 1193283584.0, + "14": 1193283584.0, + "15": 1193283584.0, + "16": 1193283584.0, + "17": 1193283584.0, + "18": 1193283584.0, + "19": 1193283584.0, + "20": 1193283584.0, + "21": 1193283584.0, + "22": 1193556992.0, + "23": 1193556992.0, + "24": 1193556992.0, + "25": 1193556992.0, + "26": 1193556992.0, + "27": 1193556992.0, + "28": 1193556992.0, + "29": 1193556992.0, + "30": 1193556992.0, + "31": 1193556992.0, + "32": 1193556992.0, + "33": 1193556992.0, + "34": 1193556992.0, + "35": 1193556992.0, + "36": 1193556992.0, + "37": 1193556992.0, + "38": 1193556992.0, + "39": 1193556992.0, + 
"40": 1193556992.0, + "41": 1193556992.0, + "42": 1193556992.0, + "43": 1193556992.0, + "44": 1193556992.0, + "45": 1193556992.0, + "46": 1193556992.0, + "47": 1193556992.0, + "48": 1193556992.0, + "49": 1193556992.0, + "50": 1193556992.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": "nan", + "2": 5.57426, + "3": 0.69219, + "4": 0.73624, + "5": 0.67766, + "6": 0.68248, + "7": 0.69364, + "8": 0.67336, + "9": 0.67768, + "10": 0.64366, + "11": 0.62986, + "12": 0.62576, + "13": 0.618, + "14": 0.61177, + "15": 0.61656, + "16": 0.61633, + "17": 0.61648, + "18": 0.62197, + "19": 0.61422, + "20": 0.61923, + "21": 0.61598, + "22": 0.62583, + "23": 0.62054, + "24": 0.61791, + "25": 0.62065, + "26": 0.61387, + "27": 0.61437, + "28": 0.61372, + "29": 0.65198, + "30": 0.66353, + "31": 0.68179, + "32": 0.67222, + "33": 0.67462, + "34": 0.68277, + "35": 0.66769, + "36": 0.66387, + "37": 0.66002, + "38": 0.63341, + "39": 0.62396, + "40": 0.62802, + "41": 0.62419, + "42": 0.61655, + "43": 0.62362, + "44": 0.61679, + "45": 0.61772, + "46": 0.62253, + "47": 0.61779, + "48": 0.61712, + "49": 0.63575, + "50": 0.67932 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_gb200.json new file mode 100644 index 00000000000..3e910ef7869 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_gb200.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.79574, + "2": 10.81485, + "3": 10.78713, + "4": 10.78269, + "5": 10.82015, + "6": 10.83331, + "7": 10.81116, + "8": 10.81446, + "9": 10.81645, + "10": 10.75997, + "11": 10.8388, + "12": 10.81544, + "13": 10.84141, + "14": 10.8476, 
+ "15": 10.79857, + "16": 10.78544, + "17": 10.77004, + "18": 10.77906, + "19": 10.7689, + "20": 10.71392, + "21": 10.69182, + "22": 10.56438, + "23": 10.70939, + "24": 10.60304, + "25": 10.55748, + "26": 10.60238, + "27": 10.62835, + "28": 10.59772, + "29": 10.61013, + "30": 10.40394, + "31": 10.17092, + "32": 10.49069, + "33": 10.48436, + "34": 10.26719, + "35": 10.31532, + "36": 10.27654, + "37": 10.39353, + "38": 10.24536, + "39": 10.43863, + "40": 10.13998, + "41": 10.19151, + "42": 10.25868, + "43": 9.9191, + "44": 10.03026, + "45": 9.92187, + "46": 9.89763, + "47": 10.1946, + "48": 9.93001, + "49": 9.62787, + "50": 9.97966 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 13025.0, + "2": 14911.0, + "3": 14651.0, + "4": 13760.0, + "5": 16297.0, + "6": 16032.0, + "7": 15521.0, + "8": 13170.0, + "9": 15403.0, + "10": 12605.0, + "11": 16803.0, + "12": 15289.0, + "13": 16415.0, + "14": 16182.0, + "15": 15127.0, + "16": 16135.0, + "17": 15282.0, + "18": 15280.0, + "19": 15379.0, + "20": 13642.0, + "21": 14281.0, + "22": 13476.0, + "23": 16892.0, + "24": 13920.0, + "25": 13236.0, + "26": 15256.0, + "27": 15454.0, + "28": 15973.0, + "29": 16892.0, + "30": 14103.0, + "31": 13113.0, + "32": 16067.0, + "33": 16788.0, + "34": 14559.0, + "35": 14974.0, + "36": 15798.0, + "37": 17569.0, + "38": 16172.0, + "39": 17774.0, + "40": 16088.0, + "41": 16616.0, + "42": 17149.0, + "43": 15487.0, + "44": 15110.0, + "45": 16499.0, + "46": 17407.0, + "47": 19502.0, + "48": 16568.0, + "49": 16613.0, + "50": 18892.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 625796096.0, + "2": 625850368.0, + "3": 625987072.0, + "4": 625831424.0, + "5": 625794048.0, + "6": 625789952.0, + "7": 625830912.0, + "8": 625794048.0, + "9": 625861120.0, + "10": 625806848.0, + "11": 625795584.0, + "12": 626022912.0, + "13": 625802240.0, + "14": 625853952.0, + "15": 625796608.0, + "16": 
625793024.0, + "17": 625798144.0, + "18": 625802240.0, + "19": 625792000.0, + "20": 625793536.0, + "21": 626690048.0, + "22": 626176000.0, + "23": 626092032.0, + "24": 625794560.0, + "25": 626540544.0, + "26": 625934848.0, + "27": 625799168.0, + "28": 625801728.0, + "29": 625793536.0, + "30": 626191360.0, + "31": 626149376.0, + "32": 626774016.0, + "33": 625792512.0, + "34": 625793024.0, + "35": 625851904.0, + "36": 625809408.0, + "37": 625794048.0, + "38": 625827328.0, + "39": 625865216.0, + "40": 625831936.0, + "41": 626081280.0, + "42": 626046464.0, + "43": 625792000.0, + "44": 625792000.0, + "45": 626266112.0, + "46": 626042880.0, + "47": 625789440.0, + "48": 625905152.0, + "49": 625883648.0, + "50": 626099712.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1844879360.0, + "2": 2076399104.0, + "3": 2076399104.0, + "4": 2079827456.0, + "5": 2079827456.0, + "6": 2079827456.0, + "7": 2079827456.0, + "8": 2079827456.0, + "9": 2079827456.0, + "10": 2079827456.0, + "11": 2079827456.0, + "12": 2079827456.0, + "13": 2079827456.0, + "14": 2079827456.0, + "15": 2079827456.0, + "16": 2079827456.0, + "17": 2079827456.0, + "18": 2079827456.0, + "19": 2079827456.0, + "20": 2079827456.0, + "21": 2079827456.0, + "22": 2079827456.0, + "23": 2079827456.0, + "24": 2079827456.0, + "25": 2079827456.0, + "26": 2079827456.0, + "27": 2079827456.0, + "28": 2079827456.0, + "29": 2079827456.0, + "30": 2079827456.0, + "31": 2079827456.0, + "32": 2079827456.0, + "33": 2079827456.0, + "34": 2079827456.0, + "35": 2079827456.0, + "36": 2079827456.0, + "37": 2079827456.0, + "38": 2079827456.0, + "39": 2079827456.0, + "40": 2079827456.0, + "41": 2079827456.0, + "42": 2079827456.0, + "43": 2079827456.0, + "44": 2079827456.0, + "45": 2079827456.0, + "46": 2079827456.0, + "47": 2079827456.0, + "48": 2079827456.0, + "49": 2079827456.0, + "50": 2079827456.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + 
"step_interval": 1, + "values": { + "1": "nan", + "2": 5.62487, + "3": 0.39428, + "4": 0.3711, + "5": 0.36777, + "6": 0.36423, + "7": 0.35408, + "8": 0.35462, + "9": 0.35588, + "10": 0.35204, + "11": 0.35155, + "12": 0.35049, + "13": 0.35818, + "14": 0.35461, + "15": 0.36874, + "16": 0.367, + "17": 0.37423, + "18": 0.36926, + "19": 0.37139, + "20": 0.37109, + "21": 0.37066, + "22": 0.37237, + "23": 0.37636, + "24": 0.37618, + "25": 0.37461, + "26": 0.37622, + "27": 0.37576, + "28": 0.37551, + "29": 0.3765, + "30": 0.3787, + "31": 0.38695, + "32": 0.37235, + "33": 0.37931, + "34": 0.37817, + "35": 0.3749, + "36": 0.37829, + "37": 0.37774, + "38": 0.3755, + "39": 0.37889, + "40": 0.37688, + "41": 0.38007, + "42": 0.37324, + "43": 0.36948, + "44": 0.37523, + "45": 0.37464, + "46": 0.38496, + "47": 0.3737, + "48": 0.37892, + "49": 0.39066, + "50": 0.37612 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_muon/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_muon/golden_values_dev_dgx_gb200.json new file mode 100644 index 00000000000..d9441fb83aa --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_muon/golden_values_dev_dgx_gb200.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.80397, + "2": 10.81064, + "3": 10.798, + "4": 10.78655, + "5": 10.8299, + "6": 10.85307, + "7": 10.80967, + "8": 10.8015, + "9": 10.82484, + "10": 10.78237, + "11": 10.83805, + "12": 10.84658, + "13": 10.86137, + "14": 10.86451, + "15": 10.83791, + "16": 10.83385, + "17": 10.81249, + "18": 10.84432, + "19": 10.83623, + "20": 10.8168, + "21": 10.83551, + "22": 10.76274, + "23": 10.85573, + "24": 10.8062, + "25": 10.80042, + "26": 10.8143, + "27": 10.82932, + "28": 10.8546, + "29": 10.86594, + "30": 10.79379, + "31": 
10.74776, + "32": 10.84932, + "33": 10.83727, + "34": 10.80597, + "35": 10.80259, + "36": 10.79662, + "37": 10.82559, + "38": 10.79231, + "39": 10.84778, + "40": 10.77804, + "41": 10.79895, + "42": 10.81493, + "43": 10.74316, + "44": 10.76656, + "45": 10.76408, + "46": 10.7768, + "47": 10.79908, + "48": 10.77572, + "49": 10.72207, + "50": 10.78609, + "51": 10.78712, + "52": 10.7653, + "53": 10.81235, + "54": 10.79776, + "55": 10.8072, + "56": 10.7562, + "57": 10.71334, + "58": 10.78166, + "59": 10.75039, + "60": 10.72977, + "61": 10.76435, + "62": 10.81299, + "63": 10.69266, + "64": 10.76646, + "65": 10.62484, + "66": 10.75371, + "67": 10.69118, + "68": 10.77122, + "69": 10.76048, + "70": 10.76506, + "71": 10.73497, + "72": 10.72999, + "73": 10.71715, + "74": 10.57819, + "75": 10.68208, + "76": 10.6133, + "77": 10.80786, + "78": 10.73142, + "79": 10.66063, + "80": 10.68014, + "81": 10.69828, + "82": 10.72277, + "83": 10.64104, + "84": 10.66223, + "85": 10.70251, + "86": 10.57982, + "87": 10.69083, + "88": 10.73435, + "89": 10.67796, + "90": 10.74299, + "91": 10.62241, + "92": 10.64011, + "93": 10.56628, + "94": 10.49922, + "95": 10.65675, + "96": 10.65892, + "97": 10.57941, + "98": 10.67242, + "99": 10.47965, + "100": 10.59346 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1158.0, + "2": 1181.0, + "3": 1227.0, + "4": 1263.0, + "5": 1308.0, + "6": 1594.0, + "7": 1345.0, + "8": 1253.0, + "9": 1239.0, + "10": 1232.0, + "11": 1255.0, + "12": 1250.0, + "13": 1516.0, + "14": 1270.0, + "15": 1335.0, + "16": 1260.0, + "17": 1249.0, + "18": 1261.0, + "19": 1099.0, + "20": 1387.0, + "21": 1298.0, + "22": 1286.0, + "23": 1315.0, + "24": 1131.0, + "25": 1157.0, + "26": 1166.0, + "27": 1133.0, + "28": 1243.0, + "29": 1348.0, + "30": 1235.0, + "31": 1090.0, + "32": 1272.0, + "33": 1355.0, + "34": 1161.0, + "35": 1159.0, + "36": 1146.0, + "37": 1222.0, + "38": 1418.0, + "39": 1273.0, + "40": 1198.0, + "41": 1160.0, + 
"42": 1285.0, + "43": 1094.0, + "44": 1127.0, + "45": 1130.0, + "46": 1183.0, + "47": 1312.0, + "48": 1238.0, + "49": 1068.0, + "50": 1163.0, + "51": 1234.0, + "52": 1284.0, + "53": 1380.0, + "54": 1191.0, + "55": 1099.0, + "56": 1298.0, + "57": 1241.0, + "58": 1203.0, + "59": 1324.0, + "60": 1254.0, + "61": 1120.0, + "62": 1362.0, + "63": 1213.0, + "64": 1330.0, + "65": 1057.0, + "66": 1171.0, + "67": 1208.0, + "68": 1320.0, + "69": 1304.0, + "70": 1122.0, + "71": 1259.0, + "72": 1254.0, + "73": 1203.0, + "74": 1125.0, + "75": 1413.0, + "76": 1217.0, + "77": 1412.0, + "78": 1291.0, + "79": 1020.0, + "80": 1143.0, + "81": 1243.0, + "82": 1154.0, + "83": 1052.0, + "84": 1219.0, + "85": 1360.0, + "86": 1072.0, + "87": 1319.0, + "88": 1347.0, + "89": 1127.0, + "90": 1474.0, + "91": 1140.0, + "92": 1110.0, + "93": 924.0, + "94": 1062.0, + "95": 1147.0, + "96": 1128.0, + "97": 1099.0, + "98": 1191.0, + "99": 1071.0, + "100": 1214.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 993802240.0, + "2": 993814528.0, + "3": 993790464.0, + "4": 993820160.0, + "5": 993728512.0, + "6": 993758208.0, + "7": 993780224.0, + "8": 993795584.0, + "9": 993800704.0, + "10": 993771520.0, + "11": 993752064.0, + "12": 993737216.0, + "13": 993804800.0, + "14": 993734144.0, + "15": 993773056.0, + "16": 993763840.0, + "17": 993744384.0, + "18": 993743872.0, + "19": 993764864.0, + "20": 993719296.0, + "21": 993772544.0, + "22": 993796096.0, + "23": 993748992.0, + "24": 993819136.0, + "25": 993792512.0, + "26": 993776640.0, + "27": 993804800.0, + "28": 993701888.0, + "29": 993717248.0, + "30": 993766912.0, + "31": 993802240.0, + "32": 993705984.0, + "33": 993759744.0, + "34": 993780224.0, + "35": 993740288.0, + "36": 993748480.0, + "37": 993697280.0, + "38": 993763840.0, + "39": 993747456.0, + "40": 993781248.0, + "41": 993752064.0, + "42": 993757696.0, + "43": 993793024.0, + "44": 993833984.0, + "45": 993780736.0, + "46": 
993798144.0, + "47": 993789440.0, + "48": 993793024.0, + "49": 993743360.0, + "50": 993754624.0, + "51": 993786368.0, + "52": 993749504.0, + "53": 993812992.0, + "54": 993750528.0, + "55": 993732608.0, + "56": 993777664.0, + "57": 993859584.0, + "58": 993849856.0, + "59": 993761792.0, + "60": 993774592.0, + "61": 993771520.0, + "62": 993786880.0, + "63": 993787904.0, + "64": 993761280.0, + "65": 993792000.0, + "66": 993746432.0, + "67": 993782784.0, + "68": 993783808.0, + "69": 993741824.0, + "70": 993747968.0, + "71": 993736192.0, + "72": 993762304.0, + "73": 993784832.0, + "74": 993809920.0, + "75": 993753088.0, + "76": 993797632.0, + "77": 993720832.0, + "78": 993729536.0, + "79": 993730560.0, + "80": 993763840.0, + "81": 993728000.0, + "82": 993711616.0, + "83": 993772544.0, + "84": 993782784.0, + "85": 993787392.0, + "86": 993804288.0, + "87": 993737728.0, + "88": 993731584.0, + "89": 993755136.0, + "90": 993742848.0, + "91": 993763840.0, + "92": 993774080.0, + "93": 993792000.0, + "94": 993779712.0, + "95": 993723904.0, + "96": 993714688.0, + "97": 993752064.0, + "98": 993708544.0, + "99": 993792000.0, + "100": 993812992.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 3074847232.0, + "2": 3373968896.0, + "3": 3373968896.0, + "4": 3378071552.0, + "5": 3378071552.0, + "6": 3378071552.0, + "7": 3378071552.0, + "8": 3378071552.0, + "9": 3378071552.0, + "10": 3378071552.0, + "11": 3378071552.0, + "12": 3378071552.0, + "13": 3378071552.0, + "14": 3378071552.0, + "15": 3378071552.0, + "16": 3378071552.0, + "17": 3378071552.0, + "18": 3378071552.0, + "19": 3378071552.0, + "20": 3378071552.0, + "21": 3378071552.0, + "22": 3378071552.0, + "23": 3378071552.0, + "24": 3378894848.0, + "25": 3378894848.0, + "26": 3378894848.0, + "27": 3378894848.0, + "28": 3378894848.0, + "29": 3378894848.0, + "30": 3378894848.0, + "31": 3378894848.0, + "32": 3378894848.0, + "33": 3378894848.0, + "34": 3378894848.0, 
+ "35": 3378894848.0, + "36": 3378894848.0, + "37": 3378894848.0, + "38": 3378894848.0, + "39": 3378894848.0, + "40": 3378894848.0, + "41": 3378894848.0, + "42": 3378894848.0, + "43": 3378894848.0, + "44": 3392753152.0, + "45": 3392753152.0, + "46": 3392753152.0, + "47": 3392753152.0, + "48": 3392753152.0, + "49": 3392753152.0, + "50": 3392753152.0, + "51": 3392753152.0, + "52": 3392753152.0, + "53": 3392753152.0, + "54": 3392753152.0, + "55": 3392753152.0, + "56": 3392753152.0, + "57": 3407671808.0, + "58": 3407671808.0, + "59": 3407671808.0, + "60": 3407671808.0, + "61": 3407671808.0, + "62": 3407671808.0, + "63": 3407671808.0, + "64": 3407671808.0, + "65": 3407671808.0, + "66": 3407671808.0, + "67": 3407671808.0, + "68": 3407671808.0, + "69": 3407671808.0, + "70": 3407671808.0, + "71": 3407671808.0, + "72": 3407671808.0, + "73": 3407671808.0, + "74": 3407671808.0, + "75": 3407671808.0, + "76": 3407671808.0, + "77": 3407671808.0, + "78": 3407671808.0, + "79": 3407671808.0, + "80": 3407671808.0, + "81": 3407671808.0, + "82": 3407671808.0, + "83": 3407671808.0, + "84": 3407671808.0, + "85": 3407671808.0, + "86": 3407671808.0, + "87": 3407671808.0, + "88": 3407671808.0, + "89": 3407671808.0, + "90": 3407671808.0, + "91": 3407671808.0, + "92": 3407671808.0, + "93": 3407671808.0, + "94": 3407671808.0, + "95": 3407671808.0, + "96": 3407671808.0, + "97": 3407671808.0, + "98": 3407671808.0, + "99": 3407671808.0, + "100": 3407671808.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": 3.79579, + "3": 0.25267, + "4": 0.22623, + "5": 0.22831, + "6": 0.57471, + "7": 0.22109, + "8": 0.21732, + "9": 0.2127, + "10": 0.20981, + "11": 0.21771, + "12": 0.23313, + "13": 0.20775, + "14": 0.19946, + "15": 0.21125, + "16": 0.2099, + "17": 0.20543, + "18": 0.19972, + "19": 0.20265, + "20": 0.20005, + "21": 0.20188, + "22": 0.19675, + "23": 0.19822, + "24": 0.19828, + "25": 0.19827, + "26": 0.19789, + "27": 
0.20238, + "28": 0.19366, + "29": 0.19297, + "30": 0.19521, + "31": 0.19886, + "32": 0.19176, + "33": 0.19628, + "34": 0.19156, + "35": 0.19683, + "36": 0.19061, + "37": 0.19031, + "38": 0.19383, + "39": 0.1966, + "40": 0.19152, + "41": 0.18691, + "42": 0.1917, + "43": 0.20258, + "44": 0.19552, + "45": 0.20203, + "46": 0.18769, + "47": 0.18872, + "48": 0.18493, + "49": 0.18884, + "50": 0.18824, + "51": 0.20579, + "52": 0.18818, + "53": 0.18827, + "54": 0.19298, + "55": 0.57299, + "56": 0.18813, + "57": 0.18557, + "58": 0.18597, + "59": 0.18577, + "60": 0.18756, + "61": 0.18972, + "62": 0.18872, + "63": 0.18937, + "64": 0.1888, + "65": 0.19262, + "66": 0.1879, + "67": 0.18498, + "68": 0.18535, + "69": 0.19492, + "70": 0.1923, + "71": 0.18822, + "72": 0.19191, + "73": 0.19457, + "74": 0.19765, + "75": 0.19091, + "76": 0.73064, + "77": 0.19543, + "78": 0.19034, + "79": 0.18715, + "80": 0.19339, + "81": 0.19135, + "82": 0.18703, + "83": 0.19082, + "84": 0.18783, + "85": 0.1926, + "86": 0.19556, + "87": 0.19127, + "88": 0.19028, + "89": 0.56083, + "90": 0.19223, + "91": 0.18622, + "92": 0.18536, + "93": 0.19063, + "94": 0.18804, + "95": 0.18711, + "96": 0.1883, + "97": 0.19006, + "98": 0.18897, + "99": 0.60361, + "100": 0.19278 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_gb200.json new file mode 100644 index 00000000000..35841a4eaa1 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_gb200.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.80397, + "2": 10.81064, + "3": 10.79794, + "4": 10.78736, + "5": 10.82943, + "6": 10.85276, + "7": 10.80881, + "8": 10.79936, + "9": 
10.82143, + "10": 10.77719, + "11": 10.8324, + "12": 10.8353, + "13": 10.84919, + "14": 10.85282, + "15": 10.79747, + "16": 10.78882, + "17": 10.75342, + "18": 10.78784, + "19": 10.77563, + "20": 10.70852, + "21": 10.6913, + "22": 10.53886, + "23": 10.69977, + "24": 10.58969, + "25": 10.54168, + "26": 10.60115, + "27": 10.61421, + "28": 10.59031, + "29": 10.60749, + "30": 10.38665, + "31": 10.13235, + "32": 10.49407, + "33": 10.47454, + "34": 10.23691, + "35": 10.28682, + "36": 10.26871, + "37": 10.38314, + "38": 10.23703, + "39": 10.43401, + "40": 10.1257, + "41": 10.17238, + "42": 10.24995, + "43": 9.85773, + "44": 9.98944, + "45": 9.87376, + "46": 9.84256, + "47": 10.1623, + "48": 9.89144, + "49": 9.57738, + "50": 9.96171, + "51": 9.88785, + "52": 9.76989, + "53": 10.10483, + "54": 9.99665, + "55": 9.92216, + "56": 9.67616, + "57": 9.51879, + "58": 9.89053, + "59": 9.63068, + "60": 9.55149, + "61": 9.72264, + "62": 10.0414, + "63": 9.43971, + "64": 9.8184, + "65": 8.98176, + "66": 9.75925, + "67": 9.39746, + "68": 9.83254, + "69": 9.81649, + "70": 9.75965, + "71": 9.66402, + "72": 9.63516, + "73": 9.54388, + "74": 9.00071, + "75": 9.465, + "76": 9.13889, + "77": 10.09535, + "78": 9.75814, + "79": 9.41614, + "80": 9.44749, + "81": 9.5168, + "82": 9.73156, + "83": 9.36737, + "84": 9.45017, + "85": 9.65534, + "86": 9.10891, + "87": 9.62042, + "88": 9.79408, + "89": 9.64391, + "90": 9.85314, + "91": 9.39297, + "92": 9.39817, + "93": 9.13664, + "94": 8.86865, + "95": 9.55719, + "96": 9.56146, + "97": 9.33062, + "98": 9.69677, + "99": 8.93672, + "100": 9.43355 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1158.0, + "2": 1181.0, + "3": 1239.0, + "4": 1193.0, + "5": 1380.0, + "6": 1501.0, + "7": 1391.0, + "8": 1239.0, + "9": 1272.0, + "10": 1225.0, + "11": 1337.0, + "12": 1167.0, + "13": 1452.0, + "14": 1254.0, + "15": 1269.0, + "16": 1347.0, + "17": 1234.0, + "18": 1202.0, + "19": 1183.0, + "20": 1141.0, + 
"21": 1236.0, + "22": 982.0, + "23": 1234.0, + "24": 1135.0, + "25": 1073.0, + "26": 1087.0, + "27": 1008.0, + "28": 1166.0, + "29": 1127.0, + "30": 1094.0, + "31": 929.0, + "32": 1176.0, + "33": 1123.0, + "34": 1084.0, + "35": 1110.0, + "36": 1053.0, + "37": 1208.0, + "38": 1233.0, + "39": 1210.0, + "40": 1229.0, + "41": 1217.0, + "42": 1317.0, + "43": 1212.0, + "44": 1144.0, + "45": 1374.0, + "46": 1249.0, + "47": 1163.0, + "48": 1176.0, + "49": 1335.0, + "50": 1172.0, + "51": 1345.0, + "52": 1334.0, + "53": 1388.0, + "54": 1316.0, + "55": 1263.0, + "56": 1320.0, + "57": 1143.0, + "58": 1050337.0, + "59": 1629.0, + "60": 58535.0, + "61": 1389.0, + "62": 1050420.0, + "63": 1302.0, + "64": 77146.0, + "65": 1058670.0, + "66": 1404.0, + "67": 59730.0, + "68": 50392.0, + "69": 2394.0, + "70": 108916.0, + "71": 46411.0, + "72": 2099307.0, + "73": 98574.0, + "74": 1106325.0, + "75": 4199.0, + "76": 1157870.0, + "77": 2145771.0, + "78": 1106899.0, + "79": 57707.0, + "80": 1105606.0, + "81": 1059706.0, + "82": 96668.0, + "83": 2099291.0, + "84": 1106935.0, + "85": 2099569.0, + "86": 1097473.0, + "87": 2099736.0, + "88": 2154950.0, + "89": 1154961.0, + "90": 3148620.0, + "91": 1157803.0, + "92": 3148576.0, + "93": 3148494.0, + "94": 1160976.0, + "95": 3148697.0, + "96": 3148663.0, + "97": 2147830.0, + "98": 3148776.0, + "99": 2099539.0, + "100": 3148590.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 788258816.0, + "2": 788271104.0, + "3": 788248064.0, + "4": 788275200.0, + "5": 788183552.0, + "6": 788212736.0, + "7": 788235264.0, + "8": 788251648.0, + "9": 788255232.0, + "10": 788226560.0, + "11": 788207104.0, + "12": 788192256.0, + "13": 788256256.0, + "14": 788187136.0, + "15": 788221952.0, + "16": 788214272.0, + "17": 788195328.0, + "18": 788191744.0, + "19": 788210176.0, + "20": 788163072.0, + "21": 788207616.0, + "22": 788235264.0, + "23": 788186112.0, + "24": 788249088.0, + "25": 788213248.0, + 
"26": 788204032.0, + "27": 788216832.0, + "28": 788117504.0, + "29": 788125696.0, + "30": 788169216.0, + "31": 788190720.0, + "32": 788118016.0, + "33": 788142592.0, + "34": 788156928.0, + "35": 788133376.0, + "36": 788130816.0, + "37": 788082688.0, + "38": 788156416.0, + "39": 788091904.0, + "40": 788120576.0, + "41": 788122112.0, + "42": 788104192.0, + "43": 788170752.0, + "44": 788189696.0, + "45": 788140032.0, + "46": 788148224.0, + "47": 788086272.0, + "48": 788128768.0, + "49": 788078080.0, + "50": 788078592.0, + "51": 788088832.0, + "52": 788065280.0, + "53": 788091392.0, + "54": 788059648.0, + "55": 788009472.0, + "56": 788060672.0, + "57": 788129792.0, + "58": 788124672.0, + "59": 788038656.0, + "60": 788026368.0, + "61": 788004352.0, + "62": 788007936.0, + "63": 788059136.0, + "64": 787972096.0, + "65": 788070400.0, + "66": 788031488.0, + "67": 788046336.0, + "68": 788037632.0, + "69": 787970560.0, + "70": 787992064.0, + "71": 788023808.0, + "72": 788022272.0, + "73": 788058624.0, + "74": 788075520.0, + "75": 788037632.0, + "76": 788094976.0, + "77": 787966464.0, + "78": 787980288.0, + "79": 788018176.0, + "80": 788026880.0, + "81": 787994624.0, + "82": 787986944.0, + "83": 788061696.0, + "84": 787999744.0, + "85": 787995648.0, + "86": 788012544.0, + "87": 787939328.0, + "88": 787957760.0, + "89": 787977728.0, + "90": 787927552.0, + "91": 787998720.0, + "92": 788026368.0, + "93": 788039680.0, + "94": 788032512.0, + "95": 788007424.0, + "96": 787978240.0, + "97": 788036608.0, + "98": 787984384.0, + "99": 788088320.0, + "100": 788081664.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2992141824.0, + "2": 3170010624.0, + "3": 3170010624.0, + "4": 3172966400.0, + "5": 3172966400.0, + "6": 3172966400.0, + "7": 3172966400.0, + "8": 3172966400.0, + "9": 3172966400.0, + "10": 3172966400.0, + "11": 3172966400.0, + "12": 3172966400.0, + "13": 3172966400.0, + "14": 3172966400.0, + "15": 
3172966400.0, + "16": 3172966400.0, + "17": 3172966400.0, + "18": 3172966400.0, + "19": 3172966400.0, + "20": 3172966400.0, + "21": 3172966400.0, + "22": 3172966400.0, + "23": 3172966400.0, + "24": 3172966400.0, + "25": 3172966400.0, + "26": 3172966400.0, + "27": 3172966400.0, + "28": 3172966400.0, + "29": 3172966400.0, + "30": 3172966400.0, + "31": 3172966400.0, + "32": 3172966400.0, + "33": 3172966400.0, + "34": 3172966400.0, + "35": 3172966400.0, + "36": 3172966400.0, + "37": 3172966400.0, + "38": 3172966400.0, + "39": 3172966400.0, + "40": 3172966400.0, + "41": 3172966400.0, + "42": 3172966400.0, + "43": 3172966400.0, + "44": 3172966400.0, + "45": 3172966400.0, + "46": 3172966400.0, + "47": 3172966400.0, + "48": 3172966400.0, + "49": 3172966400.0, + "50": 3172966400.0, + "51": 3172966400.0, + "52": 3172966400.0, + "53": 3172966400.0, + "54": 3172966400.0, + "55": 3172966400.0, + "56": 3172966400.0, + "57": 3172966400.0, + "58": 3172966400.0, + "59": 3172966400.0, + "60": 3172966400.0, + "61": 3172966400.0, + "62": 3172966400.0, + "63": 3172966400.0, + "64": 3172966400.0, + "65": 3172966400.0, + "66": 3172966400.0, + "67": 3172966400.0, + "68": 3172966400.0, + "69": 3172966400.0, + "70": 3172966400.0, + "71": 3172966400.0, + "72": 3172966400.0, + "73": 3172966400.0, + "74": 3172966400.0, + "75": 3172966400.0, + "76": 3172966400.0, + "77": 3172966400.0, + "78": 3172966400.0, + "79": 3172966400.0, + "80": 3172966400.0, + "81": 3172966400.0, + "82": 3172966400.0, + "83": 3172966400.0, + "84": 3172966400.0, + "85": 3172966400.0, + "86": 3172966400.0, + "87": 3172966400.0, + "88": 3172966400.0, + "89": 3172966400.0, + "90": 3172966400.0, + "91": 3172966400.0, + "92": 3172966400.0, + "93": 3172966400.0, + "94": 3172966400.0, + "95": 3172966400.0, + "96": 3172966400.0, + "97": 3172966400.0, + "98": 3172966400.0, + "99": 3172966400.0, + "100": 3172966400.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 
"nan", + "2": 4.24581, + "3": 0.22984, + "4": 0.19901, + "5": 0.20013, + "6": 0.19557, + "7": 0.19148, + "8": 0.18346, + "9": 0.18886, + "10": 0.18315, + "11": 0.18065, + "12": 0.18158, + "13": 0.17863, + "14": 0.17669, + "15": 0.16949, + "16": 0.1813, + "17": 0.17239, + "18": 0.16839, + "19": 0.17197, + "20": 0.18078, + "21": 0.17159, + "22": 0.17182, + "23": 0.17032, + "24": 0.16416, + "25": 0.16192, + "26": 0.17125, + "27": 0.15863, + "28": 0.16459, + "29": 0.16007, + "30": 0.16611, + "31": 0.16404, + "32": 0.15794, + "33": 0.16011, + "34": 0.15482, + "35": 0.15384, + "36": 0.16644, + "37": 0.15786, + "38": 0.15755, + "39": 0.15631, + "40": 0.15691, + "41": 0.15405, + "42": 0.14989, + "43": 0.1609, + "44": 0.15219, + "45": 0.15611, + "46": 0.1513, + "47": 0.15678, + "48": 0.14912, + "49": 0.14848, + "50": 0.15182, + "51": 0.16313, + "52": 0.14839, + "53": 0.14122, + "54": 0.14422, + "55": 0.14712, + "56": 0.14693, + "57": 0.14795, + "58": 0.14977, + "59": 0.15359, + "60": 0.14668, + "61": 0.15225, + "62": 0.14521, + "63": 0.14464, + "64": 0.14763, + "65": 0.14539, + "66": 0.14694, + "67": 0.1474, + "68": 0.14336, + "69": 0.14734, + "70": 0.14392, + "71": 0.14814, + "72": 0.15208, + "73": 0.15012, + "74": 0.14617, + "75": 0.14599, + "76": 0.14537, + "77": 0.15575, + "78": 0.15044, + "79": 0.68569, + "80": 0.15145, + "81": 0.1455, + "82": 0.14662, + "83": 0.14886, + "84": 0.14582, + "85": 0.14802, + "86": 0.1466, + "87": 0.14632, + "88": 0.14515, + "89": 0.14994, + "90": 0.1445, + "91": 0.14773, + "92": 0.14812, + "93": 0.14796, + "94": 0.51237, + "95": 0.15138, + "96": 0.15025, + "97": 0.14525, + "98": 0.1449, + "99": 0.1508, + "100": 0.14531 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_muon/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_muon/golden_values_dev_dgx_gb200.json new file mode 100644 index 
00000000000..fe8d3f78926 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_muon/golden_values_dev_dgx_gb200.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.80397, + "2": 10.81064, + "3": 10.79812, + "4": 10.78677, + "5": 10.82981, + "6": 10.85273, + "7": 10.80976, + "8": 10.80152, + "9": 10.82476, + "10": 10.78235, + "11": 10.83837, + "12": 10.84645, + "13": 10.86121, + "14": 10.86494, + "15": 10.83809, + "16": 10.8346, + "17": 10.8121, + "18": 10.84437, + "19": 10.83592, + "20": 10.81732, + "21": 10.83519, + "22": 10.76256, + "23": 10.85511, + "24": 10.80666, + "25": 10.80025, + "26": 10.81426, + "27": 10.82995, + "28": 10.855, + "29": 10.86565, + "30": 10.79384, + "31": 10.74771, + "32": 10.84943, + "33": 10.83771, + "34": 10.80572, + "35": 10.80265, + "36": 10.79622, + "37": 10.82514, + "38": 10.79237, + "39": 10.84811, + "40": 10.77883, + "41": 10.79922, + "42": 10.81563, + "43": 10.74376, + "44": 10.76683, + "45": 10.76467, + "46": 10.77697, + "47": 10.79973, + "48": 10.77586, + "49": 10.72215, + "50": 10.78584, + "51": 10.78731, + "52": 10.7657, + "53": 10.81241, + "54": 10.79761, + "55": 10.80688, + "56": 10.75611, + "57": 10.71341, + "58": 10.78104, + "59": 10.7507, + "60": 10.72941, + "61": 10.76448, + "62": 10.8119, + "63": 10.69242, + "64": 10.76661, + "65": 10.62474, + "66": 10.75342, + "67": 10.69134, + "68": 10.77079, + "69": 10.76029, + "70": 10.76451, + "71": 10.73531, + "72": 10.72951, + "73": 10.7174, + "74": 10.57782, + "75": 10.68245, + "76": 10.61342, + "77": 10.80749, + "78": 10.7321, + "79": 10.66078, + "80": 10.68008, + "81": 10.69796, + "82": 10.72301, + "83": 10.6413, + "84": 10.6619, + "85": 10.70249, + "86": 10.58035, + "87": 10.69015, + "88": 10.73441, + "89": 10.67777, + "90": 10.74269, + "91": 10.62186, + "92": 10.63964, + "93": 10.56627, + "94": 10.49913, + "95": 10.65738, + "96": 10.65873, + "97": 10.57872, + 
"98": 10.6722, + "99": 10.4802, + "100": 10.59334 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1158.0, + "2": 1181.0, + "3": 1248.0, + "4": 1219.0, + "5": 1372.0, + "6": 1529.0, + "7": 1362.0, + "8": 1179.0, + "9": 1276.0, + "10": 1234.0, + "11": 1281.0, + "12": 1249.0, + "13": 1386.0, + "14": 1213.0, + "15": 1215.0, + "16": 1299.0, + "17": 1242.0, + "18": 1233.0, + "19": 1167.0, + "20": 1392.0, + "21": 1264.0, + "22": 1289.0, + "23": 1336.0, + "24": 1168.0, + "25": 1170.0, + "26": 1207.0, + "27": 1192.0, + "28": 1327.0, + "29": 1354.0, + "30": 1250.0, + "31": 1110.0, + "32": 1331.0, + "33": 1340.0, + "34": 1250.0, + "35": 1105.0, + "36": 1138.0, + "37": 1265.0, + "38": 1375.0, + "39": 1243.0, + "40": 1306.0, + "41": 1154.0, + "42": 1251.0, + "43": 1122.0, + "44": 1139.0, + "45": 1122.0, + "46": 1203.0, + "47": 1405.0, + "48": 1282.0, + "49": 1167.0, + "50": 1166.0, + "51": 1249.0, + "52": 1320.0, + "53": 1340.0, + "54": 1232.0, + "55": 1103.0, + "56": 1275.0, + "57": 1194.0, + "58": 1259.0, + "59": 1283.0, + "60": 1265.0, + "61": 1124.0, + "62": 1349.0, + "63": 1132.0, + "64": 1272.0, + "65": 1017.0, + "66": 1174.0, + "67": 1242.0, + "68": 1291.0, + "69": 1295.0, + "70": 1143.0, + "71": 1148.0, + "72": 1266.0, + "73": 1199.0, + "74": 1133.0, + "75": 1346.0, + "76": 1224.0, + "77": 1329.0, + "78": 1256.0, + "79": 997.0, + "80": 1093.0, + "81": 1204.0, + "82": 1213.0, + "83": 1128.0, + "84": 1228.0, + "85": 1316.0, + "86": 1101.0, + "87": 1278.0, + "88": 1286.0, + "89": 1163.0, + "90": 1415.0, + "91": 1248.0, + "92": 1137.0, + "93": 912.0, + "94": 985.0, + "95": 1097.0, + "96": 1087.0, + "97": 1098.0, + "98": 1170.0, + "99": 1047.0, + "100": 1205.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1095588352.0, + "2": 1095600640.0, + "3": 1095576576.0, + "4": 1095606272.0, + "5": 1095514624.0, + "6": 1095542272.0, + "7": 1095563776.0, + 
"8": 1095580160.0, + "9": 1095585792.0, + "10": 1095554048.0, + "11": 1095538176.0, + "12": 1095523328.0, + "13": 1095589888.0, + "14": 1095519744.0, + "15": 1095557120.0, + "16": 1095548928.0, + "17": 1095531008.0, + "18": 1095528448.0, + "19": 1095549440.0, + "20": 1095504384.0, + "21": 1095561728.0, + "22": 1095583232.0, + "23": 1095534592.0, + "24": 1095604736.0, + "25": 1095577088.0, + "26": 1095565824.0, + "27": 1095591424.0, + "28": 1095485952.0, + "29": 1095502848.0, + "30": 1095552512.0, + "31": 1095588352.0, + "32": 1095491072.0, + "33": 1095547392.0, + "34": 1095568384.0, + "35": 1095527424.0, + "36": 1095533568.0, + "37": 1095482880.0, + "38": 1095552000.0, + "39": 1095532544.0, + "40": 1095567360.0, + "41": 1095537152.0, + "42": 1095543296.0, + "43": 1095581184.0, + "44": 1095620096.0, + "45": 1095569408.0, + "46": 1095584768.0, + "47": 1095573504.0, + "48": 1095577088.0, + "49": 1095530496.0, + "50": 1095540736.0, + "51": 1095570944.0, + "52": 1095538176.0, + "53": 1095597568.0, + "54": 1095536640.0, + "55": 1095517184.0, + "56": 1095566848.0, + "57": 1095645696.0, + "58": 1095634944.0, + "59": 1095548928.0, + "60": 1095562752.0, + "61": 1095553536.0, + "62": 1095572480.0, + "63": 1095573504.0, + "64": 1095550464.0, + "65": 1095578112.0, + "66": 1095531008.0, + "67": 1095568896.0, + "68": 1095566848.0, + "69": 1095527424.0, + "70": 1095532032.0, + "71": 1095520768.0, + "72": 1095548928.0, + "73": 1095569920.0, + "74": 1095596032.0, + "75": 1095538688.0, + "76": 1095584768.0, + "77": 1095507968.0, + "78": 1095514624.0, + "79": 1095515648.0, + "80": 1095551488.0, + "81": 1095513600.0, + "82": 1095498240.0, + "83": 1095558656.0, + "84": 1095569408.0, + "85": 1095576064.0, + "86": 1095590400.0, + "87": 1095523840.0, + "88": 1095517696.0, + "89": 1095539712.0, + "90": 1095528960.0, + "91": 1095550976.0, + "92": 1095561216.0, + "93": 1095579136.0, + "94": 1095564288.0, + "95": 1095510528.0, + "96": 1095502336.0, + "97": 1095537152.0, + "98": 1095496192.0, + 
"99": 1095577600.0, + "100": 1095598592.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 3125957632.0, + "2": 3477050368.0, + "3": 3477050368.0, + "4": 3481636352.0, + "5": 3481636352.0, + "6": 3481636352.0, + "7": 3481636352.0, + "8": 3481636352.0, + "9": 3481636352.0, + "10": 3481636352.0, + "11": 3481636352.0, + "12": 3481636352.0, + "13": 3481636352.0, + "14": 3481636352.0, + "15": 3481636352.0, + "16": 3481636352.0, + "17": 3481636352.0, + "18": 3481636352.0, + "19": 3481636352.0, + "20": 3481636352.0, + "21": 3481636352.0, + "22": 3481636352.0, + "23": 3481636352.0, + "24": 3482527744.0, + "25": 3482527744.0, + "26": 3482527744.0, + "27": 3482527744.0, + "28": 3482527744.0, + "29": 3482527744.0, + "30": 3482527744.0, + "31": 3482527744.0, + "32": 3482527744.0, + "33": 3482527744.0, + "34": 3482527744.0, + "35": 3482527744.0, + "36": 3482527744.0, + "37": 3482527744.0, + "38": 3482527744.0, + "39": 3482527744.0, + "40": 3482527744.0, + "41": 3482527744.0, + "42": 3482527744.0, + "43": 3482527744.0, + "44": 3495770112.0, + "45": 3495770112.0, + "46": 3495770112.0, + "47": 3495770112.0, + "48": 3495770112.0, + "49": 3495770112.0, + "50": 3495770112.0, + "51": 3495770112.0, + "52": 3495770112.0, + "53": 3495770112.0, + "54": 3495770112.0, + "55": 3495770112.0, + "56": 3495770112.0, + "57": 3505988608.0, + "58": 3505988608.0, + "59": 3505988608.0, + "60": 3505988608.0, + "61": 3505988608.0, + "62": 3505988608.0, + "63": 3505988608.0, + "64": 3505988608.0, + "65": 3505988608.0, + "66": 3505988608.0, + "67": 3505988608.0, + "68": 3505988608.0, + "69": 3505988608.0, + "70": 3505988608.0, + "71": 3505988608.0, + "72": 3505988608.0, + "73": 3505988608.0, + "74": 3505988608.0, + "75": 3505988608.0, + "76": 3505988608.0, + "77": 3505988608.0, + "78": 3505988608.0, + "79": 3505988608.0, + "80": 3505988608.0, + "81": 3505988608.0, + "82": 3505988608.0, + "83": 3505988608.0, + "84": 3505988608.0, + 
"85": 3505988608.0, + "86": 3505988608.0, + "87": 3505988608.0, + "88": 3505988608.0, + "89": 3505988608.0, + "90": 3505988608.0, + "91": 3505988608.0, + "92": 3505988608.0, + "93": 3505988608.0, + "94": 3505988608.0, + "95": 3505988608.0, + "96": 3505988608.0, + "97": 3505988608.0, + "98": 3505988608.0, + "99": 3505988608.0, + "100": 3505988608.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": 4.71174, + "3": 0.47502, + "4": 0.44931, + "5": 0.44277, + "6": 0.44844, + "7": 0.45785, + "8": 0.44209, + "9": 0.43757, + "10": 0.42772, + "11": 0.44315, + "12": 0.42725, + "13": 0.42666, + "14": 0.41928, + "15": 0.42831, + "16": 0.42799, + "17": 0.42051, + "18": 0.41469, + "19": 0.41876, + "20": 0.41842, + "21": 0.43095, + "22": 0.41003, + "23": 0.41066, + "24": 0.41091, + "25": 0.40849, + "26": 0.4098, + "27": 0.41447, + "28": 0.4098, + "29": 0.40395, + "30": 0.41016, + "31": 0.41347, + "32": 0.40916, + "33": 0.41299, + "34": 0.40596, + "35": 0.40696, + "36": 0.40868, + "37": 0.40718, + "38": 0.40736, + "39": 0.40604, + "40": 0.40127, + "41": 0.4, + "42": 0.40197, + "43": 0.40902, + "44": 0.40712, + "45": 0.4098, + "46": 0.40168, + "47": 0.40487, + "48": 0.40622, + "49": 0.4089, + "50": 0.40406, + "51": 0.41118, + "52": 0.40412, + "53": 0.40027, + "54": 0.40192, + "55": 0.39782, + "56": 0.39731, + "57": 0.39836, + "58": 0.40128, + "59": 0.39958, + "60": 0.39863, + "61": 0.78712, + "62": 0.39887, + "63": 0.39967, + "64": 0.40024, + "65": 0.39891, + "66": 0.40058, + "67": 0.80982, + "68": 0.39889, + "69": 0.39895, + "70": 0.40201, + "71": 0.39871, + "72": 0.39819, + "73": 0.40638, + "74": 0.40241, + "75": 0.39867, + "76": 0.40192, + "77": 0.4032, + "78": 0.39871, + "79": 0.96252, + "80": 0.39811, + "81": 0.40176, + "82": 0.39856, + "83": 0.40217, + "84": 0.3966, + "85": 0.40212, + "86": 0.40144, + "87": 0.39779, + "88": 0.3989, + "89": 0.39982, + "90": 0.40291, + "91": 0.40052, + "92": 0.39772, + 
"93": 0.40147, + "94": 0.40072, + "95": 0.40007, + "96": 0.40232, + "97": 0.40777, + "98": 0.4002, + "99": 0.39995, + "100": 0.39879 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgx_gb200.json new file mode 100644 index 00000000000..64e256f8b57 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgx_gb200.json @@ -0,0 +1,344 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 11.02499, + "2": 11.05412, + "3": 10.03918, + "4": 9.80215, + "5": 13.60005, + "6": 8.54454, + "7": 9.77444, + "8": 8.35233, + "9": 7.88788, + "10": 7.14039, + "11": 9.06955, + "12": 9.20099, + "13": 8.15607, + "14": 8.36221, + "15": 8.43013, + "16": 8.48001, + "17": 8.52462, + "18": 7.90076, + "19": 8.35376, + "20": 7.90482, + "21": 8.17608, + "22": 7.55176, + "23": 8.27889, + "24": 7.65732, + "25": 8.43063, + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan" + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 47165208.0, + "2": 46898016.0, + "3": 134483200.0, + "4": 98911144.0, + "5": 405652352.0, + "6": 510869088.0, + "7": 779007104.0, + "8": 538338816.0, + "9": 365348032.0, + "10": 620817088.0, + "11": 502895552.0, + "12": 572091776.0, + "13": 714972800.0, + "14": 748130816.0, + "15": 709938432.0, + "16": 691583488.0, + "17": 963168256.0, + 
"18": 953453952.0, + "19": 713408000.0, + "20": 919014656.0, + "21": 899637952.0, + "22": 688944512.0, + "23": 856034560.0, + "24": 858768064.0, + "25": 818025472.0, + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan" + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 5283603968.0, + "2": 5283808768.0, + "3": 5283504640.0, + "4": 5283707392.0, + "5": 5283910144.0, + "6": 5284112896.0, + "7": 5284315648.0, + "8": 5284518400.0, + "9": 5284721152.0, + "10": 5284923904.0, + "11": 5285126656.0, + "12": 5285329408.0, + "13": 5285532160.0, + "14": 5285734912.0, + "15": 5285937664.0, + "16": 5286140416.0, + "17": 5286343168.0, + "18": 5286545920.0, + "19": 5286748672.0, + "20": 5286951424.0, + "21": 5287154176.0, + "22": 5287356928.0, + "23": 5287559680.0, + "24": 5287762432.0, + "25": 5287965184.0, + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan" + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 5283606528.0, + "2": 8265524736.0, + "3": 8265524736.0, + "4": 8265524736.0, + "5": 8265524736.0, + "6": 8265524736.0, + "7": 8265524736.0, + "8": 8265524736.0, + "9": 8265524736.0, + "10": 8276976128.0, + "11": 8276976128.0, + "12": 8276976128.0, + "13": 8276976128.0, + "14": 8276976128.0, + "15": 8276976128.0, + 
"16": 8276976128.0, + "17": 8276976128.0, + "18": 8276976128.0, + "19": 8276976128.0, + "20": 8276976128.0, + "21": 8276976128.0, + "22": 8285769216.0, + "23": 8285769216.0, + "24": 8285769216.0, + "25": 8285769216.0, + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan" + } + }, + "mtp_1 loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 11.03308, + "2": 11.06518, + "3": 10.81759, + "4": 10.53818, + "5": 10.71366, + "6": 8.61304, + "7": 10.1377, + "8": 8.29237, + "9": 7.71086, + "10": 6.91516, + "11": 9.19783, + "12": 9.26769, + "13": 8.06484, + "14": 8.2784, + "15": 8.36908, + "16": 8.41495, + "17": 8.38655, + "18": 7.69044, + "19": 8.28621, + "20": 7.79896, + "21": 8.09324, + "22": 7.49223, + "23": 8.14261, + "24": 7.5863, + "25": 8.37107, + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan" + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": "nan", + "2": 25.92832, + "3": 2.8263, + "4": 1.73655, + "5": 2.05123, + "6": 2.39425, + "7": 2.15639, + "8": 0.92138, + "9": 0.94247, + "10": 1.64642, + "11": 0.96645, + "12": 0.91866, + "13": 0.94198, + "14": 0.9255, + "15": 0.95577, + "16": 0.92425, + "17": 0.94137, + "18": 0.93111, + "19": 0.89952, + "20": 0.90021, + "21": 0.91289, + "22": 0.93437, + "23": 0.96363, + "24": 1.00045, + "25": 
0.96782, + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan" + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_gb200.json new file mode 100644 index 00000000000..dc46db36c72 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_gb200.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 11.0146, + "2": 11.04991, + "3": 10.14357, + "4": 9.67761, + "5": 9.94473, + "6": 9.95632, + "7": 9.92367, + "8": 8.83165, + "9": 8.42103, + "10": 7.83364, + "11": 10.81778, + "12": 10.35014, + "13": 8.66833, + "14": 9.13058, + "15": 9.2484, + "16": 9.32149, + "17": 9.20232, + "18": 8.73719, + "19": 9.32726, + "20": 8.88552, + "21": 9.10111, + "22": 8.53259, + "23": 8.96918, + "24": 8.67428, + "25": 9.1617, + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan" + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 47167824.0, + "2": 46900664.0, + 
"3": 96750344.0, + "4": 246765024.0, + "5": 817718912.0, + "6": 592653248.0, + "7": 1018015232.0, + "8": 657925824.0, + "9": 711406848.0, + "10": 576785472.0, + "11": 704207488.0, + "12": 619264576.0, + "13": 718118144.0, + "14": 656897024.0, + "15": 621830912.0, + "16": 729345984.0, + "17": 831063744.0, + "18": 1025804096.0, + "19": 832938368.0, + "20": 1003945088.0, + "21": 830430208.0, + "22": 846188736.0, + "23": 1035339456.0, + "24": 1003472384.0, + "25": 1019352320.0, + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan" + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4313446912.0, + "2": 4313448448.0, + "3": 4313448448.0, + "4": 4313448448.0, + "5": 4313448448.0, + "6": 4313448448.0, + "7": 4313448448.0, + "8": 4313448448.0, + "9": 4313448448.0, + "10": 4313448448.0, + "11": 4313448448.0, + "12": 4313448448.0, + "13": 4313448448.0, + "14": 4313448448.0, + "15": 4313448448.0, + "16": 4313448448.0, + "17": 4313448448.0, + "18": 4313448448.0, + "19": 4313448448.0, + "20": 4313448448.0, + "21": 4313448448.0, + "22": 4313448448.0, + "23": 4313448448.0, + "24": 4313448448.0, + "25": 4313448448.0, + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan" + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 
4313449472.0, + "2": 7058323968.0, + "3": 7093507072.0, + "4": 7103545856.0, + "5": 7103545856.0, + "6": 7103545856.0, + "7": 7103545856.0, + "8": 7103545856.0, + "9": 7103545856.0, + "10": 7103545856.0, + "11": 7105847296.0, + "12": 7107386368.0, + "13": 7107386368.0, + "14": 7107386368.0, + "15": 7107386368.0, + "16": 7107386368.0, + "17": 7107386368.0, + "18": 7107386368.0, + "19": 7107386368.0, + "20": 7107386368.0, + "21": 7107386368.0, + "22": 7108604416.0, + "23": 7108922368.0, + "24": 7109204992.0, + "25": 7109204992.0, + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan" + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": "nan", + "2": 26.62047, + "3": 1.74298, + "4": 2.19744, + "5": 2.54081, + "6": 0.94207, + "7": 2.41564, + "8": 0.89613, + "9": 0.88788, + "10": 0.90363, + "11": 2.30063, + "12": 0.89466, + "13": 0.87273, + "14": 2.31557, + "15": 0.91663, + "16": 0.87731, + "17": 0.89596, + "18": 0.87486, + "19": 0.87795, + "20": 0.87855, + "21": 0.88064, + "22": 0.88881, + "23": 0.88358, + "24": 0.88347, + "25": 0.88411, + "26": "nan", + "27": "nan", + "28": "nan", + "29": "nan", + "30": "nan", + "31": "nan", + "32": "nan", + "33": "nan", + "34": "nan", + "35": "nan", + "36": "nan", + "37": "nan", + "38": "nan", + "39": "nan", + "40": "nan", + "41": "nan", + "42": "nan", + "43": "nan", + "44": "nan", + "45": "nan", + "46": "nan", + "47": "nan", + "48": "nan", + "49": "nan", + "50": "nan" + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_gb200.json 
b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_gb200.json new file mode 100644 index 00000000000..a35a7574e59 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_gb200.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.81233, + "2": 10.82416, + "3": 10.81841, + "4": 10.81357, + "5": 10.85116, + "6": 10.85502, + "7": 10.84363, + "8": 10.83621, + "9": 10.84178, + "10": 10.77391, + "11": 10.86217, + "12": 10.84672, + "13": 10.85692, + "14": 10.8614, + "15": 10.80709, + "16": 10.78544, + "17": 10.7701, + "18": 10.79072, + "19": 10.78529, + "20": 10.71496, + "21": 10.67362, + "22": 10.5386, + "23": 10.69608, + "24": 10.58118, + "25": 10.52212, + "26": 10.58665, + "27": 10.60344, + "28": 10.5676, + "29": 10.5868, + "30": 10.36177, + "31": 10.09661, + "32": 10.45911, + "33": 10.45926, + "34": 10.21524, + "35": 10.2617, + "36": 10.22327, + "37": 10.35631, + "38": 10.20637, + "39": 10.40825, + "40": 10.08881, + "41": 10.13871, + "42": 10.22236, + "43": 9.82978, + "44": 9.96931, + "45": 9.83925, + "46": 9.81008, + "47": 10.16408, + "48": 9.84608, + "49": 9.53674, + "50": 9.91754, + "51": 9.86341, + "52": 9.74862, + "53": 10.08034, + "54": 9.96286, + "55": 9.89221, + "56": 9.64295, + "57": 9.48196, + "58": 9.85327, + "59": 9.58985, + "60": 9.5157, + "61": 9.70142, + "62": 10.01153, + "63": 9.40557, + "64": 9.78559, + "65": 8.96047, + "66": 9.72678, + "67": 9.38244, + "68": 9.79903, + "69": 9.81114, + "70": 9.74788, + "71": 9.6452, + "72": 9.6027, + "73": 9.51692, + "74": 8.95583, + "75": 9.43449, + "76": 9.10005, + "77": 10.07816, + "78": 9.72912, + "79": 9.39357, + "80": 9.41584, + "81": 9.49174, + "82": 9.71087, + "83": 9.32591, + "84": 9.42272, + "85": 9.62054, + "86": 9.08096, + "87": 9.59797, + "88": 9.7551, + "89": 
9.6096, + "90": 9.83264, + "91": 9.34163, + "92": 9.3578, + "93": 9.09025, + "94": 8.83205, + "95": 9.52868, + "96": 9.5278, + "97": 9.30277, + "98": 9.66393, + "99": 8.89773, + "100": 9.404 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 5532.0, + "2": 5934.0, + "3": 5812.0, + "4": 5817.0, + "5": 6435.0, + "6": 6641.0, + "7": 5880.0, + "8": 5900.0, + "9": 6317.0, + "10": 5314.0, + "11": 6659.0, + "12": 6393.0, + "13": 6585.0, + "14": 6649.0, + "15": 6237.0, + "16": 6606.0, + "17": 6232.0, + "18": 6059.0, + "19": 6380.0, + "20": 5723.0, + "21": 6197.0, + "22": 5714.0, + "23": 6527.0, + "24": 5948.0, + "25": 5822.0, + "26": 6271.0, + "27": 6493.0, + "28": 6789.0, + "29": 6971.0, + "30": 6252.0, + "31": 5836.0, + "32": 6830.0, + "33": 7155.0, + "34": 6428.0, + "35": 6909.0, + "36": 6559.0, + "37": 7582.0, + "38": 7325.0, + "39": 8189.0, + "40": 7156.0, + "41": 7113.0, + "42": 7783.0, + "43": 7236.0, + "44": 6958.0, + "45": 7093.0, + "46": 7385.0, + "47": 7634.0, + "48": 7916.0, + "49": 7565.0, + "50": 7795.0, + "51": 7967.0, + "52": 7869.0, + "53": 9001.0, + "54": 8408.0, + "55": 7734.0, + "56": 8108.0, + "57": 7339.0, + "58": 8677.0, + "59": 8299.0, + "60": 7790.0, + "61": 8347.0, + "62": 8345.0, + "63": 7835.0, + "64": 8861.0, + "65": 8293.0, + "66": 9180.0, + "67": 8276.0, + "68": 8251.0, + "69": 8666.0, + "70": 9836.0, + "71": 9020.0, + "72": 8503.0, + "73": 8996.0, + "74": 6967.0, + "75": 7749.0, + "76": 8534.0, + "77": 10688.0, + "78": 48163.0, + "79": 9603.0, + "80": 9991.0, + "81": 55995.0, + "82": 9533.0, + "83": 65535.0, + "84": 9876.0, + "85": 15848.0, + "86": 8732.0, + "87": 10574.0, + "88": 12165.0, + "89": 9808.0, + "90": 9644.0, + "91": 8584.0, + "92": 9300.0, + "93": 8081.0, + "94": 9101.0, + "95": 9919.0, + "96": 9755.0, + "97": 11113.0, + "98": 10522.0, + "99": 8739.0, + "100": 9616.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": 
{ + "1": 628064256.0, + "2": 628065280.0, + "3": 628065280.0, + "4": 628065280.0, + "5": 628065280.0, + "6": 628065280.0, + "7": 628065280.0, + "8": 628065280.0, + "9": 628065280.0, + "10": 628065280.0, + "11": 628065280.0, + "12": 628065280.0, + "13": 628065280.0, + "14": 628065280.0, + "15": 628065280.0, + "16": 628065280.0, + "17": 628065280.0, + "18": 628065280.0, + "19": 628065280.0, + "20": 628065280.0, + "21": 628065280.0, + "22": 628065280.0, + "23": 628065280.0, + "24": 628065280.0, + "25": 628065280.0, + "26": 628065280.0, + "27": 628065280.0, + "28": 628065280.0, + "29": 628065280.0, + "30": 628065280.0, + "31": 628065280.0, + "32": 628065280.0, + "33": 628065280.0, + "34": 628065280.0, + "35": 628065280.0, + "36": 628065280.0, + "37": 628065280.0, + "38": 628065280.0, + "39": 628065280.0, + "40": 628065280.0, + "41": 628065280.0, + "42": 628065280.0, + "43": 628065280.0, + "44": 628065280.0, + "45": 628065280.0, + "46": 628065280.0, + "47": 628065280.0, + "48": 628065280.0, + "49": 628065280.0, + "50": 628065280.0, + "51": 628065280.0, + "52": 628065280.0, + "53": 628065280.0, + "54": 628065280.0, + "55": 628065280.0, + "56": 628065280.0, + "57": 628065280.0, + "58": 628065280.0, + "59": 628065280.0, + "60": 628065280.0, + "61": 628065280.0, + "62": 628065280.0, + "63": 628065280.0, + "64": 628065280.0, + "65": 628065280.0, + "66": 628065280.0, + "67": 628065280.0, + "68": 628065280.0, + "69": 628065280.0, + "70": 628065280.0, + "71": 628065280.0, + "72": 628065280.0, + "73": 628065280.0, + "74": 628065280.0, + "75": 628065280.0, + "76": 628065280.0, + "77": 628065280.0, + "78": 628065280.0, + "79": 628065280.0, + "80": 628065280.0, + "81": 628065280.0, + "82": 628065280.0, + "83": 628065280.0, + "84": 628065280.0, + "85": 628065280.0, + "86": 628065280.0, + "87": 628065280.0, + "88": 628065280.0, + "89": 628065280.0, + "90": 628065280.0, + "91": 628065280.0, + "92": 628065280.0, + "93": 628065280.0, + "94": 628065280.0, + "95": 628065280.0, + "96": 
628065280.0, + "97": 628065280.0, + "98": 628065280.0, + "99": 628065280.0, + "100": 628065280.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 982504960.0, + "2": 1156255744.0, + "3": 1156255744.0, + "4": 1156255744.0, + "5": 1156255744.0, + "6": 1156255744.0, + "7": 1156255744.0, + "8": 1156255744.0, + "9": 1156255744.0, + "10": 1156255744.0, + "11": 1156255744.0, + "12": 1156255744.0, + "13": 1156255744.0, + "14": 1156255744.0, + "15": 1156255744.0, + "16": 1156255744.0, + "17": 1156255744.0, + "18": 1156255744.0, + "19": 1156255744.0, + "20": 1156255744.0, + "21": 1156255744.0, + "22": 1156255744.0, + "23": 1156255744.0, + "24": 1156255744.0, + "25": 1156255744.0, + "26": 1156255744.0, + "27": 1157233664.0, + "28": 1157233664.0, + "29": 1157233664.0, + "30": 1157233664.0, + "31": 1157233664.0, + "32": 1157233664.0, + "33": 1157233664.0, + "34": 1157233664.0, + "35": 1157233664.0, + "36": 1157233664.0, + "37": 1157233664.0, + "38": 1157233664.0, + "39": 1157233664.0, + "40": 1157233664.0, + "41": 1158865408.0, + "42": 1158865408.0, + "43": 1158865408.0, + "44": 1158865408.0, + "45": 1158865408.0, + "46": 1158865408.0, + "47": 1158865408.0, + "48": 1158865408.0, + "49": 1158865408.0, + "50": 1158865408.0, + "51": 1158865408.0, + "52": 1158865408.0, + "53": 1158865408.0, + "54": 1158865408.0, + "55": 1159034368.0, + "56": 1159063040.0, + "57": 1159542784.0, + "58": 1159542784.0, + "59": 1159542784.0, + "60": 1159542784.0, + "61": 1165075456.0, + "62": 1165075456.0, + "63": 1165075456.0, + "64": 1165075456.0, + "65": 1165075456.0, + "66": 1165075456.0, + "67": 1165075456.0, + "68": 1165075456.0, + "69": 1165075456.0, + "70": 1165075456.0, + "71": 1165075456.0, + "72": 1165075456.0, + "73": 1165075456.0, + "74": 1165075456.0, + "75": 1165075456.0, + "76": 1166216192.0, + "77": 1166216192.0, + "78": 1166216192.0, + "79": 1166216192.0, + "80": 1166216192.0, + "81": 1166216192.0, + "82": 
1166216192.0, + "83": 1166639104.0, + "84": 1166639104.0, + "85": 1166639104.0, + "86": 1166639104.0, + "87": 1166639104.0, + "88": 1166639104.0, + "89": 1166639104.0, + "90": 1166639104.0, + "91": 1166639104.0, + "92": 1166639104.0, + "93": 1166639104.0, + "94": 1166639104.0, + "95": 1166639104.0, + "96": 1166639104.0, + "97": 1166639104.0, + "98": 1166639104.0, + "99": 1166639104.0, + "100": 1166639104.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": 7.77547, + "3": 1.01252, + "4": 1.00639, + "5": 0.9897, + "6": 0.99553, + "7": 0.99796, + "8": 1.00873, + "9": 0.99009, + "10": 0.99264, + "11": 0.98765, + "12": 0.99024, + "13": 0.98319, + "14": 0.98552, + "15": 0.99368, + "16": 0.98342, + "17": 0.97729, + "18": 0.97272, + "19": 0.97308, + "20": 0.96906, + "21": 0.9751, + "22": 0.97375, + "23": 0.97447, + "24": 0.98494, + "25": 0.9779, + "26": 1.30939, + "27": 0.9766, + "28": 0.9856, + "29": 0.99223, + "30": 1.27178, + "31": 0.98025, + "32": 1.22425, + "33": 1.27653, + "34": 0.99358, + "35": 1.00171, + "36": 1.25408, + "37": 1.60005, + "38": 1.00572, + "39": 0.98676, + "40": 0.97218, + "41": 1.30266, + "42": 1.29066, + "43": 0.99057, + "44": 0.98517, + "45": 0.97968, + "46": 0.97289, + "47": 0.98145, + "48": 0.9804, + "49": 0.98022, + "50": 0.97431, + "51": 0.97593, + "52": 0.97255, + "53": 0.97424, + "54": 0.97043, + "55": 0.96887, + "56": 0.97492, + "57": 0.97623, + "58": 0.97423, + "59": 0.98879, + "60": 0.97992, + "61": 0.97895, + "62": 0.98829, + "63": 0.98719, + "64": 0.98651, + "65": 0.97852, + "66": 0.98045, + "67": 0.97825, + "68": 0.9795, + "69": 0.97812, + "70": 0.96297, + "71": 0.96718, + "72": 0.98343, + "73": 0.978, + "74": 0.99341, + "75": 0.97768, + "76": 0.97508, + "77": 0.97891, + "78": 0.9739, + "79": 0.96825, + "80": 0.96595, + "81": 0.95551, + "82": 0.97223, + "83": 0.9633, + "84": 0.96539, + "85": 0.97065, + "86": 0.97198, + "87": 0.97978, + "88": 0.98268, + "89": 
0.99894, + "90": 1.00246, + "91": 0.98763, + "92": 0.98552, + "93": 0.99698, + "94": 0.99827, + "95": 0.99936, + "96": 0.99295, + "97": 0.99144, + "98": 0.99227, + "99": 0.98859, + "100": 0.99158 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/golden_values_dev_dgx_gb200.json new file mode 100644 index 00000000000..8710366a4a2 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/golden_values_dev_dgx_gb200.json @@ -0,0 +1,644 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.94839, + "2": 10.94024, + "3": 10.95902, + "4": 10.9592, + "5": 10.93942, + "6": 10.95284, + "7": 10.95227, + "8": 10.93987, + "9": 10.94518, + "10": 10.94146, + "11": 10.94366, + "12": 10.93351, + "13": 10.92937, + "14": 10.93117, + "15": 10.87714, + "16": 10.88218, + "17": 10.87388, + "18": 10.86829, + "19": 10.86292, + "20": 10.78627, + "21": 10.73278, + "22": 10.62202, + "23": 10.72355, + "24": 10.61784, + "25": 10.54739, + "26": 10.64163, + "27": 10.63354, + "28": 10.59007, + "29": 10.59937, + "30": 10.36921, + "31": 10.1175, + "32": 10.457, + "33": 10.45238, + "34": 10.18943, + "35": 10.24409, + "36": 10.20779, + "37": 10.32099, + "38": 10.17141, + "39": 10.39579, + "40": 10.03318, + "41": 10.08573, + "42": 10.17487, + "43": 9.7274, + "44": 9.88257, + "45": 9.73978, + "46": 9.72104, + "47": 10.08354, + "48": 9.75251, + "49": 9.39373, + "50": 9.83765, + "51": 9.76236, + "52": 9.65444, + "53": 10.01594, + "54": 9.86969, + "55": 9.79645, + "56": 9.53492, + "57": 9.365, + "58": 9.75243, + "59": 9.4751, + "60": 9.40362, + "61": 9.59124, + "62": 9.91012, + "63": 9.24082, + "64": 9.67728, + "65": 8.79731, + "66": 9.60544, + "67": 9.24212, + "68": 9.70475, + "69": 
9.70741, + "70": 9.65988, + "71": 9.50626, + "72": 9.45834, + "73": 9.38692, + "74": 8.79461, + "75": 9.32175, + "76": 8.92857, + "77": 9.99456, + "78": 9.63104, + "79": 9.26692, + "80": 9.29144, + "81": 9.37768, + "82": 9.60984, + "83": 9.21108, + "84": 9.33667, + "85": 9.52726, + "86": 8.94539, + "87": 9.49937, + "88": 9.67766, + "89": 9.49525, + "90": 9.7509, + "91": 9.22918, + "92": 9.25394, + "93": 8.96194, + "94": 8.69021, + "95": 9.43531, + "96": 9.39886, + "97": 9.19199, + "98": 9.57248, + "99": 8.75688, + "100": 9.2924 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 22750392.0, + "2": 22953048.0, + "3": 22604332.0, + "4": 23266194.0, + "5": 22735592.0, + "6": 23061740.0, + "7": 22793278.0, + "8": 22960884.0, + "9": 22865532.0, + "10": 22950250.0, + "11": 22499592.0, + "12": 22455936.0, + "13": 22948014.0, + "14": 22384528.0, + "15": 22846172.0, + "16": 22856720.0, + "17": 22836352.0, + "18": 22590198.0, + "19": 22627006.0, + "20": 22712338.0, + "21": 22762590.0, + "22": 22816896.0, + "23": 22545280.0, + "24": 22794372.0, + "25": 22841964.0, + "26": 22549700.0, + "27": 22464724.0, + "28": 22453634.0, + "29": 22534620.0, + "30": 22636106.0, + "31": 22989484.0, + "32": 22593994.0, + "33": 22565948.0, + "34": 22855396.0, + "35": 22813606.0, + "36": 22595412.0, + "37": 22499234.0, + "38": 22926180.0, + "39": 22825052.0, + "40": 22675880.0, + "41": 22671624.0, + "42": 22682188.0, + "43": 23015228.0, + "44": 22766040.0, + "45": 22679588.0, + "46": 22915144.0, + "47": 22642744.0, + "48": 24003236.0, + "49": 23786618.0, + "50": 22931756.0, + "51": 23866290.0, + "52": 23807188.0, + "53": 24007482.0, + "54": 23916892.0, + "55": 23571308.0, + "56": 23954192.0, + "57": 24211600.0, + "58": 23914524.0, + "59": 23771900.0, + "60": 23813638.0, + "61": 23795512.0, + "62": 23739412.0, + "63": 23917700.0, + "64": 23895780.0, + "65": 24147262.0, + "66": 23794750.0, + "67": 23983810.0, + "68": 23674060.0, + "69": 
23647770.0, + "70": 23907338.0, + "71": 23818256.0, + "72": 23723392.0, + "73": 22754048.0, + "74": 25181258.0, + "75": 24144968.0, + "76": 23976372.0, + "77": 22260516.0, + "78": 23862138.0, + "79": 23806872.0, + "80": 23773052.0, + "81": 25020468.0, + "82": 22812998.0, + "83": 23911992.0, + "84": 25144028.0, + "85": 22725432.0, + "86": 24205484.0, + "87": 24851672.0, + "88": 23700260.0, + "89": 22505492.0, + "90": 24062928.0, + "91": 22790310.0, + "92": 24923596.0, + "93": 23722104.0, + "94": 23993086.0, + "95": 24140048.0, + "96": 23909352.0, + "97": 23668280.0, + "98": 23832272.0, + "99": 23985032.0, + "100": 24101560.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 810140160.0, + "2": 804531200.0, + "3": 804531200.0, + "4": 934860800.0, + "5": 934860800.0, + "6": 934860800.0, + "7": 934860800.0, + "8": 934860800.0, + "9": 938611712.0, + "10": 938104832.0, + "11": 938379264.0, + "12": 934860800.0, + "13": 934860800.0, + "14": 934860800.0, + "15": 934860800.0, + "16": 942249984.0, + "17": 941443072.0, + "18": 937990144.0, + "19": 937548800.0, + "20": 937498624.0, + "21": 934860800.0, + "22": 934860800.0, + "23": 941533184.0, + "24": 942114816.0, + "25": 942398464.0, + "26": 934860800.0, + "27": 934860800.0, + "28": 934860800.0, + "29": 934860800.0, + "30": 934860800.0, + "31": 934860800.0, + "32": 934860800.0, + "33": 934860800.0, + "34": 941477888.0, + "35": 934860800.0, + "36": 934860800.0, + "37": 934860800.0, + "38": 934860800.0, + "39": 934860800.0, + "40": 934860800.0, + "41": 940742656.0, + "42": 940742656.0, + "43": 940742656.0, + "44": 940968960.0, + "45": 941581312.0, + "46": 934860800.0, + "47": 934860800.0, + "48": 940742656.0, + "49": 934860800.0, + "50": 934860800.0, + "51": 934860800.0, + "52": 940742656.0, + "53": 937498624.0, + "54": 937498624.0, + "55": 937498624.0, + "56": 937498624.0, + "57": 938199040.0, + "58": 937498624.0, + "59": 937498624.0, + "60": 940742656.0, + 
"61": 934860800.0, + "62": 934860800.0, + "63": 934860800.0, + "64": 934860800.0, + "65": 934860800.0, + "66": 934860800.0, + "67": 934860800.0, + "68": 934860800.0, + "69": 934860800.0, + "70": 934860800.0, + "71": 934860800.0, + "72": 934860800.0, + "73": 934860800.0, + "74": 934860800.0, + "75": 934860800.0, + "76": 934860800.0, + "77": 934860800.0, + "78": 934860800.0, + "79": 938199040.0, + "80": 938199040.0, + "81": 937498624.0, + "82": 938061824.0, + "83": 938412032.0, + "84": 937498624.0, + "85": 938199040.0, + "86": 938445824.0, + "87": 937498624.0, + "88": 937498624.0, + "89": 934860800.0, + "90": 934860800.0, + "91": 934860800.0, + "92": 940742656.0, + "93": 940742656.0, + "94": 938104832.0, + "95": 941451264.0, + "96": 940742656.0, + "97": 941542400.0, + "98": 938104832.0, + "99": 940742656.0, + "100": 938104832.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1017976320.0, + "2": 1226964480.0, + "3": 1228012032.0, + "4": 1300063744.0, + "5": 1300063744.0, + "6": 1300223488.0, + "7": 1300891648.0, + "8": 1300891648.0, + "9": 1300891648.0, + "10": 1303292416.0, + "11": 1303292416.0, + "12": 1303292416.0, + "13": 1303292416.0, + "14": 1303292416.0, + "15": 1303292416.0, + "16": 1303292416.0, + "17": 1303292416.0, + "18": 1303292416.0, + "19": 1303292416.0, + "20": 1303292416.0, + "21": 1303292416.0, + "22": 1303292416.0, + "23": 1303292416.0, + "24": 1303292416.0, + "25": 1303292416.0, + "26": 1303292416.0, + "27": 1303292416.0, + "28": 1303292416.0, + "29": 1303292416.0, + "30": 1303292416.0, + "31": 1303292416.0, + "32": 1303292416.0, + "33": 1303292416.0, + "34": 1303292416.0, + "35": 1303292416.0, + "36": 1303292416.0, + "37": 1303292416.0, + "38": 1303292416.0, + "39": 1303292416.0, + "40": 1303292416.0, + "41": 1303292416.0, + "42": 1303292416.0, + "43": 1303292416.0, + "44": 1303292416.0, + "45": 1303292416.0, + "46": 1303292416.0, + "47": 1303292416.0, + "48": 1303292416.0, 
+ "49": 1303292416.0, + "50": 1303292416.0, + "51": 1303292416.0, + "52": 1303292416.0, + "53": 1303292416.0, + "54": 1303292416.0, + "55": 1303292416.0, + "56": 1303292416.0, + "57": 1303292416.0, + "58": 1303292416.0, + "59": 1303292416.0, + "60": 1303292416.0, + "61": 1303292416.0, + "62": 1303292416.0, + "63": 1303292416.0, + "64": 1303292416.0, + "65": 1303292416.0, + "66": 1303292416.0, + "67": 1303292416.0, + "68": 1303292416.0, + "69": 1303292416.0, + "70": 1303292416.0, + "71": 1303292416.0, + "72": 1303292416.0, + "73": 1303292416.0, + "74": 1303292416.0, + "75": 1303292416.0, + "76": 1303292416.0, + "77": 1303292416.0, + "78": 1303292416.0, + "79": 1303292416.0, + "80": 1303292416.0, + "81": 1303292416.0, + "82": 1303292416.0, + "83": 1303292416.0, + "84": 1303292416.0, + "85": 1303292416.0, + "86": 1303292416.0, + "87": 1303292416.0, + "88": 1303292416.0, + "89": 1303292416.0, + "90": 1303292416.0, + "91": 1303292416.0, + "92": 1303292416.0, + "93": 1303292416.0, + "94": 1303292416.0, + "95": 1303292416.0, + "96": 1303292416.0, + "97": 1303292416.0, + "98": 1303292416.0, + "99": 1303292416.0, + "100": 1303292416.0 + } + }, + "mtp_1 loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.89507, + "2": 10.90521, + "3": 10.90879, + "4": 10.86406, + "5": 10.91765, + "6": 10.92332, + "7": 10.90072, + "8": 10.8906, + "9": 10.90544, + "10": 10.88636, + "11": 10.93328, + "12": 10.91582, + "13": 10.90917, + "14": 10.92294, + "15": 10.89802, + "16": 10.90337, + "17": 10.88446, + "18": 10.90526, + "19": 10.90011, + "20": 10.88775, + "21": 10.88103, + "22": 10.85514, + "23": 10.89267, + "24": 10.87352, + "25": 10.86182, + "26": 10.87152, + "27": 10.88847, + "28": 10.87872, + "29": 10.88744, + "30": 10.87297, + "31": 10.80177, + "32": 10.8732, + "33": 10.88219, + "34": 10.83823, + "35": 10.85291, + "36": 10.84901, + "37": 10.85873, + "38": 10.83148, + "39": 10.86289, + "40": 10.82147, + "41": 10.82913, + "42": 10.84798, + "43": 
10.7908, + "44": 10.81431, + "45": 10.7879, + "46": 10.78018, + "47": 10.83142, + "48": 10.78854, + "49": 10.71024, + "50": 10.76861, + "51": 10.76331, + "52": 10.73945, + "53": 10.80126, + "54": 10.77704, + "55": 10.765, + "56": 10.71649, + "57": 10.67368, + "58": 10.75107, + "59": 10.69607, + "60": 10.66236, + "61": 10.69617, + "62": 10.77167, + "63": 10.6134, + "64": 10.70881, + "65": 10.49259, + "66": 10.66843, + "67": 10.58084, + "68": 10.68215, + "69": 10.68669, + "70": 10.67296, + "71": 10.64397, + "72": 10.60997, + "73": 10.56734, + "74": 10.38624, + "75": 10.53623, + "76": 10.40297, + "77": 10.75436, + "78": 10.62548, + "79": 10.47858, + "80": 10.47388, + "81": 10.5143, + "82": 10.58579, + "83": 10.43913, + "84": 10.45418, + "85": 10.55042, + "86": 10.27831, + "87": 10.51067, + "88": 10.60469, + "89": 10.5084, + "90": 10.60243, + "91": 10.38487, + "92": 10.38165, + "93": 10.23549, + "94": 10.07844, + "95": 10.42709, + "96": 10.44697, + "97": 10.31686, + "98": 10.4968, + "99": 10.04966, + "100": 10.32944 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": 19.93451, + "3": 2.31445, + "4": 5.28856, + "5": 1.09994, + "6": 1.09399, + "7": 1.09697, + "8": 1.09872, + "9": 1.17005, + "10": 1.10071, + "11": 1.0994, + "12": 1.08313, + "13": 1.09364, + "14": 1.09082, + "15": 1.09269, + "16": 1.08133, + "17": 1.08872, + "18": 1.09032, + "19": 1.10458, + "20": 1.10126, + "21": 1.09029, + "22": 1.19723, + "23": 1.36303, + "24": 1.39758, + "25": 1.40863, + "26": 1.40985, + "27": 1.40231, + "28": 1.42816, + "29": 1.37678, + "30": 1.40545, + "31": 1.40841, + "32": 1.40385, + "33": 1.39528, + "34": 1.4028, + "35": 1.41768, + "36": 1.40649, + "37": 1.41813, + "38": 1.40674, + "39": 1.38881, + "40": 1.40998, + "41": 1.37775, + "42": 1.39701, + "43": 1.3967, + "44": 1.40408, + "45": 1.40972, + "46": 1.41116, + "47": 1.40427, + "48": 1.38905, + "49": 1.42541, + "50": 1.40474, + "51": 1.40708, + "52": 
1.37484, + "53": 1.38539, + "54": 1.39988, + "55": 1.39635, + "56": 1.41326, + "57": 1.40012, + "58": 1.40599, + "59": 1.41023, + "60": 1.4209, + "61": 1.41702, + "62": 1.40134, + "63": 1.40282, + "64": 1.40573, + "65": 1.41933, + "66": 1.40057, + "67": 1.41526, + "68": 1.40285, + "69": 1.41947, + "70": 1.37747, + "71": 1.41206, + "72": 1.39123, + "73": 1.42381, + "74": 1.40806, + "75": 1.40032, + "76": 1.41783, + "77": 1.39133, + "78": 1.41146, + "79": 1.42648, + "80": 1.40774, + "81": 1.40046, + "82": 1.39158, + "83": 1.4079, + "84": 1.40469, + "85": 1.39689, + "86": 1.41401, + "87": 1.40637, + "88": 1.40569, + "89": 1.45225, + "90": 1.39469, + "91": 1.39677, + "92": 1.39569, + "93": 1.38882, + "94": 1.40133, + "95": 1.41493, + "96": 1.40659, + "97": 1.39059, + "98": 1.40044, + "99": 1.41118, + "100": 1.39159 + } + } +} \ No newline at end of file diff --git a/tests/test_utils/python_scripts/recipe_parser.py b/tests/test_utils/python_scripts/recipe_parser.py index 480b2dca8ae..394bda30a01 100644 --- a/tests/test_utils/python_scripts/recipe_parser.py +++ b/tests/test_utils/python_scripts/recipe_parser.py @@ -39,14 +39,25 @@ def resolve_cluster_config(cluster: str) -> str: def flatten_products(workload_manifest: dotdict) -> dotdict: """Flattens a nested dict of products""" - workload_manifest.products = [ - dict(**dict(zip(inp.keys(), values)), **{"test_case": product["test_case"][0]}) - for product in (workload_manifest.products or []) - if "products" in product - for inp in product["products"] - for values in itertools.product(*inp.values()) - ] + flattened_products = [] + products = workload_manifest.products or [] + for product in products: + if "products" not in product: + continue + + test_case = product["test_case"][0] + for param_dict in product["products"]: + # Generate all combinations of parameter values + param_combinations = itertools.product(*param_dict.values()) + + for value_combination in param_combinations: + # Map parameter names to their values 
+ flattened = dict(zip(param_dict.keys(), value_combination)) + flattened["test_case"] = test_case + flattened_products.append(flattened) + + workload_manifest.products = flattened_products return workload_manifest @@ -223,14 +234,13 @@ def load_workloads( workloads: List[dotdict] = [] build_workloads: List = [] - for file in list(recipes_dir.glob("*.yaml")) + list(local_dir.glob("*.yaml")): + for file in list(recipes_dir.glob("**/*.yaml")) + list(local_dir.glob("**/*.yaml")): workloads += load_and_flatten(config_path=str(file)) if file.stem.startswith("_build"): build_workloads.append(load_config(config_path=str(file))) if scope: workloads = filter_by_scope(workload_manifests=workloads, scope=scope) - if workloads and environment: workloads = filter_by_environment(workload_manifests=workloads, environment=environment) diff --git a/tests/test_utils/recipes/gpt-gb200.yaml b/tests/test_utils/recipes/gb200/gpt.yaml similarity index 68% rename from tests/test_utils/recipes/gpt-gb200.yaml rename to tests/test_utils/recipes/gb200/gpt.yaml index 9c3786332c9..f387fbb9a13 100644 --- a/tests/test_utils/recipes/gpt-gb200.yaml +++ b/tests/test_utils/recipes/gb200/gpt.yaml @@ -37,7 +37,7 @@ spec: ls cd /opt/megatron-lm - NAME=$(echo {test_case}_{environment} | sed 's/dgx_h100/dgx_a100/g') + NAME=$(echo {test_case}_{environment} | sed 's/dgx_gb200/dgx_a100/g') export GPUS_PER_NODE={gpus} ARGUMENTS=( @@ -64,6 +64,9 @@ spec: exit $exit_code products: + ####################################################################### + # Nightly tests: Run DEV unless something is flaky # + ####################################################################### - test_case: [gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather] products: - environment: [dev] @@ -78,7 +81,6 @@ products: products: - environment: [dev] scope: [nightly] - platforms: [dgx_gb200] - test_case: [gpt3_mcore_tp1_pp4] products: - environment: [dev] @@ -88,7 +90,6 @@ products: products: - 
environment: [dev] scope: [nightly] - platforms: [dgx_gb200] - test_case: [gpt3_mcore_tp4_pp1_resume_torch] products: - environment: [dev] @@ -99,258 +100,324 @@ products: - environment: [dev] scope: [nightly] platforms: [dgx_gb200] + ####################################################################### + # mr, mr-github tests: Mostly DEV on mr, mr-github, and LTS on nightly cadence, except for # + # some very important tests. # + ####################################################################### - test_case: [gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer] products: - environment: [dev] - scope: [nightly] + scope: [mr, mr-github-broken] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] + # - test_case: [gpt3_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic] + # products: + # - environment: [dev] + # scope: [mr] - test_case: [gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: 
[gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp1_pp4_vp1] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] + # - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist] + # products: + # - environment: [dev] + # scope: [mr] + # platforms: [dgx_gb200] # Hangs: #513 - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce] products: - environment: [dev] - scope: [nightly] - platforms: [dgx_gb200] - - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather] - products: - - environment: [dev] - scope: [nightly] - platforms: [dgx_gb200] - - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied] - products: - - environment: [dev] - scope: [nightly] - platforms: [dgx_gb200] - - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap] - products: - - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split] 
products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp2_cp2] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp2_cp2_nondeterministic] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: 
[gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion] products: - environment: [dev] - scope: [nightly] + scope: [mr] + platforms: [dgx_gb200] + - test_case: [gpt3_mcore_te_tp2_pp1_gdn] + products: + - environment: [dev] + scope: [mr, mr-github, mr-github-slim] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp2_mla] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist] products: - environment: [dev] - scope: [nightly] - platforms: [dgx_gb200] - - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader] - products: - - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_mcore_tp2_pp2_uninstall_te] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_7b_tp1_pp4_memory_speed] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_7b_tp4_pp1_memory_speed] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: 
[gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp1_modelopt_distill_resume] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] + ####################################################################### + # Super important mr, mr-github tests that run for DEV per mr, mr-github # + ####################################################################### - test_case: [gpt3_mcore_reruns_persistent_1] products: - environment: [dev] - scope: [nightly] + scope: [mr, mr-github-broken] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer] products: - environment: [dev] - scope: [nightly] + scope: [mr, mr-github, mr-github-slim] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather] products: - environment: [dev] - scope: [nightly] + scope: [mr, mr-github, mr-github-slim] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone] products: - environment: [dev] - scope: [nightly] + scope: [mr] + platforms: [dgx_gb200] + # - test_case: [gpt3_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone] + # products: + # - environment: [dev] + # scope: [mr] + # platforms: [dgx_gb200] + # - test_case: [gpt3_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer] + # products: + # - environment: [dev] + # scope: [mr] + # platforms: [dgx_gb200] + # - test_case: [gpt3_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu] + # products: + # - environment: [dev] + # scope: [mr] + # platforms: [dgx_gb200] + # - test_case: [gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic] + 
# products: + # - environment: [dev] + # scope: [mr] + # platforms: [dgx_a100, dgx_gb200] + # - test_case: [gpt3_weekly_dgx_b200_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap] + # products: + # - environment: [dev] + # scope: [weekly] + # platforms: [dgx_b200] + # - test_case: [gpt3_weekly_dgx_b200_mcore_tp4_cp2_current_scaling_native_fp8_tp_fsdp] + # products: + # - environment: [dev] + # scope: [weekly] + # platforms: [dgx_b200] + # - test_case: [gpt3_weekly_dgx_b200_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap] + # products: + # - environment: [dev] + # scope: [weekly] + # platforms: [dgx_b200] + # - test_case: [gpt3_weekly_dgx_b200_mcore_tp4_cp2_mxfp8_tp_sp_cp] + # products: + # - environment: [dev] + # scope: [weekly] + # platforms: [dgx_b200] + # - test_case: [gpt3_weekly_dgx_b200_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap] + # products: + # - environment: [dev] + # scope: [weekly] + # platforms: [dgx_b200] + - test_case: [gpt3_weekly_dgx_gb200_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap] + products: + - environment: [dev] + scope: [weekly] + platforms: [dgx_gb200] + # - test_case: [gpt3_weekly_dgx_gb200_mcore_tp4_cp2_current_scaling_native_fp8_tp_fsdp] + # products: + # - environment: [dev] + # scope: [weekly] + # platforms: [dgx_gb200] + # - test_case: [gpt3_weekly_dgx_gb200_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap] + # products: + # - environment: [dev] + # scope: [weekly] + # platforms: [dgx_gb200] + - test_case: [gpt3_weekly_dgx_gb200_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap] + products: + - environment: [dev] + scope: [weekly] platforms: [dgx_gb200] diff --git a/tests/test_utils/recipes/gb200/moe.yaml b/tests/test_utils/recipes/gb200/moe.yaml new file mode 100644 index 00000000000..28ae2415aac --- /dev/null +++ b/tests/test_utils/recipes/gb200/moe.yaml @@ -0,0 +1,220 @@ +type: basic +format_version: 1 +maintainers: [mcore] +loggers: [stdout] +spec: + name: 
"{test_case}_{environment}_{platforms}" + model: moe + build: mcore-pyt-{environment} + nodes: 2 + gpus: 4 + n_repeat: 5 + platforms: dgx_gb200 + script_setup: | + unset https_proxy + echo "machine gitlab-master.nvidia.com login okoenig password $RO_API_TOKEN" | tee -a /root/.netrc + + # Checkout latest + cd /opt + rm -rf /opt/megatron-lm; mkdir megatron-lm; cd megatron-lm + git init + git remote add origin $MCORE_REPO + git fetch origin '+refs/merge-requests/*:refs/remotes/merge-requests/*' + git fetch origin $MCORE_MR_COMMIT + git checkout $MCORE_MR_COMMIT + git rev-parse HEAD + + # Checkout backwards-ref + cd /opt + rm -rf /opt/megatron-lm-legacy; mkdir megatron-lm-legacy; cd megatron-lm-legacy + git init + git remote add origin $MCORE_REPO + git fetch origin $MCORE_BACKWARDS_COMMIT + git checkout $MCORE_BACKWARDS_COMMIT + git rev-parse HEAD + rm -rf megatron; cp -a /opt/megatron-lm/megatron ./ + script: |- + ls + cd /opt/megatron-lm + + NAME=$(echo {test_case}_{environment} | sed 's/dgx_gb200/dgx_a100/g') + export GPUS_PER_NODE={gpus} + + ARGUMENTS=( + "DATA_PATH=/mnt/artifacts" + "DATA_CACHE_PATH=/lustre/fsw/coreai_dlalgo_mcore/mcore_ci/data/$RUN_ID/cache/" + "OUTPUT_PATH={assets_dir}" + "TENSORBOARD_PATH={assets_dir}/tensorboard" + "CHECKPOINT_SAVE_PATH={artifacts_dir}/checkpoints" + "CHECKPOINT_LOAD_PATH=/mnt/artifacts" + "TRAINING_SCRIPT_PATH=pretrain_gpt.py" + "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" + "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json" + "N_REPEAT={n_repeat}" + "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}" + "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}" + ) + + bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}} + +products: + ####################################################################### + # Nightly tests: Run both DEV and LTS unless something is flaky # + 
####################################################################### + - test_case: [gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel] + products: + - environment: [dev] + scope: [nightly] + platforms: [dgx_gb200] + - test_case: [gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel] + products: + - environment: [dev] + scope: [nightly] + platforms: [dgx_gb200] + - test_case: [gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel] + products: + - environment: [dev] + scope: [nightly] + platforms: [dgx_gb200] + - test_case: [gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last] + products: + - environment: [dev] + scope: [nightly] + platforms: [dgx_gb200] + - test_case: [gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last] + products: + - environment: [dev] + scope: [nightly] + platforms: [dgx_gb200] + - test_case: [gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic] + products: + - environment: [dev] + scope: [nightly] + platforms: [dgx_gb200] + - test_case: [gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last] + products: + - environment: [dev] + scope: [nightly] + platforms: [dgx_gb200] + # - test_case: [gpt3_mcore_tp2_pp2_resume_torch_dist_te_2experts] + # products: # non-determinism: #478 + # - environment: [dev, lts] + # scope: [nightly] + ####################################################################### + # Weekly tests: Run both DEV and LTS unless something is flaky # + ####################################################################### + ####################################################################### + # mr, mr-github tests: Mostly DEV on mr, mr-github, and LTS on nightly cadence, except for # + # some very important tests. 
# + ####################################################################### + - test_case: [gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer] + products: + - environment: [dev] + scope: [mr] + platforms: [dgx_gb200] + # - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8] + # products: + # - environment: [dev] + # scope: [mr] + # platforms: [dgx_gb200] # hang: #513 + - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph] + products: + - environment: [dev] + scope: [mr-broken] + platforms: [dgx_gb200] # hang: #513 + # - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental] + # products: + # - environment: [dev] + # scope: [mr] + # platforms: [dgx_gb200] # hang: #513 + - test_case: [gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer] + products: + - environment: [dev] + scope: [mr] + platforms: [dgx_gb200] + - test_case: [gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances] + products: + - environment: [dev] + scope: [mr] + platforms: [dgx_gb200] + - test_case: [gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM] + products: + - environment: [dev] + scope: [mr] + platforms: [dgx_gb200] + - test_case: [gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4] + products: + - environment: [dev] + scope: [mr] + platforms: [dgx_gb200] + - test_case: [gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router] + products: + - environment: [dev] + scope: [mr] + platforms: [dgx_gb200] + # - test_case: [gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading] + # products: + # - environment: [dev] + # scope: [mr] + # platforms: [dgx_gb200] + # - test_case: [gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading] + # products: + # - environment: [dev] + # scope: [mr] + # platforms: [dgx_gb200] + - test_case: 
[gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective] + products: + - environment: [dev] + scope: [mr] + platforms: [dgx_gb200] + - test_case: [gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer] + products: + - environment: [dev] + scope: [mr] + platforms: [dgx_gb200] + - test_case: [gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph] + products: + - environment: [dev] + scope: [mr] + platforms: [dgx_gb200] + - test_case: [gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_muon] + products: + - environment: [dev] + scope: [mr, mr-github, mr-slim] + platforms: [dgx_gb200] + - test_case: [gpt3_moe_mcore_te_ep8_resume_torch_dist_muon] + products: + - environment: [dev] + scope: [mr, mr-github, mr-slim] + platforms: [dgx_gb200] + - test_case: [gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading] + products: + - environment: [dev] + scope: [mr-broken] + platforms: [dgx_gb200] + ####################################################################### + # Super important mr, mr-github tests that run for both DEV and LTS per mr, mr-github # + ####################################################################### + # - test_case: [gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer] + # products: + # - environment: [dev] + # scope: [mr] + # platforms: [dgx_gb200] + # - test_case: [gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM] + # products: + # - environment: [dev] + # scope: [mr] + # platforms: [dgx_gb200] + ########################### + # Merge train tests # + ########################### + - test_case: [gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer] + products: + - environment: [dev] + scope: [mr, mr-github, mr-github-slim] + platforms: [dgx_gb200] + - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed] + products: + - environment: [dev] + scope: [mr-broken] + platforms: [dgx_gb200] diff --git a/tests/test_utils/recipes/gb200/unit-tests.yaml 
b/tests/test_utils/recipes/gb200/unit-tests.yaml new file mode 100644 index 00000000000..0e8cb72916b --- /dev/null +++ b/tests/test_utils/recipes/gb200/unit-tests.yaml @@ -0,0 +1,153 @@ +type: basic +format_version: 1 +maintainers: [mcore] +loggers: [stdout] +spec: + name: "{test_case}_{environment}_{platforms}_{tag}" + model: unit-tests + nodes: 2 + build: mcore-pyt-{environment} + gpus: 4 + platforms: dgx_gb200 + script_setup: | + unset https_proxy + echo "machine gitlab-master.nvidia.com login okoenig password $RO_API_TOKEN" | tee -a /root/.netrc + + # Checkout latest + cd /opt + rm -rf /opt/megatron-lm; mkdir megatron-lm; cd megatron-lm + git init + git remote add origin $MCORE_REPO + git fetch origin '+refs/merge-requests/*:refs/remotes/merge-requests/*' + git fetch origin $MCORE_MR_COMMIT + git checkout $MCORE_MR_COMMIT + git rev-parse HEAD + + # Checkout backwards-ref + cd /opt + bash /opt/megatron-lm/.gitlab/scripts/fetch-legacy-suite.sh \ + --backwards-commit $MCORE_BACKWARDS_COMMIT \ + --repo $MCORE_REPO + + script: |- + ls + + TAG={tag} + ENVIRONMENT={environment} + BUCKET="{test_case}" + UNIT_TEST_REPEAT={n_repeat} + export GPUS_PER_NODE={gpus} + + if [[ "$TAG" == "latest" ]]; then + TEST_PATH="/opt/megatron-lm" + else + TEST_PATH="/opt/megatron-lm-legacy/" + fi + + + bash $TEST_PATH/tests/unit_tests/run_ci_test.sh \ + --tag $TAG \ + --environment $ENVIRONMENT \ + --bucket $BUCKET \ + --unit-test-repeat $UNIT_TEST_REPEAT \ + --log-dir {assets_dir}/logs/1/ + + ls -al + + cd $TEST_PATH + /opt/venv/bin/coverage xml + cp .coverage {assets_dir}/coverage_report + cp coverage.xml {assets_dir} + +products: + - test_case: [tests/unit_tests/test_model_configs.py] + products: + - environment: [dev] + tag: [latest, legacy] + scope: [unit-tests] + n_repeat: [1] + time_limit: [1800] + - test_case: [tests/unit_tests/test_fp8_param.py] + products: + - environment: [dev] + tag: [latest, legacy] + scope: [unit-tests] + n_repeat: [1] + time_limit: [1800] + - test_case: 
[tests/unit_tests/pipeline_parallel/**/*.py] + products: + - environment: [dev] + tag: [latest, legacy] + scope: [unit-tests] + n_repeat: [1] + time_limit: [1800] + - test_case: [tests/unit_tests/models/**/*.py] + products: + - environment: [dev] + tag: [latest, legacy] + scope: [unit-tests] + n_repeat: [1] + time_limit: [1800] + - test_case: [tests/unit_tests/data/**/*.py] + products: + - environment: [dev] + tag: [latest, legacy] + scope: [unit-tests] + n_repeat: [1] + time_limit: [1800] + - test_case: [tests/unit_tests/dist_checkpointing/test_optimizer.py] + products: + - environment: [dev] + tag: [latest, legacy] + scope: [unit-tests] + n_repeat: [1] + time_limit: [1800] + - test_case: [tests/unit_tests/dist_checkpointing/**/*.py] + products: + - environment: [dev] + tag: [latest, legacy] + scope: [unit-tests] + n_repeat: [1] + time_limit: [1800] + - test_case: [tests/unit_tests/dist_checkpointing/models/**/*.py] + products: + - environment: [dev] + tag: [latest, legacy] + scope: [unit-tests] + n_repeat: [1] + time_limit: [1800] + - test_case: [tests/unit_tests/dist_checkpointing/models/test_moe_experts.py] + products: + - environment: [dev] + tag: [latest, legacy] + scope: [unit-tests] + n_repeat: [1] + time_limit: [1800] + - test_case: [tests/unit_tests/transformer/**/*.py] + products: + - environment: [dev] + tag: [latest, legacy] + scope: [unit-tests] + n_repeat: [1] + time_limit: [1800] + - test_case: [tests/unit_tests/transformer/moe/**/*.py] + products: + - environment: [dev] + tag: [latest, legacy] + scope: [unit-tests] + n_repeat: [1] + time_limit: [1800] + - test_case: [tests/unit_tests/distributed/megatron_fsdp/**/*.py] + products: + - environment: [dev] + tag: [latest] + scope: [unit-tests] + n_repeat: [1] + time_limit: [1800] + - test_case: [tests/unit_tests/**/*.py] + products: + - environment: [dev] + tag: [latest, legacy] + scope: [unit-tests] + n_repeat: [1] + time_limit: [1800] diff --git a/tests/test_utils/recipes/bert.yaml 
b/tests/test_utils/recipes/h100/bert.yaml similarity index 100% rename from tests/test_utils/recipes/bert.yaml rename to tests/test_utils/recipes/h100/bert.yaml diff --git a/tests/test_utils/recipes/ckpt_converter.yaml b/tests/test_utils/recipes/h100/ckpt_converter.yaml similarity index 100% rename from tests/test_utils/recipes/ckpt_converter.yaml rename to tests/test_utils/recipes/h100/ckpt_converter.yaml diff --git a/tests/test_utils/recipes/gpt-dynamic-inference-cuda-graphs.yaml b/tests/test_utils/recipes/h100/gpt-dynamic-inference-cuda-graphs.yaml similarity index 100% rename from tests/test_utils/recipes/gpt-dynamic-inference-cuda-graphs.yaml rename to tests/test_utils/recipes/h100/gpt-dynamic-inference-cuda-graphs.yaml diff --git a/tests/test_utils/recipes/gpt-dynamic-inference-with-coordinator.yaml b/tests/test_utils/recipes/h100/gpt-dynamic-inference-with-coordinator.yaml similarity index 100% rename from tests/test_utils/recipes/gpt-dynamic-inference-with-coordinator.yaml rename to tests/test_utils/recipes/h100/gpt-dynamic-inference-with-coordinator.yaml diff --git a/tests/test_utils/recipes/gpt-dynamic-inference.yaml b/tests/test_utils/recipes/h100/gpt-dynamic-inference.yaml similarity index 100% rename from tests/test_utils/recipes/gpt-dynamic-inference.yaml rename to tests/test_utils/recipes/h100/gpt-dynamic-inference.yaml diff --git a/tests/test_utils/recipes/gpt-grads.yaml b/tests/test_utils/recipes/h100/gpt-grads.yaml similarity index 100% rename from tests/test_utils/recipes/gpt-grads.yaml rename to tests/test_utils/recipes/h100/gpt-grads.yaml diff --git a/tests/test_utils/recipes/gpt-grpo.yaml b/tests/test_utils/recipes/h100/gpt-grpo.yaml similarity index 100% rename from tests/test_utils/recipes/gpt-grpo.yaml rename to tests/test_utils/recipes/h100/gpt-grpo.yaml diff --git a/tests/test_utils/recipes/gpt-nemo.yaml b/tests/test_utils/recipes/h100/gpt-nemo.yaml similarity index 100% rename from tests/test_utils/recipes/gpt-nemo.yaml rename to 
tests/test_utils/recipes/h100/gpt-nemo.yaml diff --git a/tests/test_utils/recipes/gpt-static-inference.yaml b/tests/test_utils/recipes/h100/gpt-static-inference.yaml similarity index 100% rename from tests/test_utils/recipes/gpt-static-inference.yaml rename to tests/test_utils/recipes/h100/gpt-static-inference.yaml diff --git a/tests/test_utils/recipes/gpt.yaml b/tests/test_utils/recipes/h100/gpt.yaml similarity index 100% rename from tests/test_utils/recipes/gpt.yaml rename to tests/test_utils/recipes/h100/gpt.yaml diff --git a/tests/test_utils/recipes/mamba-dynamic-inference.yaml b/tests/test_utils/recipes/h100/mamba-dynamic-inference.yaml similarity index 100% rename from tests/test_utils/recipes/mamba-dynamic-inference.yaml rename to tests/test_utils/recipes/h100/mamba-dynamic-inference.yaml diff --git a/tests/test_utils/recipes/mamba-static-inference.yaml b/tests/test_utils/recipes/h100/mamba-static-inference.yaml similarity index 100% rename from tests/test_utils/recipes/mamba-static-inference.yaml rename to tests/test_utils/recipes/h100/mamba-static-inference.yaml diff --git a/tests/test_utils/recipes/mamba.yaml b/tests/test_utils/recipes/h100/mamba.yaml similarity index 100% rename from tests/test_utils/recipes/mamba.yaml rename to tests/test_utils/recipes/h100/mamba.yaml diff --git a/tests/test_utils/recipes/mimo.yaml b/tests/test_utils/recipes/h100/mimo.yaml similarity index 100% rename from tests/test_utils/recipes/mimo.yaml rename to tests/test_utils/recipes/h100/mimo.yaml diff --git a/tests/test_utils/recipes/module_performance.yaml b/tests/test_utils/recipes/h100/module_performance.yaml similarity index 100% rename from tests/test_utils/recipes/module_performance.yaml rename to tests/test_utils/recipes/h100/module_performance.yaml diff --git a/tests/test_utils/recipes/moe-dynamic-inference-with-coordinator.yaml b/tests/test_utils/recipes/h100/moe-dynamic-inference-with-coordinator.yaml similarity index 100% rename from 
tests/test_utils/recipes/moe-dynamic-inference-with-coordinator.yaml rename to tests/test_utils/recipes/h100/moe-dynamic-inference-with-coordinator.yaml diff --git a/tests/test_utils/recipes/moe-dynamic-inference.yaml b/tests/test_utils/recipes/h100/moe-dynamic-inference.yaml similarity index 100% rename from tests/test_utils/recipes/moe-dynamic-inference.yaml rename to tests/test_utils/recipes/h100/moe-dynamic-inference.yaml diff --git a/tests/test_utils/recipes/moe-grpo.yaml b/tests/test_utils/recipes/h100/moe-grpo.yaml similarity index 100% rename from tests/test_utils/recipes/moe-grpo.yaml rename to tests/test_utils/recipes/h100/moe-grpo.yaml diff --git a/tests/test_utils/recipes/moe-static-inference.yaml b/tests/test_utils/recipes/h100/moe-static-inference.yaml similarity index 100% rename from tests/test_utils/recipes/moe-static-inference.yaml rename to tests/test_utils/recipes/h100/moe-static-inference.yaml diff --git a/tests/test_utils/recipes/moe.yaml b/tests/test_utils/recipes/h100/moe.yaml similarity index 100% rename from tests/test_utils/recipes/moe.yaml rename to tests/test_utils/recipes/h100/moe.yaml diff --git a/tests/test_utils/recipes/multimodal-llava.yaml b/tests/test_utils/recipes/h100/multimodal-llava.yaml similarity index 100% rename from tests/test_utils/recipes/multimodal-llava.yaml rename to tests/test_utils/recipes/h100/multimodal-llava.yaml diff --git a/tests/test_utils/recipes/t5.yaml b/tests/test_utils/recipes/h100/t5.yaml similarity index 100% rename from tests/test_utils/recipes/t5.yaml rename to tests/test_utils/recipes/h100/t5.yaml diff --git a/tests/test_utils/recipes/unit-tests.yaml b/tests/test_utils/recipes/h100/unit-tests.yaml similarity index 100% rename from tests/test_utils/recipes/unit-tests.yaml rename to tests/test_utils/recipes/h100/unit-tests.yaml diff --git a/tests/unit_tests/find_test_cases.py b/tests/unit_tests/find_test_cases.py index 2e9f5515b7d..1445206cab5 100644 --- a/tests/unit_tests/find_test_cases.py +++ 
b/tests/unit_tests/find_test_cases.py @@ -50,7 +50,8 @@ def expand_pattern(pattern): def main(): BUCKET = sys.argv[1] - YAML_FILE = 'tests/test_utils/recipes/unit-tests.yaml' + GPU_TYPE = sys.argv[2] + YAML_FILE = f'tests/test_utils/recipes/{GPU_TYPE}/unit-tests.yaml' all_test_cases = get_test_cases(YAML_FILE) bucket_files = set(expand_pattern(BUCKET)) diff --git a/tests/unit_tests/run_ci_test.sh b/tests/unit_tests/run_ci_test.sh index 81dd3ae2a14..c65f197d2db 100755 --- a/tests/unit_tests/run_ci_test.sh +++ b/tests/unit_tests/run_ci_test.sh @@ -117,7 +117,7 @@ export BUCKET IGNORE_ARGS=() while IFS= read -r line; do [[ -n "$line" ]] && IGNORE_ARGS+=("$line") -done < <(python tests/unit_tests/find_test_cases.py "$BUCKET") +done < <(python tests/unit_tests/find_test_cases.py "$BUCKET" "h100") echo "------ARGUMENTS for SLURM ---" MASTER_ADDR=${MASTER_ADDR:-localhost} From de15117552a1c3795aa30f0924adf348d91421c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 30 Jan 2026 13:13:56 +0100 Subject: [PATCH 003/231] ci(hotfix): Alert for GB200 (#3168) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .gitlab/stages/04.functional-tests.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.gitlab/stages/04.functional-tests.yml b/.gitlab/stages/04.functional-tests.yml index 1591373e9a6..77298f200c5 100644 --- a/.gitlab/stages/04.functional-tests.yml +++ b/.gitlab/stages/04.functional-tests.yml @@ -205,15 +205,15 @@ functional:run_dev_dgx_gb200: functional:run_nemo: extends: [.functional_tests_rules] trigger: - project: 'dl/joc/nemo-ci' + project: "dl/joc/nemo-ci" branch: main-mirror strategy: depend inherit: variables: true variables: MCORE_COMMIT: $CI_COMMIT_SHA - TEST_NEMO2_MODULE: 'True' - ALLOW_FAILURE_DEPENDENCY: 'True' + TEST_NEMO2_MODULE: "True" + ALLOW_FAILURE_DEPENDENCY: "True" TESTS_TO_RUN_ON_THIS_COMMIT: nightly rules: - if: $FUNCTIONAL_TEST == 
"yes" @@ -229,6 +229,8 @@ functional:x_notify: - functional:run_dev_dgx_a100 - functional:run_lts_dgx_h100 - functional:run_dev_dgx_h100 + - functional:run_lts_dgx_gb200 + - functional:run_dev_dgx_gb200 tags: - arch/amd64 - env/prod From 7952d7eff1ee5e33ff6dd8df9a9695a3c6b3fe59 Mon Sep 17 00:00:00 2001 From: Duncan Riach <33532941+duncanriach@users.noreply.github.com> Date: Fri, 30 Jan 2026 05:03:13 -0800 Subject: [PATCH 004/231] Fix SFTDataset truncation bug (#3158) --- megatron/training/datasets/sft_dataset.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/megatron/training/datasets/sft_dataset.py b/megatron/training/datasets/sft_dataset.py index 2cbc4e424eb..b313dafb0ec 100644 --- a/megatron/training/datasets/sft_dataset.py +++ b/megatron/training/datasets/sft_dataset.py @@ -142,19 +142,12 @@ def extend_with_padding(tokens, targets, positions, pad_len): # Handle any necessary truncation if len(pack_tokens) >= pack_length + 1: # +1 here to account for later alignment - truncate_left_not_right = True # TODO(duncan): plumb this switch in - if truncate_left_not_right: # Retain existing eod - max_body = pack_length - pack_tokens = pack_tokens[-max_body:] - pack_targets = pack_targets[-max_body:] - pack_tokens.append(pad) - pack_targets.append(pad) - else: # Truncate right (need to add eod) - max_body = pack_length - 1 - pack_tokens = pack_tokens[:max_body] - pack_targets = pack_targets[:max_body] - pack_tokens.extend([eod, pad]) - pack_targets.extend([eod, pad]) + # Truncate on the right + max_body = pack_length + pack_tokens = pack_tokens[:max_body] + pack_targets = pack_targets[:max_body] + pack_tokens.extend(pad) + pack_targets.extend(pad) pack_positions = pack_positions[:pack_length+1] # Note len({pack_tokens, pack_targets, pack_positions}) should be pack_length + 1 cu_seqlens[-1] = len(pack_tokens) - 1 From b9ee19e6ba4900a1fc698324d5c6d5e29a8be879 Mon Sep 17 00:00:00 2001 From: yobi byte Date: Fri, 30 Jan 2026 14:59:25 
+0000 Subject: [PATCH 005/231] Vitalyk/multiturn v2 (#3167) --- megatron/rl/agent/api.py | 14 +- megatron/rl/agent/reward_only_agent.py | 8 +- megatron/rl/rl_utils.py | 552 ++++++++++-------- megatron/rl/sequence_packing_utils.py | 128 ++-- megatron/training/arguments.py | 2 - megatron/training/training.py | 16 +- .../golden_values_dev_dgx_h100.json | 120 ++-- .../model_config.yaml | 4 +- tests/unit_tests/rl/test_rl_utils.py | 121 +++- .../rl/test_sequence_packing_utils.py | 56 ++ train_rl.py | 1 + 11 files changed, 606 insertions(+), 416 deletions(-) diff --git a/megatron/rl/agent/api.py b/megatron/rl/agent/api.py index 34efa68d85a..9568db3a54d 100644 --- a/megatron/rl/agent/api.py +++ b/megatron/rl/agent/api.py @@ -46,8 +46,8 @@ class GroupedRolloutRequest(Request): class Rollout(AgentBaseModel): """Data for language-based Rollout.""" - trajectory: str - prompt_length: int | None = None + trajectory: list[str] + prompt_length: list[int] | None = None reward: float = None env_id: str | None = None problem_id: str | None = None @@ -56,10 +56,10 @@ class Rollout(AgentBaseModel): class TokenRollout(AgentBaseModel): """Tokenized representation of a language-based Rollout.""" - trajectory: list[int] + trajectory: list[list[int]] reward: list[float] | float - generation_mask: list[list[int]] | list[bool] | None = None - logprobs: list[float] | None = None + generation_mask: list[list[bool]] | None = None + logprobs: list[list[float]] | None = None env_id: str | None = None problem_id: str | None = None @@ -67,8 +67,8 @@ class TokenRollout(AgentBaseModel): class ContrastiveRollout(AgentBaseModel): """Contrastive/Preference data for language-based Rollout.""" - chosen_trajectory: str - rejected_trajectory: str + chosen_trajectory: list[str] + rejected_trajectory: list[str] class Head2HeadRolloutRequest(Request): diff --git a/megatron/rl/agent/reward_only_agent.py b/megatron/rl/agent/reward_only_agent.py index 2e81674c74d..53b1f7407b2 100644 --- 
a/megatron/rl/agent/reward_only_agent.py +++ b/megatron/rl/agent/reward_only_agent.py @@ -104,16 +104,16 @@ async def rollout_from_response( for x in range(len(response.token_ids)) ] rollout = TokenRollout( - trajectory=response.token_ids, + trajectory=[response.token_ids], reward=await self.get_reward(response_text, golden), - logprobs=logprobs, - generation_mask=generation_mask, + logprobs=[logprobs], + generation_mask=[generation_mask], env_id=self.env_id, problem_id=golden['problem_id'] if 'problem_id' in golden else None, ) else: rollout = Rollout( - trajectory=raw_text, + trajectory=[raw_text], reward=await self.get_reward(response_text, golden), env_id=self.env_id, problem_id=golden['problem_id'] if 'problem_id' in golden else None, diff --git a/megatron/rl/rl_utils.py b/megatron/rl/rl_utils.py index 973a396b909..364a80db81e 100644 --- a/megatron/rl/rl_utils.py +++ b/megatron/rl/rl_utils.py @@ -2,18 +2,18 @@ import gc +import copy +from functools import partial # Keep this to make the env registered. 
import itertools -import json -import logging import math +import logging import pickle from collections import Counter, defaultdict from contextlib import contextmanager, nullcontext from dataclasses import dataclass -from difflib import SequenceMatcher from pathlib import Path -from typing import Any, Dict, Iterator, List, Optional, Tuple +from typing import Any, Dict, Iterator, List, Optional import numpy as np import torch @@ -26,6 +26,7 @@ from megatron.core.datasets.megatron_tokenizer import MegatronLegacyTokenizer from megatron.core.full_cuda_graph import FullCudaGraphWrapper from megatron.core.models.common.language_module.language_module import LanguageModule +from megatron.core.num_microbatches_calculator import reconfigure_num_microbatches_calculator from megatron.core.optimizer import MegatronOptimizer from megatron.core.pipeline_parallel import get_forward_backward_func from megatron.core.pipeline_parallel.utils import is_pp_last_stage, get_pp_last_rank @@ -45,10 +46,10 @@ compute_packed_inference_logprobs_stats, pack_all_trajectories, load_packed_data_by_index, - update_sequence_packing_metrics, get_sequence_packing_tensorboard_metrics, get_sequence_packing_log_info, get_default_packed_seq_params, + update_microbatch_calculator, ) from megatron.rl.agent.api import ( EvaluationRequest, @@ -66,7 +67,6 @@ from megatron.training.global_vars import ( get_args, get_tensorboard_writer, - get_timers, get_tokenizer, get_wandb_writer, ) @@ -209,13 +209,13 @@ def verify_model_weights_swap( if inf_was_training: inf_core.train() -GroupedRollouts = list[list[TokenRollout | Rollout]] +Rollouts = list[TokenRollout | Rollout] +GroupedRollouts = list[Rollouts] @dataclass(slots=True) class RolloutStats: mean_reward: float - mean_sim: None | float mean_length: float mean_length_std: float max_length: float @@ -233,6 +233,7 @@ class RolloutStats: min_inf_prob: None | float max_inf_prob: None | float mean_inf_prob: None | float + num_turns: list[int] # num_turns per traj # 
Runtime state container for RL-specific data that shouldn't be checkpointed @@ -242,7 +243,6 @@ class RLRuntimeState: def __init__(self): self.packing_context = None self.last_collection_iteration = 0 - self.global_batches_per_collection = 0 self.sequences_this_iteration_on_rank = 0 self.latest_batch_num_sequences = 0 @@ -613,20 +613,20 @@ def get_logprobs(model, tokens, position_ids, no_grad=False, sequence_packing=Fa """ + args = get_args() # Ensure packed_seq_params is always provided for CUDA graph signature consistency if packed_seq_params is None and sequence_packing: packed_seq_params = get_default_packed_seq_params( seq_length=tokens.shape[1], + max_sequences_per_bin=args.rl_sequence_packing_max_sequences_per_bin, device=tokens.device, ) nvtx_range = get_nvtx_range() with nvtx_range("get-logprobs", time=False): - with nvtx_range("forward-pass", time=False): # TODO(vitalyk): use fp16/bf16 as a function argument. Do not use args. - args = get_args() attention_mask_for_forward = None @@ -658,19 +658,46 @@ def get_logprobs(model, tokens, position_ids, no_grad=False, sequence_packing=Fa return logprobs +def calculate_grpo_advantages(rewards: list[list[float]], num_turns: list[list[int]]) -> np.ndarray: + """Calculate GRPO advantages from rewards/num_turns. + + For multiturn rollouts, the logic is a bit more involved. + # For training, we'll be turning each turn into a trajectory with the same reward + # within a trajectory, e.g. if [[a,b],[c,d,e]] trajectory has reward 1.0, we will + # get [a,b] with 1.0 and [c,d,e] with 1.0 when doing updates. + """ + + rewards = np.array(rewards) + + num_turns = np.array(num_turns) + # Each outer dimension of num_turns is a group. Sum of those gives total num_turns per group. + # Let's use this to calculate advantage. 
+ # mean/std should be repeated based on group lens + group_turns = num_turns.sum(axis=-1) + reward_means = rewards.mean(axis=1, keepdims=True).repeat(group_turns) + reward_stds = rewards.std(axis=1, keepdims=True).repeat(group_turns) + + # rewards are originally [g, group_size] + # Making an assumption that all groups are of the same size! + # @vitalyk: this will go away when we start sending env-based sample reqs. + rewards = rewards.flatten().repeat(num_turns.flatten()) + + return ((rewards - reward_means) / (1e-4 + reward_stds)).tolist() + + def compute_group_stats( - rollouts: GroupedRollouts, tokenizer: MegatronLegacyTokenizer + rollouts: GroupedRollouts, tokenizer: MegatronLegacyTokenizer, seq_len: int, ) -> RolloutStats: """Add group-based rollout stats for logging. Args: rollouts: Rollouts to generate the stats for. Each inner list is a group (as in GRPO group), i.e. all rollouts are for the same prompt. tokenizer: Tokenizer to tokenize the rollouts in case they are raw strings. + seq_len: Maximum sequence length. Returns: RolloutStats object containing all the stats. 
""" - args = get_args() # TODO (rkirby) Maybe do some of this after the tensor building group_reward_means = [] group_reward_stds = [] @@ -678,54 +705,45 @@ def compute_group_stats( group_length_stds = [] group_length_maxs = [] group_length_mins = [] - group_rollout_similarities = [] + rewards = [] + num_turns = [] # num_turns per traj for group in rollouts: group_rewards = [] group_lengths = [] + group_num_turns = [] for rollout in group: + group_num_turns.append(len(rollout.trajectory)) if isinstance(rollout, TokenRollout): - lang_rl_log( - f"Rollout: [{rollout.env_id}] [{rollout.reward} : {len(rollout.trajectory)} tokens] {tokenizer.detokenize(rollout.trajectory)}" - ) - assert (len(rollout.trajectory) == args.seq_length) or ( - rollout.trajectory[-1] == tokenizer.eod - ), f"Rollout is not the correct length: {len(rollout.trajectory)} {rollout.trajectory[-1]}\n{tokenizer.detokenize(rollout.trajectory)}" + for turn_traj in rollout.trajectory: + detokenized_traj = tokenizer.detokenize(turn_traj) + lang_rl_log( + f"Rollout: [{rollout.env_id}] [{rollout.reward} : {len(rollout.trajectory)} tokens] {detokenized_traj}" + ) + # TODO(vitalyk): how does multiturn change EOD/EOT? + assert (len(turn_traj) == seq_len) or ( + turn_traj[-1] == tokenizer.eod + ), f"Rollout is not the correct length: {len(turn_traj)} {turn_traj[-1]}\n{detokenized_traj}" else: lang_rl_log( f"Rollout: [{rollout.env_id}] [{rollout.reward} : {len(rollout.trajectory)} chars] {rollout.trajectory}" ) group_rewards.append(rollout.reward) - group_lengths.append(len(rollout.trajectory)) - if args.rl_calculate_intra_group_similarity: - # We can probably compute this outside, but in case we switch to different group sizes for different envs, let's keep it here. - combos = itertools.combinations(range(len(group)), 2) - # For every pair (excluding ourselves), check the sequence similarity and log. - # Use this to track the diversity of generated rollouts within a group. 
- intra_group_sim = np.mean( - list( - map( - lambda idx_pair: SequenceMatcher( - None, group[idx_pair[0]].trajectory, group[idx_pair[1]].trajectory - ).ratio(), - combos, - ) - ) - ) - group_rollout_similarities.append(intra_group_sim) - else: - group_rollout_similarities = None + #TODO(vitalyk): What is the semantics behind traj length in multiturn? Should we take the last only? Average them instead of extending? + group_lengths.extend(len(t) for t in rollout.trajectory) group_length_maxs.append(max(group_lengths)) group_length_mins.append(min(group_lengths)) group_reward_means.append(np.mean(group_rewards)) group_reward_stds.append(np.std(group_rewards)) + rewards.append(group_rewards) group_length_means.append(np.mean(group_lengths)) # https://arxiv.org/abs/2504.21233 reports that lens variants hurts. # Let's track this. group_length_stds.append(np.std(group_lengths)) + num_turns.append(group_num_turns) + stats = RolloutStats( mean_reward=np.mean(group_reward_means), - mean_sim=np.mean(group_rollout_similarities) if group_rollout_similarities else None, mean_length=np.mean(group_length_means), mean_length_std=np.mean(group_length_stds), max_length=np.max(group_length_maxs), @@ -741,8 +759,9 @@ def compute_group_stats( min_inf_prob=None, max_inf_prob=None, mean_inf_prob=None, - rewards=None, # We will fill those in later in prepare_data_for_update. - advantages=None, # We will fill those in later in prepare_data_for_update. 
+ rewards=[r for group in rewards for r in group], + advantages=calculate_grpo_advantages(rewards, num_turns), + num_turns=[nt for group in num_turns for nt in group], ) return stats @@ -813,11 +832,10 @@ def maybe_log_training_metrics( columns=['Trajectories', 'Tokens', 'Rewards'], rows=[ [ - ( - tokenizer.detokenize(r.trajectory) + [(tokenizer.detokenize(turn) if isinstance(r, TokenRollout) - else r.trajectory - ), + else turn) for turn in r.trajectory + ], r.trajectory, r.reward, ] @@ -825,11 +843,6 @@ def maybe_log_training_metrics( ], ), }, - **( - {'mean_intra_group_similarity': group_stats.mean_sim} - if group_stats.mean_sim - else {} - ), }, step=current_iteration, ) @@ -838,10 +851,9 @@ def maybe_log_training_metrics( def prepare_trajectories( - rollouts: GroupedRollouts, tokenizer: MegatronLegacyTokenizer, seq_length: int + rollouts: Rollouts, tokenizer: MegatronLegacyTokenizer, seq_length: int, sequence_packing: bool, skip_bos_token: bool ): """Pad trajectories and extract the generation masks. - Args: rollouts: Rollouts to extract trajectories from. tokenizer: Tokenizer to get the padding token and potentially tokenize. @@ -858,6 +870,7 @@ def prepare_trajectories( DEFAULT_PAD_TOKENS = ['<|finetune_right_pad_id|>'] + if isinstance(tokenizer, _HuggingFaceTokenizer): if not tokenizer.pad: for pad_token in DEFAULT_PAD_TOKENS: @@ -888,17 +901,19 @@ def prepare_trajectories( trajs = [] generation_masks = [] inference_logprobs = [] - for group in rollouts: - for rollout in group: - generation_mask = rollout.generation_mask if isinstance(rollout, TokenRollout) else None - - trajectory = ( - rollout.trajectory.copy() - if isinstance(rollout, TokenRollout) - else tokenizer.tokenize(rollout.trajectory) - ) - inf_logprobs = rollout.logprobs - + for rollout in rollouts: + # traj, gen mask and logprobs are lists now. + # each list entry is a turn, single-turn environments just have a single-element list. 
+ # We assume that all lengths of the structs above have the same lengths (number of turns). + + all_turns_trajectories = ( + copy.deepcopy(rollout.trajectory) + if isinstance(rollout, TokenRollout) + else tokenizer.tokenize(rollout.trajectory) + ) + for turn_idx, trajectory in enumerate(all_turns_trajectories): + inf_logprobs = rollout.logprobs[turn_idx] + generation_mask = rollout.generation_mask[turn_idx] if isinstance(rollout, TokenRollout) else None length = len(trajectory) assert length <= seq_length, "Rollout too long, how did this happen?" if len(trajectory) < seq_length: @@ -920,8 +935,7 @@ def prepare_trajectories( else: inference_logprobs.append(None) - env_id = rollout.env_id - env_id_counts[env_id] += 1 + env_id_counts[rollout.env_id] += 1 if torch.distributed.is_initialized(): logger.info(f"[{dist.get_rank()}] Rollout counts:") @@ -931,34 +945,18 @@ def prepare_trajectories( generation_masks = torch.tensor(generation_masks, dtype=torch.bool, device='cpu') trajs = torch.tensor(trajs, device='cpu') - args = get_args() # Only process if we have inference_logprobs if inference_logprobs and any(lp is not None for lp in inference_logprobs): - if args.rl_use_sequence_packing: - # For sequence packing, we need to pad all logprobs to the same size - padded_logprobs = [] - for logprobs in inference_logprobs: - if logprobs is not None: - if len(logprobs) < seq_length: - # Pad with zeros (these positions will be masked anyway) - padding_size = seq_length - len(logprobs) - padded = torch.nn.functional.pad(logprobs, (0, padding_size), value=0.0) - padded_logprobs.append(padded) - else: - padded_logprobs.append(logprobs) - else: - # Create zero tensor for None logprobs - padded_logprobs.append(torch.zeros(seq_length)) - inference_logprobs = torch.stack(padded_logprobs) - else: - # For non-packing mode, keep as list of tensors (unpadded) - # This preserves the original behavior where each sequence can have different lengths - pass + # We need to pad all logprobs to 
the same size for sequence packing. + # For non-packing mode, keep as list of tensors (unpadded) + # This preserves the original behavior where each sequence can have different lengths + if sequence_packing: + inference_logprobs = _pad_nonnull_with_zeros(inference_logprobs, seq_length) else: inference_logprobs = None # Some sanity checks regarding the tokenization - if not args.rl_skip_bos_token: + if not skip_bos_token: assert ( tokenizer.bos is None or (trajs[:, 0] == tokenizer.bos).all() ), "First token should be bos" @@ -979,11 +977,92 @@ def prepare_trajectories( return trajs, generation_masks, inference_logprobs +def logprobs_forward_step(data_iterator, model, is_correction, packing_context=None): + # Avoid self.training checks which will trigger cudagraph capture; this path reuses + # the forward pass from training after it has been captured on the 1st iteration. + model.eval() + + if packing_context is not None: + # When using sequence packing, the data iterator returns a tuple with a single element, the bin index. 
+ bin_tensor = next(data_iterator)[0] + #TODO(jalbericiola): change for named tuple + (b_trajs, _, _, _, b_posids, _, _, _, _, _, b_packed_seq_params) = ( + load_packed_data_by_index(bin_tensor.item(), packing_context, is_correction) + ) + else: + b_trajs, b_posids = next(data_iterator) + b_packed_seq_params = None + + logprobs = ( + get_logprobs( + model, + b_trajs.cuda(), + b_posids.cuda(), + no_grad=True, + sequence_packing=b_packed_seq_params is not None, + packed_seq_params=b_packed_seq_params, + ), + None, + ) + model.train() + return logprobs + + +def _compute_logprobs_batch( + model, + data_loader, + forward_backward_func, + packing_context, + trajs_batch_size, # n_bins for seq packing, and batch_size for non seq packing + seq_length, + logprobs_batch_size, + decoder_seq_length, + dtype, + pp_group, + is_correction, +): + """Compute logprobs for all batches in the data loader.""" + logprobs_list = [] + data_iterator = iter(data_loader) + for i in range(len(data_loader)): + output_tensor = forward_backward_func( + forward_step_func=partial(logprobs_forward_step, is_correction=is_correction, packing_context=packing_context), + data_iterator=data_iterator, + model=model, + num_microbatches=1, + seq_length=seq_length, + micro_batch_size=logprobs_batch_size, + decoder_seq_length=decoder_seq_length, + forward_only=True, + adjust_tensor_shapes_fn=None, + ) + if is_pp_last_stage(pp_group): + logprobs_list.append(output_tensor[0].detach()) + + if is_pp_last_stage(pp_group): + logprobs = torch.concat(logprobs_list, dim=0) + assert logprobs.dtype == dtype + else: + logprobs = torch.empty( + trajs_batch_size, + seq_length-1, + dtype=dtype, + device=torch.cuda.current_device(), + ) + + # Only PP>1 needs a broadcast from the last stage; for PP=1 the output is already local. 
+ if get_pg_size(pp_group) > 1: + dist.broadcast(logprobs, src=get_pp_last_rank(pp_group), group=pp_group) + return logprobs.cpu() + + def prepare_data_for_update( model: list[LanguageModule], ref_state_dict: Dict[str, Any], rollouts: GroupedRollouts, tokenizer: MegatronLegacyTokenizer, + sequence_packing: bool, + is_correction: bool, ) -> RerunDataIterator: """Extract data for the update from raw rollouts. @@ -992,6 +1071,8 @@ def prepare_data_for_update( ref_state_dict: Reference policy state dict. rollouts: Rollouts to extract the data from. tokenizer: Tokenizer to pad/tokenize data. + sequence_packing: Use sequence packing if True. + is_correction: Prepare data for IS correction if True. Returns: Cycled iterator over dataset batches. In GRPO we might want to go over the same data multiple times. @@ -1013,59 +1094,50 @@ def prepare_data_for_update( with nvtx_range("prepare-data-for-update"): with nvtx_range("compute-group-stats"): - # These are computed on all rollouts for reporting purposes - group_stats = compute_group_stats(rollouts, tokenizer) - rewards = np.array([[rollout.reward for rollout in group] for group in rollouts]) - group_stats.rewards = rewards.flatten().tolist() - group_stats.advantages = ( - ( - (rewards - rewards.mean(axis=1, keepdims=True)) - / (1e-4 + rewards.std(axis=1, keepdims=True)) - ) - .flatten() - .tolist() - ) - global_rollout_count = len(group_stats.rewards) - - with nvtx_range("prepare_advantages", time=True): - # [g, group_size] - # Making an assumption that all groups are of the same size! - rewards = torch.tensor(rewards, device='cpu') - advantages = (rewards - rewards.mean(axis=1, keepdim=True)) / ( - 1e-4 + rewards.std(axis=1, keepdim=True) - ) - - # Flatten advantages for training and move to GPU - advantages = global_advantages = advantages.view(-1).cuda() + group_stats = compute_group_stats(rollouts, tokenizer, args.seq_length) + # TODO(vitalyk): why do we need global_advantages here? 
go inside packing + advantages = global_advantages = torch.tensor(group_stats.advantages, dtype=dtype).cuda() # Now split the rollouts across the data parallel ranks for training # This needs to be done at this point because we are about to calculate logprobs # Note :- For EP, do not use the expert data parallel group here. Always # use the regular data parallel group. + + # Use one group as an exampling for logging later. + example_group = rollouts[0] + + # Let's expand rollouts getting rid of the groups. + # We need this to correctly split the rollouts across dp groups. + # And we do not actually need them grouped in anything below anyways. + rollouts = [r for g in rollouts for r in g] + total_turns_sampled = len(rollouts) + + # We might sample more than we consume in one step. + samples_ratio_per_step = args.global_batch_size / (args.grpo_prompts_per_step * args.grpo_group_size) + assert samples_ratio_per_step <= 1, "You cannot use more data than you sampled." + if (data_parallel_world_size := mpu.get_data_parallel_world_size()) > 0: data_split_size = len(rollouts) // data_parallel_world_size data_split_range = ( mpu.get_data_parallel_rank() * data_split_size, (mpu.get_data_parallel_rank() + 1) * data_split_size, ) + # TODO(vitalyk): This has to be rewritten assuming we are multiturn now. rollouts = rollouts[data_split_range[0] : data_split_range[1]] + local_num_turns = sum(group_stats.num_turns[data_split_range[0] : data_split_range[1]]) + steps_before = sum(group_stats.num_turns[:data_split_range[0]]) + advantages = advantages[steps_before:steps_before+local_num_turns] # First we calculate them on a global level and then we split and recalculate on a local level. # Sequence packing and reporting needs it global but non-packing wants it local. 
- rewards = torch.tensor([[r.reward for r in group] for group in rollouts], device='cpu') - advantages = (rewards - rewards.mean(axis=1, keepdim=True)) / ( - 1e-4 + rewards.std(axis=1, keepdim=True) - ) - - # Flatten advantages for training and move to GPU - advantages = advantages.view(-1).cuda() with nvtx_range("prepare_trajectories"): trajs, generation_masks, inference_logprobs = prepare_trajectories( - rollouts, tokenizer, args.seq_length + rollouts, tokenizer, args.seq_length, sequence_packing, args.rl_skip_bos_token ) + packing_context = None # Build trajectories based on sequence packing or standard processing - if args.rl_use_sequence_packing: + if sequence_packing: with nvtx_range("sequence_packing", time=True): runtime_state.packing_context = packing_context = pack_all_trajectories( trajs, @@ -1105,7 +1177,6 @@ def prepare_data_for_update( ) logprobs_batch_size = args.micro_batch_size - with torch.no_grad(), nvtx_range("compute_logprobs", time=True): # Before we can update the model, we need to get the logprobs for the \pi_{old} model. @@ -1116,41 +1187,6 @@ def prepare_data_for_update( forward_backward_func, cuda_graph_warmup_steps=args.cuda_graph_warmup_steps ) - def logprobs_forward_step(data_iterator, model): - - # Avoid self.training checks which will trigger cudagraph capture; this path reuses - # the forward pass from training after it has been captured on the 1st iteration. - model.eval() - - if args.rl_use_sequence_packing: - # When using sequence packing, the data iterator returns a tuple with a single element, the bin index. 
- bin_tensor = next(data_iterator)[0] - #TODO(jalbericiola): change for named tuple - (b_trajs, _, _, _, b_posids, _, _, _, _, _, b_packed_seq_params) = ( - load_packed_data_by_index(bin_tensor.item(), packing_context, args.rl_inference_logprobs_is_correction) - ) - else: - batch_data = next(data_iterator) - b_trajs, b_posids = batch_data - b_packed_seq_params = None - - b_trajs = b_trajs.cuda() - b_posids = b_posids.cuda() - - logprobs = ( - get_logprobs( - model, - b_trajs, - b_posids, - no_grad=True, - sequence_packing=args.rl_use_sequence_packing, - packed_seq_params=b_packed_seq_params, - ), - None, - ) - - model.train() - return logprobs dtype = ( torch.bfloat16 if args.bf16 else (torch.float16 if args.fp16 else torch.float32) @@ -1159,43 +1195,20 @@ def logprobs_forward_step(data_iterator, model): pg_collection = get_attr_wrapped_model(model, "pg_collection") pp_group = pg_collection.pp - def _compute_logprobs_batch(): - """Compute logprobs for all batches in the data loader.""" - logprobs_list = [] - data_iterator = iter(data_loader) - for i in range(len(data_loader)): - output_tensor = forward_backward_func( - forward_step_func=logprobs_forward_step, - data_iterator=data_iterator, - model=model, - num_microbatches=1, - seq_length=args.seq_length, - micro_batch_size=logprobs_batch_size, - decoder_seq_length=args.decoder_seq_length, - forward_only=True, - adjust_tensor_shapes_fn=None, - ) - if is_pp_last_stage(pp_group): - logprobs_list.append(output_tensor[0].detach()) - - if is_pp_last_stage(pp_group): - logprobs = torch.concat(logprobs_list, dim=0) - assert logprobs.dtype == dtype - else: - logprobs = torch.empty( - len(compute_trajs), - args.seq_length - 1, - dtype=dtype, - device=torch.cuda.current_device(), - ) - - # Only PP>1 needs a broadcast from the last stage; for PP=1 the output is already local. 
- if get_pg_size(pp_group) > 1: - dist.broadcast(logprobs, src=get_pp_last_rank(pp_group), group=pp_group) - return logprobs.cpu() - with torch.no_grad(), nvtx_range("compute_old_logprobs", time=True): - old_logprobs = _compute_logprobs_batch() + old_logprobs = _compute_logprobs_batch( + model=model, + data_loader=data_loader, + forward_backward_func=forward_backward_func, + packing_context=packing_context, + trajs_batch_size=len(compute_trajs), + seq_length=args.seq_length, + logprobs_batch_size=logprobs_batch_size, + decoder_seq_length=args.decoder_seq_length, + dtype=dtype, + pp_group=pp_group, + is_correction=args.rl_inference_logprobs_is_correction, + ) with torch.no_grad(), nvtx_range("compute_ref_logprobs", time=True): # We need to load the ref model state dict and compute the logprobs for the ref model @@ -1203,8 +1216,19 @@ def _compute_logprobs_batch(): k: (v.cpu() if v is not None else v) for k, v in model.state_dict().items() } model.load_state_dict(ref_state_dict) - - ref_logprobs = _compute_logprobs_batch() + ref_logprobs = _compute_logprobs_batch( + model=model, + data_loader=data_loader, + forward_backward_func=forward_backward_func, + packing_context=packing_context, + trajs_batch_size=len(compute_trajs), + seq_length=args.seq_length, + logprobs_batch_size=logprobs_batch_size, + decoder_seq_length=args.decoder_seq_length, + dtype=dtype, + pp_group=pp_group, + is_correction=args.rl_inference_logprobs_is_correction, + ) # logprobs are [b, seq, h] now. 
model.load_state_dict(cur_st_dict) @@ -1214,7 +1238,7 @@ def _compute_logprobs_batch(): torch.cuda.empty_cache() - if args.rl_use_sequence_packing: + if sequence_packing: with nvtx_range("pack_logprobs", time=True): # Store logprobs on gpu in packing context # Since PackingContext is a dataclass, we add these as new attributes @@ -1243,6 +1267,22 @@ def _compute_logprobs_batch(): packing_context.packed_inference_logprobs = packed_inference_logprobs.cuda() # Only mark as having inference logprobs for IS correction if enabled packing_context.has_inference_logprobs = args.rl_inference_logprobs_is_correction + with nvtx_range("create_dataloader"): + # @vitalyk: This function also reconfigures the data loader to count the + # global_batch_size in the bins frame of reference. + # I think it will be a better design if we split the data loader creating and logic + # that reconfigures the microbatch calculator. + + update_microbatch_calculator( + samples_ratio_per_step=samples_ratio_per_step, + num_bins_this_rank = len(packing_context.packed_trajs), + bin_seq_indices = packing_context.packing_info.bin_seq_indices, + global_batch_size=args.global_batch_size, + rampup_batch_size=args.rampup_batch_size, + micro_batch_size=args.micro_batch_size, + decrease_batch_size_if_needed=args.decrease_batch_size_if_needed, + ) + loader = get_microbatch_dataloader(len(packing_context.packed_trajs), args.micro_batch_size) else: with nvtx_range("align_inference_logprobs", time=True): if inference_logprobs is not None: @@ -1257,14 +1297,20 @@ def _compute_logprobs_batch(): # Nullify logprobs if not used in IS correction, if not args.rl_inference_logprobs_is_correction: inference_logprobs = None - - with nvtx_range("create_dataloader"): - if args.rl_use_sequence_packing: - loader, optimizer_steps = get_microbatch_dataloader(packing_context) - runtime_state.global_batches_per_collection = optimizer_steps - else: + with nvtx_range("create_dataloader"): + # Because of multiturn, our batch sizes 
for non-sequence packed trajectories are not fixed anymore. + # As in sequence packing above, we need to reconfigure it too. runtime_state.packing_context = None - runtime_state.global_batches_per_collection = global_rollout_count / args.global_batch_size + + reconfigure_num_microbatches_calculator( + rank=torch.distributed.get_rank() if torch.distributed.is_initialized() else 0, + global_batch_size=math.ceil(samples_ratio_per_step*total_turns_sampled), + rampup_batch_size=args.rampup_batch_size, + micro_batch_size=args.micro_batch_size, + decrease_batch_size_if_needed=args.decrease_batch_size_if_needed, + data_parallel_size=mpu.get_data_parallel_world_size(), + ) + dataset_tensors = [ compute_trajs, advantages, @@ -1273,20 +1319,20 @@ def _compute_logprobs_batch(): original_position_ids, ref_logprobs, ] - if args.rl_inference_logprobs_is_correction and inference_logprobs is not None: + if is_correction and inference_logprobs is not None: dataset_tensors.append(inference_logprobs) else: dataset_tensors.append(torch.zeros_like(old_logprobs)) - data = TensorDataset(*dataset_tensors) loader = DataLoader(data, batch_size=args.micro_batch_size) + with nvtx_range("log-wandb-tb"): maybe_log_training_metrics( group_stats=group_stats, current_iteration=args.curr_iteration, tokenizer=tokenizer, - example_group=rollouts[0], + example_group=example_group, wandb_writer=wandb_writer, tb_writer=tb_writer, ) @@ -1294,66 +1340,65 @@ def _compute_logprobs_batch(): return RerunDataIterator(itertools.cycle(loader)) -def get_rollout_data_iterator( - model: LanguageModule, - inference_model: LanguageModule | None, - optimizer: MegatronOptimizer, - iteration: int, - ref_state_dict: Dict[str, torch.Tensor], -) -> RerunDataIterator: - - args = get_args() - tokenizer = get_tokenizer() - - buffered_rollouts = get_environment_rollouts( - model, inference_model, optimizer, args.grpo_prompts_per_step, args.grpo_group_size - ) - buffered_rollouts = prepare_data_for_update(model, ref_state_dict, 
buffered_rollouts, tokenizer) - - return buffered_rollouts - - -def setup_grpo_data_iterator( +def get_grpo_data_iterator( model: LanguageModule, inference_model: LanguageModule | None, optimizer: MegatronOptimizer, iteration: int, ref_state_dict: Dict[str, torch.Tensor], + grpo_iterations: int, + grpo_prompts_per_step: int, + grpo_group_size: int, + global_batch_size: int, + sequence_packing: bool, + is_correction: bool, buffered_rollouts: RerunDataIterator | None = None, ) -> RerunDataIterator: """ - Set up the data iterator for GRPO training. + Get the data iterator for GRPO training. + + Depending on the sampling parameters either performs data collections or returns + the buffered_rollouts as is. Args: model: The language model optimizer: The Megatron optimizer iteration: Current training iteration ref_state_dict: Reference model state dict for GRPO + grpo_iterations: How many steps we reuse the sampled data for. + grpo_prompts_per_step: How many prompts we sample per data collection. + grpo_group_size: How many samples we do per prompt. + global_batch_size: Global batch size. + sequence_packing: Use sequence packing if True. + is_correction: Use IS correction if True. buffered_rollouts: Previously collected rollouts (if any) Returns: RerunDataIterator for the current training step """ - args = get_args() runtime_state = get_rl_runtime_state() - if inference_model is not None: - inference_pg_collection = unwrap_model(inference_model[0]).pg_collection - else: - inference_pg_collection = ProcessGroupCollection.use_mpu_process_groups() - # We collect new rollouts when we've gone over the collected data 'grpo_iterations' times. 
+ global_batches_per_collection = (grpo_prompts_per_step * grpo_group_size) // global_batch_size if ( buffered_rollouts is None or iteration == runtime_state.last_collection_iteration + - (args.grpo_iterations * runtime_state.global_batches_per_collection) + (grpo_iterations * global_batches_per_collection) ): - train_data_iterator = get_rollout_data_iterator(model,inference_model, optimizer, iteration, ref_state_dict) + + buffered_rollouts = get_environment_rollouts( + model, inference_model, optimizer, grpo_prompts_per_step, grpo_group_size + ) + buffered_rollouts = prepare_data_for_update(model=model, + ref_state_dict=ref_state_dict, + rollouts=buffered_rollouts, + tokenizer=get_tokenizer(), + sequence_packing=sequence_packing, + is_correction=is_correction, + ) runtime_state.reset_iteration_counters(iteration) - else: - train_data_iterator = buffered_rollouts - return train_data_iterator + return buffered_rollouts def evaluate_and_print_results_rl( @@ -1393,7 +1438,7 @@ def evaluate_and_print_results_rl( rank = torch.distributed.get_rank() if rank == 0: - logger.info(f"Collecting evaluation results...") + logger.info("Collecting evaluation results...") agent = get_agent(args) request = EvaluationRequest( inference_interface=inference_interface, @@ -1708,3 +1753,32 @@ def get_iteration_sequence_count(args): if torch.distributed.is_initialized(): torch.distributed.all_reduce(sequences_tensor, group=mpu.get_data_parallel_group()) return int(sequences_tensor.item()) + +def _pad_nonnull_with_zeros(data: list[Optional[torch.Tensor]], max_len: int) -> torch.Tensor: + """Pad each element of a list of tensors to the length required. + Args: + data: List of tensors to pad. + max_len: Maximum length to pad to. Must be higher or equal than the max len of the data tensors. + Returns: + A padded tensor which is a stacked list of padded input tensors. 
+ + """ + if all([el is None for el in data]): + raise ValueError("At least one element of the data list should be not None.") + padded_data = [] + for chunk in data: + if chunk is not None: + padding_size = max_len - len(chunk) + if padding_size > 0: + # Pad with zeros (these positions will be masked anyway) + padded = torch.nn.functional.pad(chunk, (0, padding_size), value=0.0) + padded_data.append(padded) + elif padding_size == 0: + padded_data.append(chunk) + else: + raise ValueError("One of the input tensors has larger length than padding max len.") + else: + # Create zero tensor for None logprobs + padded_data.append(torch.zeros(max_len)) + return torch.stack(padded_data) + diff --git a/megatron/rl/sequence_packing_utils.py b/megatron/rl/sequence_packing_utils.py index a5703a4580c..4d983764f77 100644 --- a/megatron/rl/sequence_packing_utils.py +++ b/megatron/rl/sequence_packing_utils.py @@ -10,7 +10,6 @@ from megatron.training.global_vars import get_args, get_tokenizer from megatron.training.utils import get_nvtx_range from megatron.core.packed_seq_params import PackedSeqParams -from megatron.core.num_microbatches_calculator import get_num_microbatches from megatron.core import mpu import logging import typing @@ -78,7 +77,6 @@ def load_packed_data_by_index(bin_idx: int, packing_context: PackingContext, log Args: bin_idx: Index of the bin to load. 
""" - args = get_args() # Get packing context (should always be available in packed mode) idx = slice(bin_idx, bin_idx + 1) @@ -156,9 +154,8 @@ def log_packing_efficiency(packing_context: PackingContext): packing_efficiency = my_tokens / total_capacity if total_capacity > 0 else 0 avg_seq_length = total_tokens / len(packing_info.seq_lengths) rank = mpu.get_data_parallel_rank() - data_parallel_world_size = mpu.get_data_parallel_world_size() - log_single_rank(logger, logging.INFO, f"[Sequence Packing] Statistics:") + log_single_rank(logger, logging.INFO, "[Sequence Packing] Statistics:") log_single_rank( logger, logging.INFO, @@ -269,7 +266,7 @@ def log_packing_efficiency(packing_context: PackingContext): log_single_rank( logger, logging.INFO, - f"[Sequence Packing] Round-robin distribution quality:", + "[Sequence Packing] Round-robin distribution quality:", ) log_single_rank( logger, @@ -398,7 +395,7 @@ def create_empty_bins( empty_packing_info_entries, ) -def get_default_packed_seq_params(seq_length: int, device: torch.device) -> PackedSeqParams: +def get_default_packed_seq_params(seq_length: int, max_sequences_per_bin: int, device: torch.device) -> PackedSeqParams: """Create a default PackedSeqParams that acts as no-op for a single sequence. This ensures CUDA graph signature consistency when packed_seq_params @@ -407,6 +404,7 @@ def get_default_packed_seq_params(seq_length: int, device: torch.device) -> Pack Args: seq_length: The sequence length + max_sequences_per_bin: Max sequences to pack in a bin. device: Device to create tensors on. Returns: @@ -416,7 +414,7 @@ def get_default_packed_seq_params(seq_length: int, device: torch.device) -> Pack args = get_args() # Pad to the maximum number of sequences in the bin for the attention kernel. 
- cu_seqlens = torch.full((args.rl_sequence_packing_max_sequences_per_bin,), seq_length, dtype=torch.int32, device=device) + cu_seqlens = torch.full((max_sequences_per_bin,), seq_length, dtype=torch.int32, device=device) cu_seqlens[0] = 0 return PackedSeqParams( @@ -774,7 +772,7 @@ def pack_sequences( seq_per_bin = [len(indices) for indices in packing_info.bin_seq_indices] log_single_rank( - logger, logging.DEBUG, (f"Initial packing output (before distribution):") + logger, logging.DEBUG, ("Initial packing output (before distribution):") ) log_single_rank( logger, @@ -969,33 +967,20 @@ def distribute_packed_bins( def pack_all_trajectories(trajs, generation_masks, inference_logprobs, global_advantages, bin_size, max_sequences_per_bin, packing_algo): tokenizer = get_tokenizer() data_parallel_world_size = mpu.get_data_parallel_world_size() + data_parallel_group = mpu.get_data_parallel_group() nvtx_range = get_nvtx_range() with nvtx_range("regather_trajectories", time=True): - # Regather trajectories from all ranks for packing - trajs = trajs.cuda() - trajs_list = [torch.empty_like(trajs) for _ in range(data_parallel_world_size)] - torch.distributed.all_gather( - trajs_list, trajs, group=mpu.get_data_parallel_group() - ) - trajs = torch.cat(trajs_list, dim=0) - - # Gather all generation masks - generation_masks = generation_masks.cuda() - masks_list = [torch.empty_like(generation_masks) for _ in range(data_parallel_world_size)] - torch.distributed.all_gather( - masks_list, generation_masks, group=mpu.get_data_parallel_group() - ) - generation_masks = torch.cat(masks_list, dim=0) - - # Gather inference logprobs if present + def _gather(data): + data = data.cuda() + data_list = [torch.empty_like(data) for _ in range(data_parallel_world_size)] + torch.distributed.all_gather(data_list, data, group=data_parallel_group) + return torch.cat(data_list, dim=0) + + trajs = _gather(trajs) + generation_masks = _gather(generation_masks) if inference_logprobs is not None: - 
inference_logprobs = inference_logprobs.cuda() - logprobs_list = [torch.empty_like(inference_logprobs) for _ in range(data_parallel_world_size)] - torch.distributed.all_gather( - logprobs_list, inference_logprobs, group=mpu.get_data_parallel_group() - ) - inference_logprobs = torch.cat(logprobs_list, dim=0) + inference_logprobs = _gather(inference_logprobs) with nvtx_range("pack_sequences", time=True): # Create packer with max sequences per bin limit to prevent extreme imbalance @@ -1073,53 +1058,63 @@ def pack_all_trajectories(trajs, generation_masks, inference_logprobs, global_ad return packing_context +def update_microbatch_calculator( + samples_ratio_per_step: float, + num_bins_this_rank: int, + bin_seq_indices: List[List[int]], + global_batch_size: int, + rampup_batch_size: int, + micro_batch_size: int, + decrease_batch_size_if_needed: bool, +): + """Return a data loader with seqpacked indices with microbatches in bins frame of reference. + Args: + samples_ratio_per_step: Fraction of sampled trajectories to use per iteration. + num_bins_this_rank: Amount of packing bins that belongs to current rank. + bin_seq_indices: Global seq indices in the bin, see PackingInfo. + global_batch_size: Current global batch size. + rampup_batch_size: Rampup batch size. See num_microbatches_calculator.py for more. + micro_batch_size: Micro batch size at init. + decrease_batch_size_if_needed: Scale down batch size. See num_microbatches_calculator.py for more. + + As a side effect, we calculate the global batch size in the bins frame of reference. + In sequence packing, our batch dimension shrinks as we move some trajs onto free + space in sequence dimension. The resulting batch size is what we return here. 
+ """ -def get_microbatch_dataloader(packing_context: PackingContext) -> Tuple[DataLoader, int]: - args = get_args() - num_bins_this_rank = len(packing_context.packed_trajs) dp_world_size = mpu.get_data_parallel_world_size() - # Ratio of collected sequences to the global batch size - pct_of_sequences_per_batch = len(packing_context.packing_info.seq_lengths) / args.global_batch_size - # Ceiling division means we will reuse some bins # If we did floor we would leave some behind - local_bins_per_step = math.ceil(pct_of_sequences_per_batch * num_bins_this_rank) - effective_global_batch_size = local_bins_per_step * dp_world_size + local_bins_per_step = math.ceil(samples_ratio_per_step * num_bins_this_rank) - # Store packing plan in runtime state for the training loop to use - optimizer_steps = -(-num_bins_this_rank // local_bins_per_step) + bins_bs = local_bins_per_step * dp_world_size old_num_microbatches = get_num_microbatches() - reconfigure_num_microbatches_calculator( rank=torch.distributed.get_rank() if torch.distributed.is_initialized() else 0, - rampup_batch_size=args.rampup_batch_size, - global_batch_size=effective_global_batch_size, - micro_batch_size=args.micro_batch_size, + rampup_batch_size=rampup_batch_size, + global_batch_size=bins_bs, + micro_batch_size=micro_batch_size, data_parallel_size=dp_world_size, - decrease_batch_size_if_needed=args.decrease_batch_size_if_needed, + decrease_batch_size_if_needed=decrease_batch_size_if_needed, ) - new_num_microbatches = get_num_microbatches() log_single_rank( - logger, logging.INFO, f"[Sequence Packing] Multi-step training plan:" - ) - log_single_rank( - logger, - logging.INFO, - f"[Sequence Packing] - Target sequences per step: {args.global_batch_size}", + logger, logging.INFO, "[Sequence Packing] Multi-step training plan:" ) + log_single_rank( logger, logging.INFO, - f"[Sequence Packing] - Bins per rank per step: {pct_of_sequences_per_batch}*{num_bins_this_rank}={local_bins_per_step}", + f"[Sequence Packing] - 
Bins per rank per step: {samples_ratio_per_step}*{num_bins_this_rank}={local_bins_per_step}", ) + log_single_rank( logger, logging.INFO, - f"[Sequence Packing] - Total optimizer steps: {optimizer_steps}", + f"[Sequence Packing] - Target sequences per step: {global_batch_size}", ) log_single_rank( logger, @@ -1127,8 +1122,10 @@ def get_microbatch_dataloader(packing_context: PackingContext) -> Tuple[DataLoad f"[Sequence Packing] - Microbatches per step: {new_num_microbatches} (was {old_num_microbatches})", ) - bin_seq_indices = packing_context.packing_info.bin_seq_indices - for step in range(min(3, optimizer_steps)): + # Opt steps only depends on how much we sample and how much we consume. + # We make sure this is an integer division, check validate_args in arguments.py for details. + opt_steps = int(1 / samples_ratio_per_step) + for step in range(min(3, opt_steps)): start_idx = step * local_bins_per_step end_idx = min(start_idx + local_bins_per_step, num_bins_this_rank) step_bins = end_idx - start_idx @@ -1145,22 +1142,13 @@ def get_microbatch_dataloader(packing_context: PackingContext) -> Tuple[DataLoad f"[Sequence Packing] - Step {step + 1}: {step_bins} bins, ~{est_global_seqs} sequences globally", ) - if optimizer_steps > 3: - log_single_rank(logger, logging.INFO, f" - ... ({optimizer_steps - 3} more steps)") + if opt_steps > 3: + log_single_rank(logger, logging.INFO, f" - ... 
({opt_steps - 3} more steps)") +def get_microbatch_dataloader(num_bins_this_rank, micro_batch_size): bin_indices = torch.arange(num_bins_this_rank) dataset = TensorDataset(bin_indices) - loader = DataLoader(dataset, batch_size=args.micro_batch_size, shuffle=False, collate_fn=lambda x: x[0], drop_last=True) - return loader, optimizer_steps - -def update_sequence_packing_metrics(args): - """Update bin tracking for sequence packing mode.""" - if args.rl_use_sequence_packing: - bin_count = ( - mpu.get_data_parallel_world_size() * args.micro_batch_size * get_num_microbatches() - ) - args.consumed_train_bins += bin_count - + return DataLoader(dataset, batch_size=micro_batch_size, shuffle=False, collate_fn=lambda x: x[0]) def get_sequence_packing_log_info(args): """Get logging information for sequence packing mode.""" diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 5246f44d206..05b2d702aa0 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -1920,8 +1920,6 @@ def _add_rl_args(parser): help='If set, use inference logprobs in importance sampling correction of the loss.') group.add_argument('--rl-importance-sampling-truncation-coef', type=float, default=None, help="If --inference-logprobs-is-correction is on and this coefficient is set, apply truncation for the IS correction at GRPO loss.") - group.add_argument('--rl-calculate-intra-group-similarity', action=argparse.BooleanOptionalAction, default=False, - help='If set, calculate the intra-group similarity of rollouts.') group.add_argument('--rl-use-sequence-packing', action=argparse.BooleanOptionalAction, type=bool, default=False, help='Enable sequence packing') group.add_argument('--rl-sequence-packing-max-sequences-per-bin', type=int, default=50, diff --git a/megatron/training/training.py b/megatron/training/training.py index 500d30b9e73..87d9fe8b841 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -2766,8 +2766,15 @@ def 
get_e2e_base_metrics(): if getattr(args, 'perform_rl_step', False): with torch.no_grad(): - train_data_iterator = rl_utils.setup_grpo_data_iterator( - model, inference_model, optimizer, iteration, ref_state_dict, buffered_rollouts + train_data_iterator = rl_utils.get_grpo_data_iterator( + model, inference_model, optimizer, iteration, ref_state_dict, + grpo_iterations=args.grpo_iterations, + grpo_prompts_per_step=args.grpo_prompts_per_step, + grpo_group_size=args.grpo_group_size, + global_batch_size=args.global_batch_size, + sequence_packing=args.rl_use_sequence_packing, + buffered_rollouts=buffered_rollouts, + is_correction=args.rl_inference_logprobs_is_correction, ) # Buffered rollouts are used as a state container for setups when # we use previously-generated data for an update. @@ -2846,7 +2853,10 @@ def get_e2e_base_metrics(): if getattr(args, 'perform_rl_step', False) and args.rl_use_sequence_packing: iteration_sequences = rl_utils.get_iteration_sequence_count(args) # Track bins separately for packed mode - rl_utils.update_sequence_packing_metrics(args) + bin_count = ( + mpu.get_data_parallel_world_size() * args.micro_batch_size * get_num_microbatches() + ) + args.consumed_train_bins += bin_count else: batch_size = ( mpu.get_data_parallel_world_size() * args.micro_batch_size * get_num_microbatches() diff --git a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput/golden_values_dev_dgx_h100.json index 4a8586c8e8a..42c13292446 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput/golden_values_dev_dgx_h100.json @@ -4,26 +4,26 @@ "end_step": 20, "step_interval": 1, "values": { - "1": 60978671616.0, - "2": 60979740672.0, - "3": 60979740672.0, - "4": 60979740672.0, - "5": 60979998720.0, - "6": 
60979732480.0, - "7": 60979994624.0, - "8": 60979728384.0, - "9": 60979986432.0, - "10": 60980248576.0, - "11": 60979982336.0, - "12": 60979720192.0, - "13": 60979716096.0, - "14": 60979716096.0, - "15": 60979716096.0, - "16": 60979716096.0, - "17": 60979712000.0, - "18": 60979970048.0, - "19": 60979716096.0, - "20": 60979716096.0 + "1": 60922068992.0, + "2": 60922068992.0, + "3": 60922331136.0, + "4": 60922073088.0, + "5": 60922331136.0, + "6": 60922064896.0, + "7": 60922331136.0, + "8": 60922064896.0, + "9": 60922322944.0, + "10": 60922052608.0, + "11": 60922056704.0, + "12": 60922318848.0, + "13": 60922056704.0, + "14": 60922318848.0, + "15": 60922056704.0, + "16": 60922310656.0, + "17": 60922052608.0, + "18": 60922052608.0, + "19": 60922048512.0, + "20": 60922044416.0 } }, "mem-max-allocated-bytes": { @@ -31,26 +31,26 @@ "end_step": 20, "step_interval": 1, "values": { - "1": 60978675712.0, - "2": 64214241280.0, - "3": 64214241280.0, - "4": 64214241280.0, - "5": 64214241280.0, - "6": 64214241280.0, - "7": 64214241280.0, - "8": 64214241280.0, - "9": 64214241280.0, - "10": 64214241280.0, - "11": 64214241280.0, - "12": 64214241280.0, - "13": 64214241280.0, - "14": 64214241280.0, - "15": 64214241280.0, - "16": 64214241280.0, - "17": 64214241280.0, - "18": 64214241280.0, - "19": 64214241280.0, - "20": 64214241280.0 + "1": 60922073088.0, + "2": 64156037120.0, + "3": 64156037120.0, + "4": 64156041216.0, + "5": 64156041216.0, + "6": 64156041216.0, + "7": 64156041216.0, + "8": 64156041216.0, + "9": 64156041216.0, + "10": 64156041216.0, + "11": 64156041216.0, + "12": 64156041216.0, + "13": 64156041216.0, + "14": 64156041216.0, + "15": 64156041216.0, + "16": 64156041216.0, + "17": 64156041216.0, + "18": 64156041216.0, + "19": 64156041216.0, + "20": 64156041216.0 } }, "iteration-time": { @@ -59,25 +59,25 @@ "step_interval": 1, "values": { "1": "nan", - "2": 37.77975, - "3": 15.85042, - "4": 14.84801, - "5": 14.16031, - "6": 14.7285, - "7": 14.32408, - "8": 14.76569, - "9": 
13.73696, - "10": 14.6546, - "11": 14.12618, - "12": 14.29456, - "13": 14.27773, - "14": 14.10944, - "15": 13.7968, - "16": 13.90572, - "17": 13.58351, - "18": 14.3947, - "19": 13.78201, - "20": 13.44734 + "2": 60.37194, + "3": 13.25967, + "4": 13.01461, + "5": 14.04256, + "6": 13.53259, + "7": 13.3335, + "8": 12.72344, + "9": 13.64787, + "10": 12.66485, + "11": 13.15779, + "12": 13.01275, + "13": 12.72481, + "14": 12.67697, + "15": 12.7286, + "16": 12.65032, + "17": 12.86279, + "18": 12.71745, + "19": 13.4137, + "20": 12.75566 } } -} \ No newline at end of file +} diff --git a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput/model_config.yaml index 7f6fe4756e3..3037e2e0803 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput/model_config.yaml @@ -75,7 +75,7 @@ MODEL_ARGS: --timing-log-level: 1 --cuda-graph-impl: local --micro-batch-size: 1 - --global-batch-size: 2 + --global-batch-size: 4 --grpo-group-size: 2 --grpo-prompts-per-step: 2 --grpo-iterations: 1 @@ -100,4 +100,4 @@ METRICS: - "mem-max-allocated-bytes" - "iteration-time" THROUGHPUT_TEST_PARAMS: - --start_step: 10 \ No newline at end of file + --start_step: 10 diff --git a/tests/unit_tests/rl/test_rl_utils.py b/tests/unit_tests/rl/test_rl_utils.py index 8747f2e8c35..cff62d40f0e 100644 --- a/tests/unit_tests/rl/test_rl_utils.py +++ b/tests/unit_tests/rl/test_rl_utils.py @@ -79,6 +79,8 @@ def initialize_model_parallel(request, monkeypatch): Skips if world_size < tp * pp. 
""" monkeypatch.setenv("CUDA_DEVICE_MAX_CONNECTIONS", "1") + monkeypatch.setenv("WANDB_MODE", "disabled") + monkeypatch.setenv("LOG_TO_WANDB", "false") tp, pp = request.param world_size = Utils.world_size @@ -112,6 +114,7 @@ def create_test_args(self, **kwargs): args.hidden_size = 128 args.max_position_embeddings = 256 args.seq_length = 256 + args.wandb_project = None args.micro_batch_size = 1 @@ -274,58 +277,68 @@ def test_grpo_loss_truncation(self): def test_prepare_data_for_update(self, initialize_model_parallel): """Test that getting logprobs at least does not crash.""" world_size, dp, tp, pp = initialize_model_parallel + # Here I assume that we will be consuming all data in one step. + group_size = 2 self.create_test_args( micro_batch_size=2, seq_length=4, curr_iteration=1, tensor_model_parallel_size=tp, pipeline_model_parallel_size=pp, + global_batch_size=dp * 2, + grpo_prompts_per_step=dp, + grpo_group_size=group_size, ) model = MockModel() tokenizer = MockTokenizer() r1 = TokenRollout( - trajectory=[1, 2, 3], + trajectory=[[1, 2, 3]], reward=3.14, - generation_mask=[False, True, True], - logprobs=[0.1, 0.2, 0.3], + generation_mask=[[False, True, True]], + logprobs=[[0.1, 0.2, 0.3]], env_id='MEGAENV', problem_id="2", ) r2 = TokenRollout( - trajectory=[1, 2, 3, 4], + trajectory=[[1, 2, 3, 4]], reward=0.14, - generation_mask=[False, True, True, True], - logprobs=[0.1, 0.2, 0.3, -1.2], + generation_mask=[[False, True, True, True]], + logprobs=[[0.1, 0.2, 0.3, -1.2]], env_id='MEGAENV', problem_id="2", ) + rollouts = [[r1, r2] for _ in range(dp)] try: - rl_utils.prepare_data_for_update([model], {}, rollouts, tokenizer) + rl_utils.prepare_data_for_update( + [model], {}, rollouts, tokenizer, sequence_packing=False, is_correction=False + ) except AssertionError as e: # We expect trajectories to come padded there. 
assert str(e).startswith('Rollout is not the correct length') r1 = TokenRollout( - trajectory=torch.tensor([1, 2, 3, tokenizer.eod], dtype=torch.float).cuda(), + trajectory=torch.tensor([[1, 2, 3, tokenizer.eod]], dtype=torch.float).cuda(), reward=3.14, - generation_mask=torch.tensor([False, True, True, True], dtype=torch.float).cuda(), - logprobs=torch.tensor([-0.2, -0.3, -3.2]).cuda(), + generation_mask=torch.tensor([[False, True, True, True]], dtype=torch.float).cuda(), + logprobs=torch.tensor([[-0.2, -0.3, -3.2]]).cuda(), env_id='MEGAENV', problem_id="2", ) r2 = TokenRollout( - trajectory=torch.tensor([1, 2, 234, tokenizer.eod], dtype=torch.float).cuda(), + trajectory=torch.tensor([[1, 2, 234, tokenizer.eod]], dtype=torch.float).cuda(), reward=0.14, - generation_mask=torch.tensor([False, True, True, True], dtype=torch.float).cuda(), - logprobs=torch.tensor([-0.2, -0.3, -1.2]), + generation_mask=torch.tensor([[False, True, True, True]], dtype=torch.float).cuda(), + logprobs=torch.tensor([[-0.2, -0.3, -1.2]]), env_id='MEGAENV', problem_id="2", ) rollouts = [[r1, r2] for _ in range(dp)] - data_iter = rl_utils.prepare_data_for_update([model], {}, rollouts, tokenizer) + data_iter = rl_utils.prepare_data_for_update( + [model], {}, rollouts, tokenizer, sequence_packing=False, is_correction=False + ) _, _, old_logprobs, _, _, _, _ = next(data_iter) # All logits are ones in the MockModel. 
@@ -333,7 +346,8 @@ def test_prepare_data_for_update(self, initialize_model_parallel): torch.testing.assert_close(old_logprobs.exp(), torch.ones_like(old_logprobs) / VOCAB) @pytest.mark.parametrize("use_sequence_packing", [True, False]) - def test_prepare_trajectories(self, use_sequence_packing): + @pytest.mark.parametrize("num_turns", [1, 2]) + def test_prepare_trajectories(self, use_sequence_packing, num_turns): """Test that rollouts are properly prepared for training.""" seq_length = 8 self.create_test_args( @@ -347,34 +361,38 @@ def test_prepare_trajectories(self, use_sequence_packing): # Create rollouts of varying lengths r1 = TokenRollout( - trajectory=[1, 2, 3, tokenizer.eod], + trajectory=[[1, 2, 3, tokenizer.eod]] * num_turns, reward=3.14, - generation_mask=[False, True, True, True], - logprobs=[0.1, 0.2, 0.3, 0.35], + generation_mask=[[False, True, True, True]] * num_turns, + logprobs=[[0.1, 0.2, 0.3, 0.35]] * num_turns, env_id='MEGAENV', problem_id="1", ) r2 = TokenRollout( - trajectory=[4, 5, 6, 7, tokenizer.eod], + trajectory=[[4, 5, 6, 7, tokenizer.eod]] * num_turns, reward=0.14, - generation_mask=[False, True, True, True, True], - logprobs=[0.4, 0.5, 0.6, 0.7, 0.75], + generation_mask=[[False, True, True, True, True]] * num_turns, + logprobs=[[0.4, 0.5, 0.6, 0.7, 0.75]] * num_turns, env_id='MEGAENV', problem_id="2", ) r3 = TokenRollout( - trajectory=[8, 9, tokenizer.eod], + trajectory=[[8, 9, tokenizer.eod]] * num_turns, reward=2.71, - generation_mask=[False, True, True], - logprobs=[0.8, 0.9, 0.95], + generation_mask=[[False, True, True]] * num_turns, + logprobs=[[0.8, 0.9, 0.95]] * num_turns, env_id='MEGAENV', problem_id="3", ) - rollouts = [[r1, r2, r3]] + rollouts = [r1, r2, r3] trajs, genmask, inference_logprobs = rl_utils.prepare_trajectories( - rollouts, tokenizer, seq_length + rollouts, + tokenizer, + seq_length, + sequence_packing=use_sequence_packing, + skip_bos_token=False, ) expected_trajs = torch.tensor( @@ -385,7 +403,7 @@ def 
test_prepare_trajectories(self, use_sequence_packing): ], dtype=torch.long, device=trajs.device, - ) + ).repeat_interleave(num_turns, dim=0) assert torch.equal(trajs, expected_trajs) expected_genmask = torch.tensor( @@ -396,7 +414,7 @@ def test_prepare_trajectories(self, use_sequence_packing): ], dtype=torch.bool, device=genmask.device, - ) + ).repeat_interleave(num_turns, dim=0) assert torch.equal(genmask, expected_genmask) if use_sequence_packing: @@ -408,7 +426,7 @@ def test_prepare_trajectories(self, use_sequence_packing): ], dtype=torch.float32, device=inference_logprobs.device, - ) + ).repeat_interleave(num_turns, dim=0) torch.testing.assert_close(inference_logprobs, expected_logprobs, rtol=0, atol=0) else: expected_logprobs = [ @@ -416,12 +434,57 @@ def test_prepare_trajectories(self, use_sequence_packing): [0.4, 0.5, 0.6, 0.7, 0.75], [0.8, 0.9, 0.95], ] + expected_logprobs = [el for el in expected_logprobs for _ in range(num_turns)] assert len(inference_logprobs) == len(expected_logprobs) for got, exp in zip(inference_logprobs, expected_logprobs): got_t = got if torch.is_tensor(got) else torch.tensor(got, dtype=torch.float32) exp_t = torch.tensor(exp, dtype=torch.float32, device=got_t.device) torch.testing.assert_close(got_t, exp_t, rtol=0, atol=0) + def test_single_turn_advantage_calculation(self): + rewards = [[-1, 1], [4, 4]] + num_turns = [[1, 1], [1, 1]] + advs = rl_utils.calculate_grpo_advantages(rewards, num_turns) + torch.testing.assert_close( + torch.tensor(advs), torch.tensor([-1, 1.0, 0.0, 0.0]), atol=1e-4, rtol=1e-5 + ) + + def test_multi_turn_advantage_calculation(self): + rewards = [[-1, 1], [4, 4]] + num_turns = [[2, 1], [1, 3]] + advs = rl_utils.calculate_grpo_advantages(rewards, num_turns) + torch.testing.assert_close( + torch.tensor(advs), + torch.tensor([-1, -1, 1.0, 0.0, 0.0, 0.0, 0.0]), + atol=1e-4, + rtol=1e-5, + ) + + def test_pad_list_of_nones(self): + with pytest.raises(ValueError) as e_info: + 
rl_utils._pad_nonnull_with_zeros([None] * 3, 42) + assert "At least one" in str(e_info) + + def test_pad_with_wrong_params(self): + with pytest.raises(ValueError) as e_info: + rl_utils._pad_nonnull_with_zeros([torch.zeros(5)], 4) + assert "larger length" in str(e_info) + + def test_pad_full_size(self): + padded = rl_utils._pad_nonnull_with_zeros([torch.zeros(5), torch.zeros(5)], 5) + assert padded.shape == (2, 5) + + def test_pad_some_nones(self): + padded = rl_utils._pad_nonnull_with_zeros([None, torch.zeros(5)], 5) + assert padded.shape == (2, 5) + assert (padded[0] == 0).all() + + def test_pad_normal(self): + padded = rl_utils._pad_nonnull_with_zeros( + [torch.zeros(2), torch.zeros(3), torch.zeros(4)], 5 + ) + assert padded.shape == (3, 5) + @pytest.mark.parametrize( "initialize_model_parallel", [ diff --git a/tests/unit_tests/rl/test_sequence_packing_utils.py b/tests/unit_tests/rl/test_sequence_packing_utils.py index 548aedf55fd..44a3de762f0 100644 --- a/tests/unit_tests/rl/test_sequence_packing_utils.py +++ b/tests/unit_tests/rl/test_sequence_packing_utils.py @@ -1,5 +1,8 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +from unittest.mock import patch + +import pytest import torch from megatron.rl import rl_utils, sequence_packing_utils @@ -407,3 +410,56 @@ def test_compute_packed_inference_logprobs_stats_shape_mismatch(): # Stats should remain None due to shape mismatch assert group_stats.mean_piold_to_inf_prob is None + + +@pytest.mark.parametrize( + "ratio,local_bins,world,expected_bs", + [ + (1.0, 1, 8, 8), # no stale data (ratio 1.), everything divides perfectly. 
+ (1.0, 42, 8, 42 * 8), # no stale data (ratio 1.), everything divides perfectly, more bins + ( + 0.5, + 1, + 8, + 8, + ), # 0.5 means we use half of all seqs per step, they all fit 1 bin -> we should reuse + (1 / 3, 4, 8, 16), # third of the data per step, nonint division + ], +) +def test_get_bins_bs_and_steps(ratio, local_bins, world, expected_bs): + # Make a dummy struct to check only the required fields. + # Divide by ratio to make sure the samples are divisible by global_bs in the test. + n_seqs = int(world * 7 / ratio) + global_bs_in_seq = int(n_seqs * ratio) + + def side_eff( + rank, + rampup_batch_size, + global_batch_size, + micro_batch_size, + data_parallel_size, + decrease_batch_size_if_needed, + ): + # Inside of the get_microbatch_dataloader, we compute the batch size in bins. + # We want to test this variable. + global actual_bs + actual_bs = global_batch_size + + with patch('megatron.rl.sequence_packing_utils.get_num_microbatches', return_value=1): + with patch( + 'megatron.rl.sequence_packing_utils.reconfigure_num_microbatches_calculator', + side_effect=side_eff, + ): + with patch('megatron.core.mpu.get_data_parallel_world_size', return_value=world): + sequence_packing_utils.update_microbatch_calculator( + samples_ratio_per_step=ratio, + num_bins_this_rank=local_bins, + bin_seq_indices=[], + global_batch_size=global_bs_in_seq, + rampup_batch_size=1, + micro_batch_size=1, + decrease_batch_size_if_needed=False, + ) + + # Iterator is local, batch size is global + assert expected_bs == actual_bs diff --git a/train_rl.py b/train_rl.py index 299843bcff3..cfc010b3c04 100644 --- a/train_rl.py +++ b/train_rl.py @@ -260,6 +260,7 @@ def forward_step(data_iterator, model: GPTModel, loss_only: bool = False): if packed_seq_params is None: packed_seq_params = get_default_packed_seq_params( seq_length=tokens.shape[1], + max_sequences_per_bin=args.rl_sequence_packing_max_sequences_per_bin, device=tokens.device, ) From b168849a8c2f3eb0efea9255fbe04db8858826fc Mon Sep 
17 00:00:00 2001 From: Charlie Truong Date: Fri, 30 Jan 2026 10:13:56 -0600 Subject: [PATCH 006/231] ci: Disable the api check for now (#3157) Signed-off-by: Charlie Truong --- .github/CODEOWNERS | 10 +- ...k_api_backwards_compatibility_workflow.yml | 526 +++++++++--------- 2 files changed, 269 insertions(+), 267 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index e319d63f00c..7496893749c 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -47,11 +47,11 @@ tests/test_utils/recipes/ @NVIDIA/ci tests/unit_tests/run_ci_test.sh @NVIDIA/ci # API Backwards Compatibility Check -scripts/check_api_backwards_compatibility.py @NVIDIA/ci @pablo-garay -scripts/README_API_COMPAT.md @NVIDIA/ci @pablo-garay -.github/workflows/check_api_backwards_compatibility_workflow.yml @NVIDIA/ci @pablo-garay -docs/api-backwards-compatibility-check.md @NVIDIA/ci @pablo-garay -tests/unit_tests/test_api_backwards_compat_setup.py @NVIDIA/ci @pablo-garay +scripts/check_api_backwards_compatibility.py @NVIDIA/ci +scripts/README_API_COMPAT.md @NVIDIA/ci +.github/workflows/check_api_backwards_compatibility_workflow.yml @NVIDIA/ci +docs/api-backwards-compatibility-check.md @NVIDIA/ci +tests/unit_tests/test_api_backwards_compat_setup.py @NVIDIA/ci megatron/rl/ @NVIDIA/reinforcement-learning examples/rl/ @NVIDIA/reinforcement-learning diff --git a/.github/workflows/check_api_backwards_compatibility_workflow.yml b/.github/workflows/check_api_backwards_compatibility_workflow.yml index f4fcd4c3713..44340bdedc5 100644 --- a/.github/workflows/check_api_backwards_compatibility_workflow.yml +++ b/.github/workflows/check_api_backwards_compatibility_workflow.yml @@ -1,274 +1,276 @@ -name: API Compatibility Check +# Temporarily disable this check until we can enforce it on PRs +# +# name: API Compatibility Check -on: - push: - branches: - - dev - - main - - 'pull-request/[0-9]+' - - 'deploy-release/*' - merge_group: - types: [checks_requested] +# on: +# push: +# branches: +# - dev +# - 
main +# - 'pull-request/[0-9]+' +# - 'deploy-release/*' +# merge_group: +# types: [checks_requested] - # Allow manual trigger - workflow_dispatch: - inputs: - baseline: - description: 'Baseline git reference (tag/branch/commit)' - required: true +# # Allow manual trigger +# workflow_dispatch: +# inputs: +# baseline: +# description: 'Baseline git reference (tag/branch/commit)' +# required: true -jobs: - pre-flight: - name: Pre-flight check - runs-on: ubuntu-latest - outputs: - should_skip: ${{ steps.check_files.outputs.should_skip }} - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Check if relevant files changed - id: check_files - run: | - # For manual triggers, never skip - if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then - echo "should_skip=false" >> $GITHUB_OUTPUT - echo "Manual trigger - will run compatibility check" - exit 0 - fi +# jobs: +# pre-flight: +# name: Pre-flight check +# runs-on: ubuntu-latest +# outputs: +# should_skip: ${{ steps.check_files.outputs.should_skip }} +# steps: +# - name: Checkout code +# uses: actions/checkout@v4 +# with: +# fetch-depth: 0 - # Determine base SHA based on event type - if [ "${{ github.event_name }}" == "merge_group" ]; then - BASE_SHA="${{ github.event.merge_group.base_sha }}" - echo "Merge group event - comparing against base: $BASE_SHA" - else - # For push events, use merge-base to find common ancestor - # This ensures we only detect changes actually made in this PR branch, - # not changes that happened in main after the branch was created - BASE_SHA=$(git merge-base origin/main HEAD 2>/dev/null || echo "") - if [ -z "$BASE_SHA" ]; then - # Fallback for pull-request/* branches targeting dev - BASE_SHA=$(git merge-base origin/dev HEAD 2>/dev/null || echo "") - fi - echo "Push event - comparing against merge-base: $BASE_SHA" - fi +# - name: Check if relevant files changed +# id: check_files +# run: | +# # For manual triggers, never skip +# if [ "${{ 
github.event_name }}" == "workflow_dispatch" ]; then +# echo "should_skip=false" >> $GITHUB_OUTPUT +# echo "Manual trigger - will run compatibility check" +# exit 0 +# fi - if [ -z "$BASE_SHA" ]; then - echo "Could not determine base SHA - will run compatibility check" - echo "should_skip=false" >> $GITHUB_OUTPUT - exit 0 - fi +# # Determine base SHA based on event type +# if [ "${{ github.event_name }}" == "merge_group" ]; then +# BASE_SHA="${{ github.event.merge_group.base_sha }}" +# echo "Merge group event - comparing against base: $BASE_SHA" +# else +# # For push events, use merge-base to find common ancestor +# # This ensures we only detect changes actually made in this PR branch, +# # not changes that happened in main after the branch was created +# BASE_SHA=$(git merge-base origin/main HEAD 2>/dev/null || echo "") +# if [ -z "$BASE_SHA" ]; then +# # Fallback for pull-request/* branches targeting dev +# BASE_SHA=$(git merge-base origin/dev HEAD 2>/dev/null || echo "") +# fi +# echo "Push event - comparing against merge-base: $BASE_SHA" +# fi - # Check for changes in megatron/core Python files (excluding tests and legacy) - # Note: Using both *.py and **/*.py to match files at root and in subdirectories - CHANGED_FILES=$(git diff --name-only "$BASE_SHA" HEAD -- \ - 'megatron/core/*.py' \ - 'megatron/core/**/*.py' \ - ':!megatron/core/tests/**' \ - ':!megatron/legacy/**' 2>/dev/null || echo "") +# if [ -z "$BASE_SHA" ]; then +# echo "Could not determine base SHA - will run compatibility check" +# echo "should_skip=false" >> $GITHUB_OUTPUT +# exit 0 +# fi - if [ -z "$CHANGED_FILES" ]; then - echo "should_skip=true" >> $GITHUB_OUTPUT - echo "No relevant megatron/core files changed - will skip compatibility check" - else - echo "should_skip=false" >> $GITHUB_OUTPUT - echo "Relevant files changed:" - echo "$CHANGED_FILES" - fi +# # Check for changes in megatron/core Python files (excluding tests and legacy) +# # Note: Using both *.py and **/*.py to match files at 
root and in subdirectories +# CHANGED_FILES=$(git diff --name-only "$BASE_SHA" HEAD -- \ +# 'megatron/core/*.py' \ +# 'megatron/core/**/*.py' \ +# ':!megatron/core/tests/**' \ +# ':!megatron/legacy/**' 2>/dev/null || echo "") - check-compatibility: - needs: [pre-flight] - if: needs.pre-flight.outputs.should_skip != 'true' - name: "OPTIONAL: Check API Backward Compatibility" - runs-on: ubuntu-latest - - # ============================================================================ - # Configuration Parameters (modify here) - # ============================================================================ - env: - # Default baseline for automatic PR checks - # Can be: branch name (e.g., 'main'), commit hash, or tag - # Will be resolved to commit hash during execution - DEFAULT_BASELINE: '5ab481cb45efc72add12f8ba0378e849b3d2bc50' - # Tag pattern for auto-detection (e.g., 'core_r*', 'core_v*') - TAG_PATTERN: 'core_v*' - # Tag regex filter (e.g., '^core_v[0-9]+\.[0-9]+\.[0-9]+$' for stable versions only) - TAG_REGEX_FILTER: '^core_v[0-9]+\.[0-9]+\.[0-9]+$' - # ============================================================================ - - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 0 # Need full history to access baseline ref - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.12' - - - name: Install griffe - run: | - python -m pip install --upgrade pip - python -m pip install griffe - python -c "import griffe; print('Griffe installed successfully')" - python -c "from griffe import Object; print('Object import successful')" || echo "Object import from griffe failed" - python -c "from griffe.dataclasses import Object; print('Object import from dataclasses successful')" || echo "Object import from dataclasses failed" - - - name: Determine baseline reference - id: baseline - run: | - if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then - # Use manually specified baseline (branch, tag, or 
commit hash) - BASELINE_REF="${{ github.event.inputs.baseline }}" - else - # Use the configured default baseline - BASELINE_REF="${{ env.DEFAULT_BASELINE }}" - - # Uncomment below to auto-detect from tags instead: - # BASELINE_REF=$(git tag -l '${{ env.TAG_PATTERN }}' | grep -E '${{ env.TAG_REGEX_FILTER }}' | sort -V | tail -1) - # if [ -z "$BASELINE_REF" ]; then - # echo "Warning: No tags matching pattern found. Using default: ${{ env.DEFAULT_BASELINE }}" >&2 - # BASELINE_REF="${{ env.DEFAULT_BASELINE }}" - # fi - fi - - # Resolve baseline to commit hash (works for branches, tags, or commit hashes) - BASELINE_HASH=$(git rev-parse "$BASELINE_REF") - - echo "baseline=$BASELINE_HASH" >> $GITHUB_OUTPUT - echo "Using baseline: $BASELINE_REF (resolved to commit: $BASELINE_HASH)" - - - name: Run compatibility check - id: compat_check - run: | - # Save output to file for later display - python scripts/check_api_backwards_compatibility.py \ - --baseline ${{ steps.baseline.outputs.baseline }} \ - --verbose 2>&1 | tee compat_check_output.txt - - # Capture exit code - EXIT_CODE=${PIPESTATUS[0]} - echo "exit_code=$EXIT_CODE" >> $GITHUB_OUTPUT - exit $EXIT_CODE - continue-on-error: true - - - name: Fail job if breaking changes detected - if: steps.compat_check.outcome == 'failure' - run: | - echo "" - echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" - echo "🔍 WHAT IS THIS CHECK?" - echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" - echo "" - echo "This check ensures that changes to Megatron Core's public API do not" - echo "break backward compatibility for users. 
It compares your PR against" - echo "the latest stable release to detect breaking changes in:" - echo "" - echo " • Function signatures (parameters, order, types)" - echo " • Class structures and methods" - echo " • Return types and public interfaces" - echo "" - echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" - echo "🛠️ HOW TO FIX THIS" - echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" - echo "" - echo "Choose ONE of these resolution strategies:" - echo "" - echo "1️⃣ REVERT THE BREAKING CHANGE (Recommended)" - echo " → Modify your code to preserve backward compatibility" - echo " → Add new parameters as optional (with defaults)" - echo " → Keep existing parameters in the same order" - echo "" - echo "2️⃣ MARK AS INTERNAL API (If this is internal code)" - echo " → Add @internal_api decorator from megatron.core.utils" - echo "" - echo " Example (for classes):" - echo " from megatron.core.utils import internal_api" - echo "" - echo " @internal_api" - echo " class ExperimentalFeature:" - echo " pass" - echo "" - echo " Example (for functions):" - echo " from megatron.core.utils import internal_api" - echo "" - echo " @internal_api" - echo " def internal_helper_function():" - echo " pass" - echo "" - echo "3️⃣ MARK AS EXPERIMENTAL API (If this is experimental code)" - echo " → Add @experimental_api decorator from megatron.core.utils" - echo "" - echo " Example:" - echo " from megatron.core.utils import experimental_api" - echo "" - echo " @experimental_api" - echo " class ExperimentalFeature:" - echo " pass" - echo "" - echo "4️⃣ USE DEPRECATION (For gradual API changes)" - echo " → Add @deprecated decorator for transition period" - echo " → Example:" - echo " from megatron.core.utils import deprecated" - echo "" - echo " @deprecated(version='1.0', removal_version='2.0'," - echo " alternative='new_function')" - echo " def old_function():" - echo " pass" - echo "" - echo 
"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" - echo "📋 BREAKING CHANGES DETECTED" - echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" - echo "" - cat compat_check_output.txt - echo "" - echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" - echo "📚 MORE INFORMATION" - echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" - echo "" - echo "📖 Full documentation: docs/api-backwards-compatibility-check.md" - echo "🔧 Checker script: scripts/check_api_backwards_compatibility.py" - echo "❓ Questions? Check the docs or ask in #megatron-core" - echo "" - - echo "::error::Breaking API changes detected. Please review the output above and choose a resolution strategy." - exit 1 - - - name: Success message - if: steps.compat_check.outcome == 'success' - run: | - echo "::notice::✅ No breaking API changes detected!" +# if [ -z "$CHANGED_FILES" ]; then +# echo "should_skip=true" >> $GITHUB_OUTPUT +# echo "No relevant megatron/core files changed - will skip compatibility check" +# else +# echo "should_skip=false" >> $GITHUB_OUTPUT +# echo "Relevant files changed:" +# echo "$CHANGED_FILES" +# fi - api-backward-compatibility-summary: - needs: [pre-flight, check-compatibility] - runs-on: ubuntu-latest - name: "OPTIONAL: API Backward Compatibility Check Summary" - if: always() && !cancelled() - steps: - - name: Checkout - uses: actions/checkout@v4 +# check-compatibility: +# needs: [pre-flight] +# if: needs.pre-flight.outputs.should_skip != 'true' +# name: "OPTIONAL: Check API Backward Compatibility" +# runs-on: ubuntu-latest - - name: Validate workflow result - shell: bash -x -e -u -o pipefail {0} - env: - GH_TOKEN: ${{ github.token }} - SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.should_skip == 'true' }} - run: | - FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success" and .name != "OPTIONAL: API 
Backward Compatibility Check Summary")] | length') || echo 0 +# # ============================================================================ +# # Configuration Parameters (modify here) +# # ============================================================================ +# env: +# # Default baseline for automatic PR checks +# # Can be: branch name (e.g., 'main'), commit hash, or tag +# # Will be resolved to commit hash during execution +# DEFAULT_BASELINE: '5ab481cb45efc72add12f8ba0378e849b3d2bc50' +# # Tag pattern for auto-detection (e.g., 'core_r*', 'core_v*') +# TAG_PATTERN: 'core_v*' +# # Tag regex filter (e.g., '^core_v[0-9]+\.[0-9]+\.[0-9]+$' for stable versions only) +# TAG_REGEX_FILTER: '^core_v[0-9]+\.[0-9]+\.[0-9]+$' +# # ============================================================================ - if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then - if [ "$SKIPPING_IS_ALLOWED" == "true" ]; then - echo "✅ Compatibility check was skipped (no relevant files changed)" - else - echo "✅ All checks passed successfully" - fi - exit 0 - else - echo "❌ Found $FAILED_JOBS failed job(s)" - gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success" and .name != "OPTIONAL: API Backward Compatibility Check Summary") | .name' - exit 1 - fi +# steps: +# - name: Checkout code +# uses: actions/checkout@v4 +# with: +# fetch-depth: 0 # Need full history to access baseline ref + +# - name: Set up Python +# uses: actions/setup-python@v5 +# with: +# python-version: '3.12' + +# - name: Install griffe +# run: | +# python -m pip install --upgrade pip +# python -m pip install griffe +# python -c "import griffe; print('Griffe installed successfully')" +# python -c "from griffe import Object; print('Object import successful')" || echo "Object import from griffe failed" +# python -c "from griffe.dataclasses import Object; print('Object import from dataclasses successful')" || echo "Object import from 
dataclasses failed" + +# - name: Determine baseline reference +# id: baseline +# run: | +# if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then +# # Use manually specified baseline (branch, tag, or commit hash) +# BASELINE_REF="${{ github.event.inputs.baseline }}" +# else +# # Use the configured default baseline +# BASELINE_REF="${{ env.DEFAULT_BASELINE }}" + +# # Uncomment below to auto-detect from tags instead: +# # BASELINE_REF=$(git tag -l '${{ env.TAG_PATTERN }}' | grep -E '${{ env.TAG_REGEX_FILTER }}' | sort -V | tail -1) +# # if [ -z "$BASELINE_REF" ]; then +# # echo "Warning: No tags matching pattern found. Using default: ${{ env.DEFAULT_BASELINE }}" >&2 +# # BASELINE_REF="${{ env.DEFAULT_BASELINE }}" +# # fi +# fi + +# # Resolve baseline to commit hash (works for branches, tags, or commit hashes) +# BASELINE_HASH=$(git rev-parse "$BASELINE_REF") + +# echo "baseline=$BASELINE_HASH" >> $GITHUB_OUTPUT +# echo "Using baseline: $BASELINE_REF (resolved to commit: $BASELINE_HASH)" + +# - name: Run compatibility check +# id: compat_check +# run: | +# # Save output to file for later display +# python scripts/check_api_backwards_compatibility.py \ +# --baseline ${{ steps.baseline.outputs.baseline }} \ +# --verbose 2>&1 | tee compat_check_output.txt + +# # Capture exit code +# EXIT_CODE=${PIPESTATUS[0]} +# echo "exit_code=$EXIT_CODE" >> $GITHUB_OUTPUT +# exit $EXIT_CODE +# continue-on-error: true + +# - name: Fail job if breaking changes detected +# if: steps.compat_check.outcome == 'failure' +# run: | +# echo "" +# echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" +# echo "🔍 WHAT IS THIS CHECK?" +# echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" +# echo "" +# echo "This check ensures that changes to Megatron Core's public API do not" +# echo "break backward compatibility for users. 
It compares your PR against" +# echo "the latest stable release to detect breaking changes in:" +# echo "" +# echo " • Function signatures (parameters, order, types)" +# echo " • Class structures and methods" +# echo " • Return types and public interfaces" +# echo "" +# echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" +# echo "🛠️ HOW TO FIX THIS" +# echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" +# echo "" +# echo "Choose ONE of these resolution strategies:" +# echo "" +# echo "1️⃣ REVERT THE BREAKING CHANGE (Recommended)" +# echo " → Modify your code to preserve backward compatibility" +# echo " → Add new parameters as optional (with defaults)" +# echo " → Keep existing parameters in the same order" +# echo "" +# echo "2️⃣ MARK AS INTERNAL API (If this is internal code)" +# echo " → Add @internal_api decorator from megatron.core.utils" +# echo "" +# echo " Example (for classes):" +# echo " from megatron.core.utils import internal_api" +# echo "" +# echo " @internal_api" +# echo " class ExperimentalFeature:" +# echo " pass" +# echo "" +# echo " Example (for functions):" +# echo " from megatron.core.utils import internal_api" +# echo "" +# echo " @internal_api" +# echo " def internal_helper_function():" +# echo " pass" +# echo "" +# echo "3️⃣ MARK AS EXPERIMENTAL API (If this is experimental code)" +# echo " → Add @experimental_api decorator from megatron.core.utils" +# echo "" +# echo " Example:" +# echo " from megatron.core.utils import experimental_api" +# echo "" +# echo " @experimental_api" +# echo " class ExperimentalFeature:" +# echo " pass" +# echo "" +# echo "4️⃣ USE DEPRECATION (For gradual API changes)" +# echo " → Add @deprecated decorator for transition period" +# echo " → Example:" +# echo " from megatron.core.utils import deprecated" +# echo "" +# echo " @deprecated(version='1.0', removal_version='2.0'," +# echo " alternative='new_function')" +# echo " def old_function():" +# echo " pass" +# 
echo "" +# echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" +# echo "📋 BREAKING CHANGES DETECTED" +# echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" +# echo "" +# cat compat_check_output.txt +# echo "" +# echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" +# echo "📚 MORE INFORMATION" +# echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" +# echo "" +# echo "📖 Full documentation: docs/api-backwards-compatibility-check.md" +# echo "🔧 Checker script: scripts/check_api_backwards_compatibility.py" +# echo "❓ Questions? Check the docs or ask in #megatron-core" +# echo "" + +# echo "::error::Breaking API changes detected. Please review the output above and choose a resolution strategy." +# exit 1 + +# - name: Success message +# if: steps.compat_check.outcome == 'success' +# run: | +# echo "::notice::✅ No breaking API changes detected!" + +# api-backward-compatibility-summary: +# needs: [pre-flight, check-compatibility] +# runs-on: ubuntu-latest +# name: "OPTIONAL: API Backward Compatibility Check Summary" +# if: always() && !cancelled() +# steps: +# - name: Checkout +# uses: actions/checkout@v4 + +# - name: Validate workflow result +# shell: bash -x -e -u -o pipefail {0} +# env: +# GH_TOKEN: ${{ github.token }} +# SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.should_skip == 'true' }} +# run: | +# FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success" and .name != "OPTIONAL: API Backward Compatibility Check Summary")] | length') || echo 0 + +# if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then +# if [ "$SKIPPING_IS_ALLOWED" == "true" ]; then +# echo "✅ Compatibility check was skipped (no relevant files changed)" +# else +# echo "✅ All checks passed successfully" +# fi +# exit 0 +# else +# echo "❌ Found $FAILED_JOBS failed job(s)" +# gh run view $GITHUB_RUN_ID 
--json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success" and .name != "OPTIONAL: API Backward Compatibility Check Summary") | .name' +# exit 1 +# fi From a205538fdd30c5164ed8c11f0745cb8fb083e6a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 30 Jan 2026 19:40:48 +0100 Subject: [PATCH 007/231] ci: Add DSv3 proxy (#3169) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .../bert/bert_release/model_config.yaml | 1 + .../gpt/gpt3_15b_8t_release/model_config.yaml | 1 + .../gpt3_15b_8t_release_sm/model_config.yaml | 1 + .../golden_values_dev_dgx_h100.json | 11492 ++++++++++++++++ .../model_config.yml | 169 + .../model_config.yml | 11 +- .../model_config.yaml | 168 + .../model_config.yaml | 19 +- .../model_config.yaml | 1 + .../model_config.yaml | 1 + .../model_config.yaml | 1 + .../model_config.yaml | 1 + .../t5/t5_release/model_config.yaml | 1 + 13 files changed, 11852 insertions(+), 15 deletions(-) create mode 100644 tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etc1cp1_gb_200_release_sm/golden_values_dev_dgx_h100.json create mode 100644 tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etc1cp1_gb_200_release_sm/model_config.yml create mode 100644 tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_gb_200_release/model_config.yaml diff --git a/tests/functional_tests/test_cases/bert/bert_release/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_release/model_config.yaml index ab5558fa7d2..af341b0f670 100644 --- a/tests/functional_tests/test_cases/bert/bert_release/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_release/model_config.yaml @@ -45,6 +45,7 @@ MODEL_ARGS: --log-params-norm: true --log-validation-ppl-to-tensorboard: true --wandb-project: megatron-core-release-runs + --wandb-entity: adlr --wandb-exp-name: 
${WANDB_EXPERIMENT} --attention-backend: unfused --exit-interval: 20000 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml index 44f9de33775..692e3882e02 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml @@ -84,6 +84,7 @@ MODEL_ARGS: --log-interval: 100 --tensorboard-dir: ${TENSORBOARD_PATH} --wandb-project: megatron-core-release-runs + --wandb-entity: adlr --wandb-exp-name: ${WANDB_EXPERIMENT} # Add mixed precision args --bf16: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml index 8b437ba75e7..87a4fccb347 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml @@ -83,6 +83,7 @@ MODEL_ARGS: --log-interval: 100 --tensorboard-dir: ${TENSORBOARD_PATH} --wandb-project: megatron-core-release-runs + --wandb-entity: adlr --wandb-exp-name: ${WANDB_EXPERIMENT} # Add mixed precision args --bf16: true diff --git a/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etc1cp1_gb_200_release_sm/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etc1cp1_gb_200_release_sm/golden_values_dev_dgx_h100.json new file mode 100644 index 00000000000..f486950e5a2 --- /dev/null +++ b/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etc1cp1_gb_200_release_sm/golden_values_dev_dgx_h100.json @@ -0,0 +1,11492 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 9535, + "step_interval": 5, + "values": { + "1": 13.89756, + "5": 13.89155, + "10": 13.85814, + "15": 13.84947, + "20": 13.74128, + "25": 13.71269, + 
"30": 13.39136, + "35": 13.32418, + "40": 13.23329, + "45": 13.12045, + "50": 12.53632, + "55": 12.35058, + "60": 12.17187, + "65": 12.01029, + "70": 11.83519, + "75": 11.55823, + "80": 11.30557, + "85": 11.11711, + "90": 10.96045, + "95": 10.79835, + "100": 10.58719, + "105": 10.45871, + "110": 10.23985, + "115": 10.03197, + "120": 9.88087, + "125": 9.74001, + "130": 9.64895, + "135": 9.58316, + "140": 9.34895, + "145": 9.3363, + "150": 9.17736, + "155": 9.11162, + "160": 9.02957, + "165": 8.91504, + "170": 8.86399, + "175": 8.82531, + "180": 8.68067, + "185": 8.72019, + "190": 8.59287, + "195": 8.59803, + "200": 8.48665, + "205": 8.39681, + "210": 8.35424, + "215": 8.40636, + "220": 8.27837, + "225": 8.29496, + "230": 8.27773, + "235": 8.20463, + "240": 8.15385, + "245": 8.1344, + "250": 8.06891, + "255": 8.08354, + "260": 7.97761, + "265": 7.96264, + "270": 7.91745, + "275": 7.9055, + "280": 7.89502, + "285": 7.91233, + "290": 7.858, + "295": 7.84326, + "300": 7.73922, + "305": 7.73479, + "310": 7.6998, + "315": 7.6959, + "320": 7.68835, + "325": 7.60857, + "330": 7.59888, + "335": 7.57833, + "340": 7.62257, + "345": 7.51187, + "350": 7.5063, + "355": 7.43406, + "360": 7.53414, + "365": 7.45759, + "370": 7.49186, + "375": 7.43607, + "380": 7.41292, + "385": 7.41117, + "390": 7.42986, + "395": 7.36781, + "400": 7.30747, + "405": 7.31834, + "410": 7.30943, + "415": 7.29421, + "420": 7.2965, + "425": 7.26158, + "430": 7.20979, + "435": 7.22197, + "440": 7.18512, + "445": 7.1687, + "450": 7.12181, + "455": 7.14062, + "460": 7.11041, + "465": 7.10497, + "470": 7.07645, + "475": 7.09742, + "480": 6.97587, + "485": 7.03312, + "490": 6.99478, + "495": 6.9692, + "500": 6.91435, + "505": 6.94713, + "510": 6.92309, + "515": 6.88853, + "520": 6.88024, + "525": 6.87529, + "530": 6.88311, + "535": 6.8642, + "540": 6.78769, + "545": 6.8252, + "550": 6.84568, + "555": 6.86869, + "560": 6.81372, + "565": 6.74969, + "570": 6.76579, + "575": 6.77872, + "580": 6.69766, + "585": 
6.71359, + "590": 6.65449, + "595": 6.64792, + "600": 6.67016, + "605": 6.65924, + "610": 6.63641, + "615": 6.68438, + "620": 6.60355, + "625": 6.57203, + "630": 6.56964, + "635": 6.60732, + "640": 6.59738, + "645": 6.5815, + "650": 6.62582, + "655": 6.62475, + "660": 6.53171, + "665": 6.52224, + "670": 6.47146, + "675": 6.57058, + "680": 6.53989, + "685": 6.49695, + "690": 6.47037, + "695": 6.43685, + "700": 6.43121, + "705": 6.4313, + "710": 6.46058, + "715": 6.46842, + "720": 6.35254, + "725": 6.40344, + "730": 6.39123, + "735": 6.41174, + "740": 6.34886, + "745": 6.31567, + "750": 6.37227, + "755": 6.29068, + "760": 6.30783, + "765": 6.32016, + "770": 6.31539, + "775": 6.3051, + "780": 6.27484, + "785": 6.28635, + "790": 6.25066, + "795": 6.24498, + "800": 6.22595, + "805": 6.30241, + "810": 6.16125, + "815": 6.18921, + "820": 6.19984, + "825": 6.20878, + "830": 6.21184, + "835": 6.16547, + "840": 6.13918, + "845": 6.18907, + "850": 6.14544, + "855": 6.14245, + "860": 6.12573, + "865": 6.14471, + "870": 6.103, + "875": 6.14755, + "880": 6.09503, + "885": 6.08625, + "890": 6.14906, + "895": 6.03612, + "900": 6.06033, + "905": 6.07119, + "910": 6.04765, + "915": 6.02795, + "920": 6.01922, + "925": 6.00762, + "930": 6.04202, + "935": 6.03448, + "940": 5.96552, + "945": 6.00691, + "950": 6.02802, + "955": 5.9757, + "960": 5.9732, + "965": 5.8947, + "970": 5.93848, + "975": 5.94046, + "980": 5.91694, + "985": 5.91057, + "990": 5.96163, + "995": 5.87028, + "1000": 5.89819, + "1005": 5.85552, + "1010": 5.89001, + "1015": 5.91011, + "1020": 5.82121, + "1025": 5.81525, + "1030": 5.82852, + "1035": 5.91121, + "1040": 5.83477, + "1045": 5.80641, + "1050": 5.84029, + "1055": 5.82471, + "1060": 5.77657, + "1065": 5.75965, + "1070": 5.80228, + "1075": 5.78852, + "1080": 5.77993, + "1085": 5.79347, + "1090": 5.7642, + "1095": 5.77727, + "1100": 5.73679, + "1105": 5.71252, + "1110": 5.76864, + "1115": 5.69994, + "1120": 5.64073, + "1125": 5.65212, + "1130": 5.71653, + "1135": 
5.67194, + "1140": 5.66144, + "1145": 5.65572, + "1150": 5.68319, + "1155": 5.64543, + "1160": 5.63371, + "1165": 5.67226, + "1170": 5.65589, + "1175": 5.62136, + "1180": 5.63006, + "1185": 5.6181, + "1190": 5.60413, + "1195": 5.59825, + "1200": 5.54202, + "1205": 5.65572, + "1210": 5.51312, + "1215": 5.55359, + "1220": 5.63431, + "1225": 5.51403, + "1230": 5.56754, + "1235": 5.521, + "1240": 5.55808, + "1245": 5.52886, + "1250": 5.51046, + "1255": 5.50279, + "1260": 5.50208, + "1265": 5.47964, + "1270": 5.44537, + "1275": 5.52448, + "1280": 5.45447, + "1285": 5.4682, + "1290": 5.43648, + "1295": 5.46181, + "1300": 5.46016, + "1305": 5.43278, + "1310": 5.38271, + "1315": 5.44073, + "1320": 5.42393, + "1325": 5.3568, + "1330": 5.41966, + "1335": 5.39498, + "1340": 5.44678, + "1345": 5.4046, + "1350": 5.3745, + "1355": 5.36722, + "1360": 5.37555, + "1365": 5.38819, + "1370": 5.31687, + "1375": 5.3257, + "1380": 5.37435, + "1385": 5.33822, + "1390": 5.32907, + "1395": 5.35996, + "1400": 5.34708, + "1405": 5.32768, + "1410": 5.30321, + "1415": 5.26874, + "1420": 5.31115, + "1425": 5.3045, + "1430": 5.33954, + "1435": 5.24914, + "1440": 5.27894, + "1445": 5.31118, + "1450": 5.28087, + "1455": 5.30455, + "1460": 5.26455, + "1465": 5.26355, + "1470": 5.29615, + "1475": 5.27116, + "1480": 5.26692, + "1485": 5.21939, + "1490": 5.21283, + "1495": 5.23155, + "1500": 5.23275, + "1505": 5.20436, + "1510": 5.22447, + "1515": 5.15502, + "1520": 5.1852, + "1525": 5.15413, + "1530": 5.17452, + "1535": 5.16098, + "1540": 5.16276, + "1545": 5.19593, + "1550": 5.1989, + "1555": 5.18478, + "1560": 5.1253, + "1565": 5.15973, + "1570": 5.17281, + "1575": 5.1468, + "1580": 5.16002, + "1585": 5.14495, + "1590": 5.12815, + "1595": 5.09691, + "1600": 5.17173, + "1605": 5.09626, + "1610": 5.10506, + "1615": 5.09978, + "1620": 5.1145, + "1625": 5.10983, + "1630": 5.08211, + "1635": 5.12902, + "1640": 5.09565, + "1645": 5.08916, + "1650": 5.08067, + "1655": 5.06625, + "1660": 5.05546, + "1665": 
5.04609, + "1670": 5.06711, + "1675": 5.06871, + "1680": 5.00775, + "1685": 5.01672, + "1690": 4.99799, + "1695": 5.00065, + "1700": 5.03983, + "1705": 5.01824, + "1710": 5.00629, + "1715": 4.97587, + "1720": 4.97437, + "1725": 4.9984, + "1730": 4.95014, + "1735": 5.02541, + "1740": 4.95266, + "1745": 4.97461, + "1750": 4.95639, + "1755": 4.97133, + "1760": 4.98489, + "1765": 4.93728, + "1770": 4.93343, + "1775": 4.9432, + "1780": 4.96314, + "1785": 4.91574, + "1790": 4.93944, + "1795": 4.93848, + "1800": 4.88725, + "1805": 4.87771, + "1810": 4.8976, + "1815": 4.89801, + "1820": 4.8872, + "1825": 4.89371, + "1830": 4.8786, + "1835": 4.87542, + "1840": 4.87209, + "1845": 4.85811, + "1850": 4.83484, + "1855": 4.89133, + "1860": 4.84322, + "1865": 4.85108, + "1870": 4.82648, + "1875": 4.83877, + "1880": 4.89485, + "1885": 4.84392, + "1890": 4.8281, + "1895": 4.77339, + "1900": 4.81423, + "1905": 4.81232, + "1910": 4.82991, + "1915": 4.79768, + "1920": 4.78308, + "1925": 4.79277, + "1930": 4.76544, + "1935": 4.7941, + "1940": 4.75875, + "1945": 4.80214, + "1950": 4.83843, + "1955": 4.77731, + "1960": 4.76768, + "1965": 4.72596, + "1970": 4.73388, + "1975": 4.7973, + "1980": 4.73036, + "1985": 4.74162, + "1990": 4.78353, + "1995": 4.74959, + "2000": 4.76948, + "2005": 4.80113, + "2010": 4.70951, + "2015": 4.69715, + "2020": 4.71284, + "2025": 4.75821, + "2030": 4.68831, + "2035": 4.71528, + "2040": 4.67772, + "2045": 4.76255, + "2050": 4.74404, + "2055": 4.7077, + "2060": 4.70614, + "2065": 4.66526, + "2070": 4.67653, + "2075": 4.69507, + "2080": 4.66174, + "2085": 4.69911, + "2090": 4.61739, + "2095": 4.64746, + "2100": 4.61666, + "2105": 4.64633, + "2110": 4.64123, + "2115": 4.65336, + "2120": 4.64559, + "2125": 4.61059, + "2130": 4.61466, + "2135": 4.62745, + "2140": 4.6232, + "2145": 4.58124, + "2150": 4.60983, + "2155": 4.57956, + "2160": 4.60382, + "2165": 4.58415, + "2170": 4.61387, + "2175": 4.60275, + "2180": 4.59531, + "2185": 4.60788, + "2190": 4.58246, + 
"2195": 4.55672, + "2200": 4.55346, + "2205": 4.56383, + "2210": 4.6146, + "2215": 4.64276, + "2220": 4.59912, + "2225": 4.57263, + "2230": 4.56854, + "2235": 4.61797, + "2240": 4.51401, + "2245": 4.5176, + "2250": 4.52905, + "2255": 4.54117, + "2260": 4.48536, + "2265": 4.56489, + "2270": 4.49655, + "2275": 4.55547, + "2280": 4.51075, + "2285": 4.53333, + "2290": 4.52269, + "2295": 4.52707, + "2300": 4.53228, + "2305": 4.49287, + "2310": 4.53148, + "2315": 4.46329, + "2320": 4.51121, + "2325": 4.49336, + "2330": 4.49351, + "2335": 4.47787, + "2340": 4.48626, + "2345": 4.52525, + "2350": 4.4674, + "2355": 4.47173, + "2360": 4.44099, + "2365": 4.44682, + "2370": 4.44716, + "2375": 4.44199, + "2380": 4.39487, + "2385": 4.43475, + "2390": 4.43071, + "2395": 4.46719, + "2400": 4.42074, + "2405": 4.40081, + "2410": 4.44955, + "2415": 4.42055, + "2420": 4.4293, + "2425": 4.39783, + "2430": 4.42084, + "2435": 4.40291, + "2440": 4.39501, + "2445": 4.40808, + "2450": 4.38239, + "2455": 4.4178, + "2460": 4.36606, + "2465": 4.41327, + "2470": 4.40023, + "2475": 4.41776, + "2480": 4.34092, + "2485": 4.37423, + "2490": 4.37838, + "2495": 4.35662, + "2500": 4.36528, + "2505": 4.37219, + "2510": 4.41251, + "2515": 4.40356, + "2520": 4.34516, + "2525": 4.36214, + "2530": 4.36786, + "2535": 4.36686, + "2540": 4.36548, + "2545": 4.37687, + "2550": 4.30337, + "2555": 4.37244, + "2560": 4.35158, + "2565": 4.30393, + "2570": 4.33393, + "2575": 4.30697, + "2580": 4.30582, + "2585": 4.29358, + "2590": 4.31272, + "2595": 4.28154, + "2600": 4.29867, + "2605": 4.31115, + "2610": 4.32106, + "2615": 4.27768, + "2620": 4.26935, + "2625": 4.30437, + "2630": 4.22434, + "2635": 4.30369, + "2640": 4.30012, + "2645": 4.2581, + "2650": 4.28639, + "2655": 4.26647, + "2660": 4.21474, + "2665": 4.30436, + "2670": 4.26382, + "2675": 4.2306, + "2680": 4.25227, + "2685": 4.25736, + "2690": 4.22986, + "2695": 4.28379, + "2700": 4.19098, + "2705": 4.23853, + "2710": 4.25092, + "2715": 4.23481, + "2720": 
4.24356, + "2725": 4.2225, + "2730": 4.22941, + "2735": 4.22363, + "2740": 4.20346, + "2745": 4.18765, + "2750": 4.21101, + "2755": 4.22237, + "2760": 4.22902, + "2765": 4.18298, + "2770": 4.23755, + "2775": 4.17706, + "2780": 4.21186, + "2785": 4.19469, + "2790": 4.21736, + "2795": 4.18988, + "2800": 4.1159, + "2805": 4.16613, + "2810": 4.17076, + "2815": 4.15389, + "2820": 4.1969, + "2825": 4.19241, + "2830": 4.16864, + "2835": 4.17046, + "2840": 4.16148, + "2845": 4.14967, + "2850": 4.16619, + "2855": 4.11805, + "2860": 4.14572, + "2865": 4.17023, + "2870": 4.14096, + "2875": 4.1596, + "2880": 4.08582, + "2885": 4.14242, + "2890": 4.11503, + "2895": 4.15452, + "2900": 4.09735, + "2905": 4.11101, + "2910": 4.10798, + "2915": 4.14914, + "2920": 4.12546, + "2925": 4.10099, + "2930": 4.08522, + "2935": 4.07896, + "2940": 4.09225, + "2945": 4.06113, + "2950": 4.03479, + "2955": 4.03763, + "2960": 4.04955, + "2965": 4.0643, + "2970": 4.08593, + "2975": 4.0941, + "2980": 4.03102, + "2985": 4.07394, + "2990": 4.08923, + "2995": 4.03231, + "3000": 4.0436, + "3005": 4.02568, + "3010": 4.06747, + "3015": 4.02305, + "3020": 4.03992, + "3025": 4.02491, + "3030": 4.0567, + "3035": 4.04059, + "3040": 4.0544, + "3045": 4.04677, + "3050": 4.017, + "3055": 4.00507, + "3060": 3.9904, + "3065": 4.02281, + "3070": 4.03826, + "3075": 3.97211, + "3080": 4.0011, + "3085": 4.00548, + "3090": 4.00887, + "3095": 4.02745, + "3100": 4.01465, + "3105": 3.99035, + "3110": 3.99124, + "3115": 3.92509, + "3120": 4.00505, + "3125": 3.94183, + "3130": 3.96987, + "3135": 3.96132, + "3140": 3.95209, + "3145": 3.93524, + "3150": 3.96949, + "3155": 3.96213, + "3160": 3.96255, + "3165": 3.96146, + "3170": 3.96456, + "3175": 3.93165, + "3180": 3.93784, + "3185": 3.90234, + "3190": 3.92455, + "3195": 3.9116, + "3200": 3.89013, + "3205": 3.92029, + "3210": 3.89711, + "3215": 3.90569, + "3220": 3.89706, + "3225": 3.91097, + "3230": 3.89895, + "3235": 3.91122, + "3240": 3.88912, + "3245": 3.88902, + "3250": 
3.84407, + "3255": 3.89259, + "3260": 3.88283, + "3265": 3.92603, + "3270": 3.9052, + "3275": 3.85915, + "3280": 3.88232, + "3285": 3.86652, + "3290": 3.86681, + "3295": 3.83806, + "3300": 3.85349, + "3305": 3.86048, + "3310": 3.85872, + "3315": 3.89673, + "3320": 3.85179, + "3325": 3.84353, + "3330": 3.82539, + "3335": 3.86213, + "3340": 3.81824, + "3345": 3.83129, + "3350": 3.85901, + "3355": 3.8452, + "3360": 3.83241, + "3365": 3.83682, + "3370": 3.82265, + "3375": 3.85232, + "3380": 3.79563, + "3385": 3.81353, + "3390": 3.79143, + "3395": 3.86888, + "3400": 3.83997, + "3405": 3.86197, + "3410": 3.77529, + "3415": 3.72916, + "3420": 3.80048, + "3425": 3.81237, + "3430": 3.84497, + "3435": 3.80796, + "3440": 3.8267, + "3445": 3.7742, + "3450": 3.78787, + "3455": 3.80217, + "3460": 3.78265, + "3465": 3.75891, + "3470": 3.77341, + "3475": 3.77638, + "3480": 3.77988, + "3485": 3.80588, + "3490": 3.76958, + "3495": 3.80315, + "3500": 3.77047, + "3505": 3.77239, + "3510": 3.75092, + "3515": 3.80896, + "3520": 3.79879, + "3525": 3.76372, + "3530": 3.75322, + "3535": 3.76209, + "3540": 3.81796, + "3545": 3.72915, + "3550": 3.79201, + "3555": 3.72604, + "3560": 3.78622, + "3565": 3.7451, + "3570": 3.74254, + "3575": 3.71868, + "3580": 3.77066, + "3585": 3.76174, + "3590": 3.68853, + "3595": 3.76509, + "3600": 3.71336, + "3605": 3.71948, + "3610": 3.70916, + "3615": 3.74868, + "3620": 3.7837, + "3625": 3.71964, + "3630": 3.76519, + "3635": 3.68617, + "3640": 3.7093, + "3645": 3.74263, + "3650": 3.69638, + "3655": 3.72074, + "3660": 3.72832, + "3665": 3.74694, + "3670": 3.71178, + "3675": 3.71065, + "3680": 3.72416, + "3685": 3.67473, + "3690": 3.6936, + "3695": 3.68528, + "3700": 3.70814, + "3705": 3.67651, + "3710": 3.68493, + "3715": 3.6842, + "3720": 3.66563, + "3725": 3.64716, + "3730": 3.64883, + "3735": 3.68782, + "3740": 3.6732, + "3745": 3.66354, + "3750": 3.6757, + "3755": 3.66351, + "3760": 3.67285, + "3765": 3.66004, + "3770": 3.6516, + "3775": 3.63831, + 
"3780": 3.62453, + "3785": 3.6765, + "3790": 3.60163, + "3795": 3.64291, + "3800": 3.63275, + "3805": 3.62032, + "3810": 3.59475, + "3815": 3.63585, + "3820": 3.64099, + "3825": 3.6535, + "3830": 3.63864, + "3835": 3.59938, + "3840": 3.67685, + "3845": 3.65895, + "3850": 3.60064, + "3855": 3.60428, + "3860": 3.65711, + "3865": 3.60867, + "3870": 3.6721, + "3875": 3.58596, + "3880": 3.58212, + "3885": 3.60502, + "3890": 3.60969, + "3895": 3.5558, + "3900": 3.61685, + "3905": 3.59135, + "3910": 3.5772, + "3915": 3.5862, + "3920": 3.57131, + "3925": 3.56751, + "3930": 3.58005, + "3935": 3.5821, + "3940": 3.57511, + "3945": 3.56965, + "3950": 3.61887, + "3955": 3.57531, + "3960": 3.60735, + "3965": 3.58853, + "3970": 3.56735, + "3975": 3.56709, + "3980": 3.5304, + "3985": 3.60527, + "3990": 3.58124, + "3995": 3.60753, + "4000": 3.55811, + "4005": 3.54162, + "4010": 3.58376, + "4015": 3.58398, + "4020": 3.58355, + "4025": 3.57409, + "4030": 3.62855, + "4035": 3.57033, + "4040": 3.5882, + "4045": 3.60161, + "4050": 3.57522, + "4055": 3.57403, + "4060": 3.5888, + "4065": 3.58382, + "4070": 3.51488, + "4075": 3.55887, + "4080": 3.53108, + "4085": 3.54596, + "4090": 3.54584, + "4095": 3.53161, + "4100": 3.55106, + "4105": 3.53794, + "4110": 3.51736, + "4115": 3.56348, + "4120": 3.49648, + "4125": 3.49769, + "4130": 3.55149, + "4135": 3.54373, + "4140": 3.49112, + "4145": 3.51351, + "4150": 3.55497, + "4155": 3.48797, + "4160": 3.54539, + "4165": 3.56451, + "4170": 3.50424, + "4175": 3.50239, + "4180": 3.4998, + "4185": 3.5138, + "4190": 3.5011, + "4195": 3.50044, + "4200": 3.49424, + "4205": 3.53032, + "4210": 3.51921, + "4215": 3.52292, + "4220": 3.53088, + "4225": 3.50168, + "4230": 3.49756, + "4235": 3.52008, + "4240": 3.49249, + "4245": 3.49542, + "4250": 3.48848, + "4255": 3.50707, + "4260": 3.4676, + "4265": 3.48819, + "4270": 3.50473, + "4275": 3.53933, + "4280": 3.48997, + "4285": 3.50947, + "4290": 3.48405, + "4295": 3.48692, + "4300": 3.52631, + "4305": 3.48704, + 
"4310": 3.51358, + "4315": 3.50638, + "4320": 3.50379, + "4325": 3.51699, + "4330": 3.45992, + "4335": 3.49232, + "4340": 3.50354, + "4345": 3.43189, + "4350": 3.44845, + "4355": 3.52327, + "4360": 3.48083, + "4365": 3.47079, + "4370": 3.47624, + "4375": 3.44129, + "4380": 3.44296, + "4385": 3.42527, + "4390": 3.49048, + "4395": 3.47699, + "4400": 3.47442, + "4405": 3.41723, + "4410": 3.48335, + "4415": 3.44899, + "4420": 3.44113, + "4425": 3.47273, + "4430": 3.44742, + "4435": 3.49082, + "4440": 3.48522, + "4445": 3.43744, + "4450": 3.3974, + "4455": 3.4624, + "4460": 3.43415, + "4465": 3.45284, + "4470": 3.42199, + "4475": 3.45352, + "4480": 3.44375, + "4485": 3.43643, + "4490": 3.43453, + "4495": 3.38677, + "4500": 3.45384, + "4505": 3.43515, + "4510": 3.44292, + "4515": 3.40605, + "4520": 3.43888, + "4525": 3.40731, + "4530": 3.44131, + "4535": 3.3963, + "4540": 3.42067, + "4545": 3.43217, + "4550": 3.47418, + "4555": 3.39854, + "4560": 3.42732, + "4565": 3.37837, + "4570": 3.41702, + "4575": 3.41117, + "4580": 3.45362, + "4585": 3.42636, + "4590": 3.42388, + "4595": 3.39853, + "4600": 3.39686, + "4605": 3.42144, + "4610": 3.41286, + "4615": 3.45309, + "4620": 3.39526, + "4625": 3.42534, + "4630": 3.4127, + "4635": 3.39195, + "4640": 3.4264, + "4645": 3.41975, + "4650": 3.43542, + "4655": 3.40687, + "4660": 3.39737, + "4665": 3.41231, + "4670": 3.446, + "4675": 3.40423, + "4680": 3.42886, + "4685": 3.42464, + "4690": 3.39897, + "4695": 3.38, + "4700": 3.3729, + "4705": 3.35029, + "4710": 3.40571, + "4715": 3.39222, + "4720": 3.38774, + "4725": 3.35968, + "4730": 3.39519, + "4735": 3.32069, + "4740": 3.36458, + "4745": 3.40698, + "4750": 3.36053, + "4755": 3.39053, + "4760": 3.41421, + "4765": 3.36022, + "4770": 3.36502, + "4775": 3.36135, + "4780": 3.37362, + "4785": 3.374, + "4790": 3.41163, + "4795": 3.39334, + "4800": 3.34583, + "4805": 3.41139, + "4810": 3.35086, + "4815": 3.38903, + "4820": 3.34814, + "4825": 3.40406, + "4830": 3.38314, + "4835": 3.3693, + 
"4840": 3.38086, + "4845": 3.32726, + "4850": 3.39372, + "4855": 3.39679, + "4860": 3.32727, + "4865": 3.36392, + "4870": 3.34896, + "4875": 3.39123, + "4880": 3.39974, + "4885": 3.35153, + "4890": 3.36191, + "4895": 3.35318, + "4900": 3.32971, + "4905": 3.33008, + "4910": 3.32861, + "4915": 3.37524, + "4920": 3.35807, + "4925": 3.31242, + "4930": 3.34376, + "4935": 3.3273, + "4940": 3.28784, + "4945": 3.36034, + "4950": 3.29629, + "4955": 3.40365, + "4960": 3.3479, + "4965": 3.34204, + "4970": 3.33369, + "4975": 3.34388, + "4980": 3.36573, + "4985": 3.35352, + "4990": 3.33542, + "4995": 3.3795, + "5000": 3.30893, + "5005": 3.35715, + "5010": 3.36146, + "5015": 3.30923, + "5020": 3.28653, + "5025": 3.31605, + "5030": 3.32648, + "5035": 3.32963, + "5040": 3.30481, + "5045": 3.34994, + "5050": 3.30693, + "5055": 3.32632, + "5060": 3.28843, + "5065": 3.33396, + "5070": 3.33431, + "5075": 3.34337, + "5080": 3.31868, + "5085": 3.34518, + "5090": 3.32323, + "5095": 3.29022, + "5100": 3.32026, + "5105": 3.32744, + "5110": 3.3329, + "5115": 3.3038, + "5120": 3.34196, + "5125": 3.3184, + "5130": 3.31738, + "5135": 3.30105, + "5140": 3.3111, + "5145": 3.31125, + "5150": 3.32063, + "5155": 3.31567, + "5160": 3.31039, + "5165": 3.34534, + "5170": 3.23105, + "5175": 3.31877, + "5180": 3.28445, + "5185": 3.30691, + "5190": 3.32611, + "5195": 3.30561, + "5200": 3.31019, + "5205": 3.34654, + "5210": 3.28506, + "5215": 3.2874, + "5220": 3.28219, + "5225": 3.28677, + "5230": 3.32011, + "5235": 3.27975, + "5240": 3.27349, + "5245": 3.29646, + "5250": 3.3023, + "5255": 3.28615, + "5260": 3.31039, + "5265": 3.27007, + "5270": 3.25412, + "5275": 3.25534, + "5280": 3.28407, + "5285": 3.30874, + "5290": 3.2589, + "5295": 3.27448, + "5300": 3.27858, + "5305": 3.26656, + "5310": 3.32809, + "5315": 3.25873, + "5320": 3.30633, + "5325": 3.3111, + "5330": 3.27899, + "5335": 3.28833, + "5340": 3.23016, + "5345": 3.28336, + "5350": 3.28737, + "5355": 3.28737, + "5360": 3.23407, + "5365": 
3.25011, + "5370": 3.28855, + "5375": 3.26985, + "5380": 3.24418, + "5385": 3.28394, + "5390": 3.28221, + "5395": 3.20448, + "5400": 3.30114, + "5405": 3.21525, + "5410": 3.29188, + "5415": 3.22284, + "5420": 3.25707, + "5425": 3.23689, + "5430": 3.24779, + "5435": 3.2811, + "5440": 3.21236, + "5445": 3.24176, + "5450": 3.24576, + "5455": 3.22991, + "5460": 3.25196, + "5465": 3.29692, + "5470": 3.27194, + "5475": 3.20136, + "5480": 3.28214, + "5485": 3.24325, + "5490": 3.26633, + "5495": 3.27183, + "5500": 3.22718, + "5505": 3.23914, + "5510": 3.28342, + "5515": 3.27035, + "5520": 3.23742, + "5525": 3.28473, + "5530": 3.22923, + "5535": 3.26258, + "5540": 3.25366, + "5545": 3.26198, + "5550": 3.24962, + "5555": 3.22875, + "5560": 3.22306, + "5565": 3.26845, + "5570": 3.22989, + "5575": 3.26435, + "5580": 3.23553, + "5585": 3.18594, + "5590": 3.24664, + "5595": 3.2105, + "5600": 3.25488, + "5605": 3.17461, + "5610": 3.2604, + "5615": 3.25606, + "5620": 3.2609, + "5625": 3.25214, + "5630": 3.24091, + "5635": 3.21924, + "5640": 3.24377, + "5645": 3.20743, + "5650": 3.2076, + "5655": 3.20542, + "5660": 3.20971, + "5665": 3.21069, + "5670": 3.20056, + "5675": 3.22863, + "5680": 3.19922, + "5685": 3.20573, + "5690": 3.2077, + "5695": 3.24414, + "5700": 3.19628, + "5705": 3.18515, + "5710": 3.17855, + "5715": 3.28582, + "5720": 3.2496, + "5725": 3.2002, + "5730": 3.24085, + "5735": 3.22905, + "5740": 3.22477, + "5745": 3.20281, + "5750": 3.23329, + "5755": 3.23832, + "5760": 3.22288, + "5765": 3.22651, + "5770": 3.25303, + "5775": 3.19712, + "5780": 3.21565, + "5785": 3.21756, + "5790": 3.22715, + "5795": 3.22463, + "5800": 3.16888, + "5805": 3.18332, + "5810": 3.22432, + "5815": 3.20302, + "5820": 3.16241, + "5825": 3.20754, + "5830": 3.1647, + "5835": 3.17395, + "5840": 3.20628, + "5845": 3.217, + "5850": 3.21594, + "5855": 3.15148, + "5860": 3.17119, + "5865": 3.20009, + "5870": 3.16136, + "5875": 3.20014, + "5880": 3.19456, + "5885": 3.19488, + "5890": 3.21776, + 
"5895": 3.23301, + "5900": 3.1895, + "5905": 3.21986, + "5910": 3.20185, + "5915": 3.17464, + "5920": 3.1915, + "5925": 3.15681, + "5930": 3.19135, + "5935": 3.19128, + "5940": 3.2051, + "5945": 3.21968, + "5950": 3.20213, + "5955": 3.16275, + "5960": 3.22598, + "5965": 3.17666, + "5970": 3.21828, + "5975": 3.18539, + "5980": 3.25556, + "5985": 3.14035, + "5990": 3.2373, + "5995": 3.15341, + "6000": 3.17562, + "6005": 3.15642, + "6010": 3.15958, + "6015": 3.16383, + "6020": 3.17057, + "6025": 3.20846, + "6030": 3.14683, + "6035": 3.20108, + "6040": 3.18034, + "6045": 3.19784, + "6050": 3.19841, + "6055": 3.17123, + "6060": 3.18513, + "6065": 3.20946, + "6070": 3.16514, + "6075": 3.13204, + "6080": 3.19182, + "6085": 3.15022, + "6090": 3.18799, + "6095": 3.18454, + "6100": 3.13968, + "6105": 3.18911, + "6110": 3.13194, + "6115": 3.18032, + "6120": 3.17268, + "6125": 3.17817, + "6130": 3.16826, + "6135": 3.16641, + "6140": 3.16491, + "6145": 3.14203, + "6150": 3.17849, + "6155": 3.14973, + "6160": 3.12836, + "6165": 3.15943, + "6170": 3.14366, + "6175": 3.14619, + "6180": 3.14564, + "6185": 3.18694, + "6190": 3.15491, + "6195": 3.12582, + "6200": 3.15218, + "6205": 3.14598, + "6210": 3.10092, + "6215": 3.15518, + "6220": 3.1544, + "6225": 3.17142, + "6230": 3.10668, + "6235": 3.14063, + "6240": 3.08394, + "6245": 3.18223, + "6250": 3.14309, + "6255": 3.15773, + "6260": 3.14125, + "6265": 3.15597, + "6270": 3.10065, + "6275": 3.12382, + "6280": 3.13503, + "6285": 3.11829, + "6290": 3.14415, + "6295": 3.15298, + "6300": 3.15403, + "6305": 3.21086, + "6310": 3.11266, + "6315": 3.10982, + "6320": 3.16047, + "6325": 3.10246, + "6330": 3.16954, + "6335": 3.15391, + "6340": 3.10904, + "6345": 3.16578, + "6350": 3.11808, + "6355": 3.11742, + "6360": 3.1108, + "6365": 3.14775, + "6370": 3.16278, + "6375": 3.1337, + "6380": 3.15125, + "6385": 3.17081, + "6390": 3.12597, + "6395": 3.10466, + "6400": 3.10591, + "6405": 3.18617, + "6410": 3.17298, + "6415": 3.12537, + "6420": 
3.17096, + "6425": 3.17458, + "6430": 3.16659, + "6435": 3.12451, + "6440": 3.13606, + "6445": 3.15196, + "6450": 3.09161, + "6455": 3.08666, + "6460": 3.13082, + "6465": 3.16786, + "6470": 3.13951, + "6475": 3.13285, + "6480": 3.15191, + "6485": 3.11206, + "6490": 3.0797, + "6495": 3.16564, + "6500": 3.14177, + "6505": 3.08566, + "6510": 3.14483, + "6515": 3.16369, + "6520": 3.09044, + "6525": 3.14867, + "6530": 3.10896, + "6535": 3.12403, + "6540": 3.18005, + "6545": 3.11404, + "6550": 3.11103, + "6555": 3.10947, + "6560": 3.0737, + "6565": 3.07934, + "6570": 3.10438, + "6575": 3.05844, + "6580": 3.17411, + "6585": 3.10694, + "6590": 3.0877, + "6595": 3.10332, + "6600": 3.1032, + "6605": 3.08625, + "6610": 3.08405, + "6615": 3.1316, + "6620": 3.076, + "6625": 3.09705, + "6630": 3.09309, + "6635": 3.12933, + "6640": 3.08864, + "6645": 3.10948, + "6650": 3.1378, + "6655": 3.07416, + "6660": 3.11313, + "6665": 3.12487, + "6670": 3.08048, + "6675": 3.10457, + "6680": 3.10673, + "6685": 3.14077, + "6690": 3.11651, + "6695": 3.12176, + "6700": 3.1127, + "6705": 3.09107, + "6710": 3.10728, + "6715": 3.05842, + "6720": 3.13504, + "6725": 3.12621, + "6730": 3.1099, + "6735": 3.10898, + "6740": 3.11731, + "6745": 3.0901, + "6750": 3.10983, + "6755": 3.06749, + "6760": 3.06624, + "6765": 3.08509, + "6770": 3.07057, + "6775": 3.10523, + "6780": 3.07455, + "6785": 3.07959, + "6790": 3.10472, + "6795": 3.07166, + "6800": 3.09692, + "6805": 3.08719, + "6810": 3.10858, + "6815": 3.04354, + "6820": 3.07401, + "6825": 3.10257, + "6830": 3.08637, + "6835": 3.06002, + "6840": 3.0654, + "6845": 3.11054, + "6850": 3.08009, + "6855": 3.11065, + "6860": 3.06305, + "6865": 3.10876, + "6870": 3.07538, + "6875": 3.07578, + "6880": 3.08642, + "6885": 3.05135, + "6890": 3.0749, + "6895": 3.05299, + "6900": 3.05973, + "6905": 3.07506, + "6910": 3.09159, + "6915": 3.11333, + "6920": 3.06615, + "6925": 3.08379, + "6930": 3.06742, + "6935": 3.02485, + "6940": 3.06623, + "6945": 3.05639, + 
"6950": 3.07964, + "6955": 3.05853, + "6960": 3.05554, + "6965": 3.09907, + "6970": 3.03589, + "6975": 3.1075, + "6980": 3.06776, + "6985": 3.06784, + "6990": 3.11146, + "6995": 3.09126, + "7000": 3.02783, + "7005": 3.09757, + "7010": 3.0779, + "7015": 3.07385, + "7020": 3.10018, + "7025": 3.08417, + "7030": 3.08746, + "7035": 3.04096, + "7040": 3.01984, + "7045": 3.07968, + "7050": 3.09817, + "7055": 3.03816, + "7060": 3.09848, + "7065": 3.11109, + "7070": 3.05748, + "7075": 3.06319, + "7080": 3.11208, + "7085": 3.03557, + "7090": 3.05692, + "7095": 3.04652, + "7100": 3.07149, + "7105": 3.02035, + "7110": 3.0623, + "7115": 3.03547, + "7120": 3.07999, + "7125": 3.03377, + "7130": 3.04883, + "7135": 3.05627, + "7140": 3.06014, + "7145": 3.0691, + "7150": 3.02375, + "7155": 3.08612, + "7160": 3.0047, + "7165": 3.0418, + "7170": 3.07701, + "7175": 3.03661, + "7180": 3.07042, + "7185": 3.09125, + "7190": 3.05302, + "7195": 3.06058, + "7200": 3.06039, + "7205": 3.04153, + "7210": 3.08703, + "7215": 3.06723, + "7220": 3.08798, + "7225": 3.06993, + "7230": 3.07403, + "7235": 3.05435, + "7240": 3.05017, + "7245": 3.07131, + "7250": 3.01274, + "7255": 3.03229, + "7260": 3.06928, + "7265": 3.00261, + "7270": 3.04138, + "7275": 3.04223, + "7280": 3.04181, + "7285": 3.05407, + "7290": 3.07344, + "7295": 3.06537, + "7300": 3.02809, + "7305": 3.02877, + "7310": 3.04926, + "7315": 3.07646, + "7320": 3.05669, + "7325": 3.06149, + "7330": 3.02592, + "7335": 3.02733, + "7340": 3.06004, + "7345": 3.0091, + "7350": 3.06031, + "7355": 3.04495, + "7360": 3.03923, + "7365": 3.03845, + "7370": 3.03136, + "7375": 2.9999, + "7380": 3.06202, + "7385": 3.07693, + "7390": 3.06411, + "7395": 3.02221, + "7400": 3.07516, + "7405": 3.04382, + "7410": 3.06023, + "7415": 3.05228, + "7420": 3.03261, + "7425": 3.08586, + "7430": 3.0272, + "7435": 3.01757, + "7440": 3.0377, + "7445": 3.01394, + "7450": 2.99482, + "7455": 3.04735, + "7460": 3.04105, + "7465": 3.04977, + "7470": 3.05673, + "7475": 
3.06741, + "7480": 3.02749, + "7485": 2.98653, + "7490": 2.98973, + "7495": 2.99863, + "7500": 3.02945, + "7505": 3.0059, + "7510": 2.97871, + "7515": 3.02404, + "7520": 3.01697, + "7525": 2.98295, + "7530": 3.02636, + "7535": 3.04423, + "7540": 3.02494, + "7545": 3.0588, + "7550": 3.06534, + "7555": 3.00732, + "7560": 3.01283, + "7565": 3.00874, + "7570": 3.03442, + "7575": 2.97962, + "7580": 3.03034, + "7585": 3.01793, + "7590": 3.01504, + "7595": 3.07403, + "7600": 3.03015, + "7605": 3.02144, + "7610": 3.00533, + "7615": 2.99602, + "7620": 2.99265, + "7625": 3.03762, + "7630": 3.02026, + "7635": 3.01854, + "7640": 3.01712, + "7645": 3.04845, + "7650": 3.04439, + "7655": 3.08975, + "7660": 2.96325, + "7665": 3.02969, + "7670": 3.01245, + "7675": 3.00305, + "7680": 2.9998, + "7685": 3.07016, + "7690": 3.01368, + "7695": 2.99671, + "7700": 3.05056, + "7705": 3.01282, + "7710": 3.05828, + "7715": 2.99725, + "7720": 3.08276, + "7725": 2.98411, + "7730": 2.99881, + "7735": 3.02714, + "7740": 3.00979, + "7745": 3.00319, + "7750": 3.01, + "7755": 3.01954, + "7760": 2.98571, + "7765": 3.00397, + "7770": 3.02732, + "7775": 2.98978, + "7780": 2.97862, + "7785": 3.01472, + "7790": 2.99842, + "7795": 3.02413, + "7800": 3.00827, + "7805": 3.01176, + "7810": 3.03082, + "7815": 3.00244, + "7820": 3.0019, + "7825": 3.03231, + "7830": 3.03143, + "7835": 2.96605, + "7840": 3.04336, + "7845": 2.97937, + "7850": 2.93977, + "7855": 2.98529, + "7860": 2.98344, + "7865": 3.02956, + "7870": 2.9691, + "7875": 2.98838, + "7880": 3.00349, + "7885": 2.9968, + "7890": 3.03811, + "7895": 3.02857, + "7900": 3.03097, + "7905": 2.99876, + "7910": 3.0088, + "7915": 3.02527, + "7920": 3.01259, + "7925": 2.99646, + "7930": 3.02866, + "7935": 2.98913, + "7940": 3.03573, + "7945": 3.0501, + "7950": 2.96381, + "7955": 2.98711, + "7960": 2.96943, + "7965": 2.94566, + "7970": 2.9655, + "7975": 2.99544, + "7980": 3.00887, + "7985": 2.97698, + "7990": 2.97506, + "7995": 2.96124, + "8000": 3.02098, + 
"8005": 2.9801, + "8010": 2.97649, + "8015": 2.96466, + "8020": 2.97779, + "8025": 2.95601, + "8030": 2.97562, + "8035": 2.97196, + "8040": 2.95703, + "8045": 3.01604, + "8050": 3.01297, + "8055": 2.97453, + "8060": 3.00494, + "8065": 2.98862, + "8070": 2.96753, + "8075": 2.97734, + "8080": 3.01019, + "8085": 2.96754, + "8090": 2.98003, + "8095": 3.00216, + "8100": 2.95105, + "8105": 2.99247, + "8110": 2.98157, + "8115": 2.95999, + "8120": 2.97249, + "8125": 2.99946, + "8130": 2.97003, + "8135": 2.98766, + "8140": 2.96736, + "8145": 2.95939, + "8150": 2.98009, + "8155": 2.95146, + "8160": 2.997, + "8165": 2.9913, + "8170": 2.95554, + "8175": 2.95554, + "8180": 3.01376, + "8185": 2.98624, + "8190": 3.02032, + "8195": 2.99613, + "8200": 2.96412, + "8205": 2.97566, + "8210": 2.9781, + "8215": 2.99017, + "8220": 2.971, + "8225": 2.96329, + "8230": 2.99505, + "8235": 3.00306, + "8240": 2.97419, + "8245": 2.9738, + "8250": 3.00958, + "8255": 2.96716, + "8260": 2.97331, + "8265": 2.95555, + "8270": 2.97514, + "8275": 2.96718, + "8280": 2.94092, + "8285": 2.97838, + "8290": 2.96734, + "8295": 2.95246, + "8300": 2.96504, + "8305": 2.97504, + "8310": 2.97996, + "8315": 2.95732, + "8320": 2.97776, + "8325": 2.929, + "8330": 2.89908, + "8335": 2.96646, + "8340": 2.99201, + "8345": 2.94463, + "8350": 2.95886, + "8355": 2.98631, + "8360": 2.96643, + "8365": 2.98326, + "8370": 2.99094, + "8375": 2.93854, + "8380": 2.94099, + "8385": 2.97126, + "8390": 2.9453, + "8395": 2.97523, + "8400": 2.95927, + "8405": 2.97418, + "8410": 3.03057, + "8415": 2.93533, + "8420": 2.91801, + "8425": 2.97564, + "8430": 2.97808, + "8435": 2.93124, + "8440": 3.01239, + "8445": 2.99121, + "8450": 2.96616, + "8455": 2.97106, + "8460": 2.97975, + "8465": 2.92562, + "8470": 2.94697, + "8475": 2.99054, + "8480": 2.93097, + "8485": 2.93977, + "8490": 2.948, + "8495": 2.93336, + "8500": 2.96904, + "8505": 2.92233, + "8510": 3.00332, + "8515": 2.94052, + "8520": 2.95755, + "8525": 2.88522, + "8530": 2.95834, 
+ "8535": 2.97603, + "8540": 2.93194, + "8545": 2.95741, + "8550": 2.92307, + "8555": 2.98961, + "8560": 2.99424, + "8565": 2.9514, + "8570": 2.94707, + "8575": 2.93509, + "8580": 2.9669, + "8585": 2.976, + "8590": 2.97659, + "8595": 2.97731, + "8600": 2.94787, + "8605": 2.94545, + "8610": 2.95479, + "8615": 2.96032, + "8620": 2.92346, + "8625": 2.94581, + "8630": 2.95087, + "8635": 2.94522, + "8640": 2.92578, + "8645": 2.98133, + "8650": 2.92232, + "8655": 2.96592, + "8660": 2.97073, + "8665": 2.95471, + "8670": 2.96657, + "8675": 2.93996, + "8680": 2.93576, + "8685": 2.94815, + "8690": 2.96442, + "8695": 2.97067, + "8700": 2.94799, + "8705": 2.91745, + "8710": 2.96979, + "8715": 2.91522, + "8720": 2.97447, + "8725": 2.94876, + "8730": 2.94256, + "8735": 2.97158, + "8740": 2.92587, + "8745": 2.96492, + "8750": 2.96628, + "8755": 2.93098, + "8760": 2.94924, + "8765": 2.91354, + "8770": 2.96822, + "8775": 2.94219, + "8780": 2.92859, + "8785": 2.94726, + "8790": 2.92803, + "8795": 2.96489, + "8800": 2.92662, + "8805": 2.90115, + "8810": 2.93145, + "8815": 2.93283, + "8820": 2.90387, + "8825": 2.92443, + "8830": 2.91245, + "8835": 2.89847, + "8840": 2.91518, + "8845": 2.92785, + "8850": 2.95695, + "8855": 2.92839, + "8860": 2.98878, + "8865": 2.93356, + "8870": 2.90865, + "8875": 2.92162, + "8880": 2.9295, + "8885": 2.9207, + "8890": 2.9404, + "8895": 2.92179, + "8900": 2.94464, + "8905": 2.93594, + "8910": 2.91993, + "8915": 2.90336, + "8920": 2.91127, + "8925": 2.97428, + "8930": 2.96209, + "8935": 2.97189, + "8940": 2.94882, + "8945": 2.94789, + "8950": 2.9328, + "8955": 2.91679, + "8960": 2.89858, + "8965": 2.92721, + "8970": 2.94082, + "8975": 2.90449, + "8980": 2.89797, + "8985": 2.92102, + "8990": 2.9662, + "8995": 2.9373, + "9000": 2.89467, + "9005": 2.9399, + "9010": 2.97901, + "9015": 2.90311, + "9020": 2.90423, + "9025": 2.92238, + "9030": 2.94518, + "9035": 2.85736, + "9040": 2.93491, + "9045": 2.92378, + "9050": 2.96087, + "9055": 2.88884, + "9060": 
2.95609, + "9065": 2.98682, + "9070": 2.92665, + "9075": 2.94254, + "9080": 2.93301, + "9085": 2.9439, + "9090": 2.93648, + "9095": 2.89849, + "9100": 2.90017, + "9105": 2.89, + "9110": 2.93211, + "9115": 2.93981, + "9120": 2.97397, + "9125": 2.91648, + "9130": 2.92277, + "9135": 2.94086, + "9140": 2.94695, + "9145": 2.89447, + "9150": 2.92217, + "9155": 2.93169, + "9160": 2.93686, + "9165": 2.92557, + "9170": 2.9498, + "9175": 2.88716, + "9180": 2.93307, + "9185": 2.8947, + "9190": 2.94894, + "9195": 2.91222, + "9200": 2.93251, + "9205": 2.88702, + "9210": 2.93304, + "9215": 2.87965, + "9220": 2.90288, + "9225": 2.93315, + "9230": 2.86569, + "9235": 2.87842, + "9240": 2.89576, + "9245": 2.88279, + "9250": 2.88136, + "9255": 2.91192, + "9260": 2.87817, + "9265": 2.92175, + "9270": 2.89613, + "9275": 2.91313, + "9280": 2.91939, + "9285": 2.91903, + "9290": 2.93047, + "9295": 2.92844, + "9300": 2.87877, + "9305": 2.90909, + "9310": 2.89871, + "9315": 2.86609, + "9320": 2.86065, + "9325": 2.90436, + "9330": 2.95511, + "9335": 2.87572, + "9340": 2.93845, + "9345": 2.94693, + "9350": 2.9134, + "9355": 2.87737, + "9360": 2.89674, + "9365": 2.8823, + "9370": 2.93386, + "9375": 2.91236, + "9380": 2.86428, + "9385": 2.91358, + "9390": 2.92324, + "9395": 2.92024, + "9400": 2.89599, + "9405": 2.89197, + "9410": 2.9185, + "9415": 2.91775, + "9420": 2.89381, + "9425": 2.89983, + "9430": 2.87833, + "9435": 2.90417, + "9440": 2.89629, + "9445": 2.88366, + "9450": 2.89069, + "9455": 2.88969, + "9460": 2.94442, + "9465": 2.94721, + "9470": 2.88553, + "9475": 2.94033, + "9480": 2.88982, + "9485": 2.87815, + "9490": 2.89723, + "9495": 2.9225, + "9500": 2.89514, + "9505": 2.86794, + "9510": 2.894, + "9515": 2.90369, + "9520": 2.91102, + "9525": 2.89095, + "9530": 2.88696, + "9535": 2.91216 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 9535, + "step_interval": 5, + "values": { + "1": 1021640256.0, + "5": 1024063424.0, + "10": 1014250560.0, + "15": 1024077504.0, + "20": 
1022486144.0, + "25": 1041373312.0, + "30": 1028112896.0, + "35": 1035625088.0, + "40": 1026328384.0, + "45": 1022350080.0, + "50": 1030098560.0, + "55": 1028966144.0, + "60": 1036320640.0, + "65": 1034679168.0, + "70": 1029374848.0, + "75": 1028745088.0, + "80": 1047575040.0, + "85": 1029448064.0, + "90": 1020467392.0, + "95": 1028310016.0, + "100": 1040961344.0, + "105": 1039436544.0, + "110": 1026879104.0, + "115": 1052312832.0, + "120": 1018863104.0, + "125": 1045372160.0, + "130": 1034330368.0, + "135": 1016615680.0, + "140": 1038582272.0, + "145": 1020688640.0, + "150": 1039788096.0, + "155": 1032796928.0, + "160": 1020952640.0, + "165": 1032424512.0, + "170": 1017396096.0, + "175": 1033427072.0, + "180": 1036119424.0, + "185": 1030573760.0, + "190": 1035673984.0, + "195": 1034555520.0, + "200": 1040973824.0, + "205": 1048500352.0, + "210": 1054481024.0, + "215": 1025159552.0, + "220": 1044962496.0, + "225": 1038076416.0, + "230": 1026222720.0, + "235": 1051134976.0, + "240": 1029276416.0, + "245": 1031397824.0, + "250": 1027879616.0, + "255": 1016929792.0, + "260": 1045008896.0, + "265": 1021330688.0, + "270": 1030964864.0, + "275": 1036911744.0, + "280": 1031743488.0, + "285": 1015014016.0, + "290": 1018756352.0, + "295": 1017237504.0, + "300": 1034761152.0, + "305": 1032166144.0, + "310": 1035583104.0, + "315": 1012734272.0, + "320": 1008275072.0, + "325": 1042741760.0, + "330": 1042870656.0, + "335": 1033508480.0, + "340": 1014464512.0, + "345": 1042618880.0, + "350": 1031852736.0, + "355": 1050844800.0, + "360": 1030258432.0, + "365": 1034595648.0, + "370": 1019436032.0, + "375": 1022144832.0, + "380": 1021326592.0, + "385": 1025589504.0, + "390": 1023195072.0, + "395": 1019653952.0, + "400": 1033520512.0, + "405": 1023880192.0, + "410": 1017910016.0, + "415": 1024288000.0, + "420": 1020624256.0, + "425": 1025854848.0, + "430": 1033854336.0, + "435": 1028182400.0, + "440": 1022090752.0, + "445": 1036768256.0, + "450": 1024997376.0, + "455": 1013852096.0, 
+ "460": 1022093824.0, + "465": 1041431552.0, + "470": 1029038016.0, + "475": 1010065792.0, + "480": 1047607616.0, + "485": 1029724928.0, + "490": 1044668160.0, + "495": 1025229952.0, + "500": 1037464960.0, + "505": 1032181376.0, + "510": 1042853056.0, + "515": 1026159744.0, + "520": 1013409792.0, + "525": 1035147520.0, + "530": 1016375552.0, + "535": 1040113024.0, + "540": 1035052352.0, + "545": 1032113664.0, + "550": 1018673408.0, + "555": 1008638656.0, + "560": 1011927680.0, + "565": 1041824320.0, + "570": 1034942208.0, + "575": 1010199040.0, + "580": 1032210496.0, + "585": 1041262144.0, + "590": 1038867968.0, + "595": 1035743104.0, + "600": 1023772736.0, + "605": 1032294272.0, + "610": 1037748672.0, + "615": 1005974784.0, + "620": 1040407424.0, + "625": 1045209216.0, + "630": 1034414464.0, + "635": 1028523008.0, + "640": 1022644928.0, + "645": 1035876032.0, + "650": 1009255680.0, + "655": 997757696.0, + "660": 1029710464.0, + "665": 1025532608.0, + "670": 1048812288.0, + "675": 1025202688.0, + "680": 1019340032.0, + "685": 1027832512.0, + "690": 1029230080.0, + "695": 1040024576.0, + "700": 1042031680.0, + "705": 1034382976.0, + "710": 1020441792.0, + "715": 1031472128.0, + "720": 1040274560.0, + "725": 1023279936.0, + "730": 1022792704.0, + "735": 1025085696.0, + "740": 1038382656.0, + "745": 1045205504.0, + "750": 1013180928.0, + "755": 1031644032.0, + "760": 1032783552.0, + "765": 1027135936.0, + "770": 1023967232.0, + "775": 1025895168.0, + "780": 1038166464.0, + "785": 1025486400.0, + "790": 1040810624.0, + "795": 1032531200.0, + "800": 1039592768.0, + "805": 1024318016.0, + "810": 1034725632.0, + "815": 1036000448.0, + "820": 1035671552.0, + "825": 1051375360.0, + "830": 1035406784.0, + "835": 1022547776.0, + "840": 1036875648.0, + "845": 1025700352.0, + "850": 1048529920.0, + "855": 1014986432.0, + "860": 1033098624.0, + "865": 1031543040.0, + "870": 1040902912.0, + "875": 1023938304.0, + "880": 1028395904.0, + "885": 1054406656.0, + "890": 1019537152.0, 
+ "895": 1045189824.0, + "900": 1031772928.0, + "905": 1020970688.0, + "910": 1031386112.0, + "915": 1032926912.0, + "920": 1038459392.0, + "925": 1026754560.0, + "930": 1025378752.0, + "935": 1031126464.0, + "940": 1057933568.0, + "945": 1029823104.0, + "950": 1014412480.0, + "955": 1032173696.0, + "960": 1026152064.0, + "965": 1062678976.0, + "970": 1030096128.0, + "975": 1036903680.0, + "980": 1027049216.0, + "985": 1030676736.0, + "990": 1020676864.0, + "995": 1042301760.0, + "1000": 1036831616.0, + "1005": 1050206080.0, + "1010": 1023801984.0, + "1015": 1020539008.0, + "1020": 1042587392.0, + "1025": 1037943808.0, + "1030": 1049210048.0, + "1035": 1012483456.0, + "1040": 1023092032.0, + "1045": 1039520768.0, + "1050": 1026825728.0, + "1055": 1034861184.0, + "1060": 1046128704.0, + "1065": 1036804096.0, + "1070": 1019994880.0, + "1075": 1025341696.0, + "1080": 1014979200.0, + "1085": 1030007744.0, + "1090": 1029062016.0, + "1095": 1020309888.0, + "1100": 1039835008.0, + "1105": 1048600064.0, + "1110": 1020704448.0, + "1115": 1024782720.0, + "1120": 1061896576.0, + "1125": 1043311616.0, + "1130": 1031219456.0, + "1135": 1041360512.0, + "1140": 1021486272.0, + "1145": 1051696128.0, + "1150": 1035590400.0, + "1155": 1029590528.0, + "1160": 1042564800.0, + "1165": 1026810496.0, + "1170": 1018001408.0, + "1175": 1033684032.0, + "1180": 1035633536.0, + "1185": 1023928960.0, + "1190": 1033160320.0, + "1195": 1024228608.0, + "1200": 1039116544.0, + "1205": 1031740800.0, + "1210": 1053250560.0, + "1215": 1024617600.0, + "1220": 1009041280.0, + "1225": 1036679680.0, + "1230": 1041257984.0, + "1235": 1053974912.0, + "1240": 1030356224.0, + "1245": 1017684864.0, + "1250": 1022772992.0, + "1255": 1033439104.0, + "1260": 1034284736.0, + "1265": 1034003840.0, + "1270": 1037323264.0, + "1275": 1029345792.0, + "1280": 1046489856.0, + "1285": 1028285120.0, + "1290": 1036578176.0, + "1295": 1032421696.0, + "1300": 1033065728.0, + "1305": 1030027008.0, + "1310": 1051262976.0, + 
"1315": 1035373184.0, + "1320": 1028263936.0, + "1325": 1049972736.0, + "1330": 1030133376.0, + "1335": 1031164800.0, + "1340": 1012758912.0, + "1345": 1044639232.0, + "1350": 1034957312.0, + "1355": 1033623744.0, + "1360": 1036683392.0, + "1365": 1038588672.0, + "1370": 1039851904.0, + "1375": 1034117632.0, + "1380": 1022886656.0, + "1385": 1018084096.0, + "1390": 1049054400.0, + "1395": 1034868352.0, + "1400": 1034998144.0, + "1405": 1034131456.0, + "1410": 1036368256.0, + "1415": 1043577600.0, + "1420": 1026111104.0, + "1425": 1033320320.0, + "1430": 1012808128.0, + "1435": 1038394880.0, + "1440": 1020971904.0, + "1445": 1032459904.0, + "1450": 1014039296.0, + "1455": 1011673984.0, + "1460": 1043275904.0, + "1465": 1014361600.0, + "1470": 1020655360.0, + "1475": 1030231296.0, + "1480": 1029370496.0, + "1485": 1022997696.0, + "1490": 1026783360.0, + "1495": 1021815744.0, + "1500": 1027177088.0, + "1505": 1034882880.0, + "1510": 1014397120.0, + "1515": 1042136832.0, + "1520": 1025792640.0, + "1525": 1036335872.0, + "1530": 1039948992.0, + "1535": 1047640192.0, + "1540": 1043539840.0, + "1545": 1034043520.0, + "1550": 1016108736.0, + "1555": 1015573504.0, + "1560": 1055021824.0, + "1565": 1015593728.0, + "1570": 1018243840.0, + "1575": 1032515456.0, + "1580": 1012984768.0, + "1585": 1025327680.0, + "1590": 1034127360.0, + "1595": 1057393664.0, + "1600": 1026867584.0, + "1605": 1019994624.0, + "1610": 1031268736.0, + "1615": 1035274880.0, + "1620": 1018016000.0, + "1625": 1028272512.0, + "1630": 1027205376.0, + "1635": 1023799040.0, + "1640": 1034120832.0, + "1645": 1021814528.0, + "1650": 1015262080.0, + "1655": 1018280064.0, + "1660": 1047982976.0, + "1665": 1027060352.0, + "1670": 1048219904.0, + "1675": 1021102912.0, + "1680": 1043288320.0, + "1685": 1052719360.0, + "1690": 1026724032.0, + "1695": 1040385280.0, + "1700": 1018036352.0, + "1705": 1020480640.0, + "1710": 1021024448.0, + "1715": 1026932992.0, + "1720": 1028350208.0, + "1725": 1034363136.0, + "1730": 
1013692352.0, + "1735": 1018429696.0, + "1740": 1057257024.0, + "1745": 1029261952.0, + "1750": 1024357888.0, + "1755": 1029970112.0, + "1760": 1022192512.0, + "1765": 1040477056.0, + "1770": 1029669760.0, + "1775": 1046196864.0, + "1780": 1021955712.0, + "1785": 1035109376.0, + "1790": 1028263808.0, + "1795": 1031023616.0, + "1800": 1028300480.0, + "1805": 1025669248.0, + "1810": 1021556096.0, + "1815": 1033440256.0, + "1820": 1034885888.0, + "1825": 1020208448.0, + "1830": 1013885632.0, + "1835": 1031382272.0, + "1840": 1040391040.0, + "1845": 1034828800.0, + "1850": 1014480064.0, + "1855": 1019418816.0, + "1860": 1019569536.0, + "1865": 1035942400.0, + "1870": 1026242368.0, + "1875": 1031525248.0, + "1880": 1011590784.0, + "1885": 1041065536.0, + "1890": 1035000704.0, + "1895": 1028959488.0, + "1900": 1033997568.0, + "1905": 1027123776.0, + "1910": 1029217792.0, + "1915": 1030492864.0, + "1920": 1042920384.0, + "1925": 1038419392.0, + "1930": 1019304512.0, + "1935": 1032535936.0, + "1940": 1027806336.0, + "1945": 1034205056.0, + "1950": 1006036224.0, + "1955": 1032577600.0, + "1960": 1015720256.0, + "1965": 1029088512.0, + "1970": 1021554176.0, + "1975": 1034048000.0, + "1980": 1029366912.0, + "1985": 1027784960.0, + "1990": 1020947840.0, + "1995": 1010422912.0, + "2000": 1039617152.0, + "2005": 1001486208.0, + "2010": 1020422912.0, + "2015": 1032034048.0, + "2020": 1036298624.0, + "2025": 1037172352.0, + "2030": 1029770752.0, + "2035": 1040333312.0, + "2040": 1030112768.0, + "2045": 1032700800.0, + "2050": 1008016064.0, + "2055": 1045723840.0, + "2060": 1028142400.0, + "2065": 1038799488.0, + "2070": 1045645184.0, + "2075": 1035237952.0, + "2080": 1022882304.0, + "2085": 1024815424.0, + "2090": 1034363392.0, + "2095": 1005220672.0, + "2100": 1034644096.0, + "2105": 1035581312.0, + "2110": 1030685952.0, + "2115": 1029798528.0, + "2120": 1018846080.0, + "2125": 1021863168.0, + "2130": 1026638080.0, + "2135": 1053279488.0, + "2140": 1017060608.0, + "2145": 
1019635072.0, + "2150": 1037130752.0, + "2155": 1033302784.0, + "2160": 1049035776.0, + "2165": 1039682816.0, + "2170": 1020308096.0, + "2175": 1027338752.0, + "2180": 1041703168.0, + "2185": 1028895360.0, + "2190": 1029309888.0, + "2195": 1028944768.0, + "2200": 1039639680.0, + "2205": 1036972288.0, + "2210": 1031740544.0, + "2215": 1021404480.0, + "2220": 1020910848.0, + "2225": 1033403072.0, + "2230": 1014201856.0, + "2235": 1029395968.0, + "2240": 1029885184.0, + "2245": 1026005824.0, + "2250": 1046268800.0, + "2255": 1032951936.0, + "2260": 1047494592.0, + "2265": 1023721088.0, + "2270": 1022566144.0, + "2275": 1028537600.0, + "2280": 1034973568.0, + "2285": 1031819968.0, + "2290": 1038650048.0, + "2295": 1028816000.0, + "2300": 1034450496.0, + "2305": 1032314496.0, + "2310": 1013586496.0, + "2315": 1048182656.0, + "2320": 1035210368.0, + "2325": 1046966016.0, + "2330": 1014696192.0, + "2335": 1027382272.0, + "2340": 1036736512.0, + "2345": 1020186944.0, + "2350": 1031017728.0, + "2355": 1037474240.0, + "2360": 1032608128.0, + "2365": 1028041856.0, + "2370": 1021004224.0, + "2375": 1022912000.0, + "2380": 1048556224.0, + "2385": 1044140736.0, + "2390": 1021986816.0, + "2395": 1020595584.0, + "2400": 1026930816.0, + "2405": 1038387200.0, + "2410": 1045395200.0, + "2415": 1048454656.0, + "2420": 1032227712.0, + "2425": 1029562176.0, + "2430": 1030386176.0, + "2435": 1029217856.0, + "2440": 1029168000.0, + "2445": 1033132160.0, + "2450": 1038557824.0, + "2455": 1034721536.0, + "2460": 1039984192.0, + "2465": 1032500992.0, + "2470": 1024143872.0, + "2475": 1016539520.0, + "2480": 1023613248.0, + "2485": 1021030592.0, + "2490": 1035920448.0, + "2495": 1032967360.0, + "2500": 1028107008.0, + "2505": 1015385600.0, + "2510": 1030967104.0, + "2515": 1025700096.0, + "2520": 1033326208.0, + "2525": 1029692800.0, + "2530": 1023986560.0, + "2535": 1071069696.0, + "2540": 1024537984.0, + "2545": 1033798784.0, + "2550": 1029448064.0, + "2555": 1029183488.0, + "2560": 
1018115072.0, + "2565": 1031598528.0, + "2570": 1022847232.0, + "2575": 1026503104.0, + "2580": 1038622592.0, + "2585": 1025899456.0, + "2590": 1026100800.0, + "2595": 1046623104.0, + "2600": 1031103360.0, + "2605": 1001910656.0, + "2610": 1028423360.0, + "2615": 1025564544.0, + "2620": 1038651392.0, + "2625": 1026996352.0, + "2630": 1036831424.0, + "2635": 1021198400.0, + "2640": 1021865856.0, + "2645": 1039153408.0, + "2650": 1025943488.0, + "2655": 1013255808.0, + "2660": 1032645248.0, + "2665": 1035218048.0, + "2670": 1036437632.0, + "2675": 1039296064.0, + "2680": 1041661696.0, + "2685": 1034565504.0, + "2690": 1058871168.0, + "2695": 1019879552.0, + "2700": 1062626816.0, + "2705": 1035376320.0, + "2710": 1019542400.0, + "2715": 1031885824.0, + "2720": 1016403200.0, + "2725": 1040594688.0, + "2730": 1019586688.0, + "2735": 1030889856.0, + "2740": 1029290752.0, + "2745": 1040687744.0, + "2750": 1023880448.0, + "2755": 1011865664.0, + "2760": 1027684864.0, + "2765": 1030882240.0, + "2770": 1033119872.0, + "2775": 1026332352.0, + "2780": 1033684224.0, + "2785": 1024589888.0, + "2790": 1033734272.0, + "2795": 1045949184.0, + "2800": 1040286016.0, + "2805": 1019944192.0, + "2810": 1031449600.0, + "2815": 1030932736.0, + "2820": 1037855616.0, + "2825": 1041684096.0, + "2830": 1030459904.0, + "2835": 1013508352.0, + "2840": 1031449600.0, + "2845": 1030129920.0, + "2850": 1026617600.0, + "2855": 1024705280.0, + "2860": 1031700096.0, + "2865": 1027428800.0, + "2870": 1026690048.0, + "2875": 1012777024.0, + "2880": 1038301568.0, + "2885": 1017901184.0, + "2890": 1044200064.0, + "2895": 1036459136.0, + "2900": 1030652928.0, + "2905": 1035957376.0, + "2910": 1038718272.0, + "2915": 1039385408.0, + "2920": 1034781248.0, + "2925": 1043267840.0, + "2930": 1038229696.0, + "2935": 1021222144.0, + "2940": 1042307456.0, + "2945": 1045232384.0, + "2950": 1047525952.0, + "2955": 1034172928.0, + "2960": 1020891904.0, + "2965": 1027307840.0, + "2970": 1038796288.0, + "2975": 
1034007296.0, + "2980": 1049590400.0, + "2985": 1034846016.0, + "2990": 1026008576.0, + "2995": 1034919296.0, + "3000": 1039017856.0, + "3005": 1038158848.0, + "3010": 1010907712.0, + "3015": 1044976064.0, + "3020": 1034050688.0, + "3025": 1037763840.0, + "3030": 1027722816.0, + "3035": 1041821056.0, + "3040": 1035311872.0, + "3045": 1027255296.0, + "3050": 1029708032.0, + "3055": 1028029568.0, + "3060": 1049976960.0, + "3065": 1024067200.0, + "3070": 1011545728.0, + "3075": 1042846272.0, + "3080": 1036094912.0, + "3085": 1030387456.0, + "3090": 1035262976.0, + "3095": 1013803008.0, + "3100": 1030144896.0, + "3105": 1017609088.0, + "3110": 1033370816.0, + "3115": 1023737728.0, + "3120": 1024877504.0, + "3125": 1046537216.0, + "3130": 1024676160.0, + "3135": 1025722496.0, + "3140": 1043778176.0, + "3145": 1044372672.0, + "3150": 1016483328.0, + "3155": 1042487936.0, + "3160": 1026834688.0, + "3165": 1031199360.0, + "3170": 1024332800.0, + "3175": 1024368640.0, + "3180": 1018204288.0, + "3185": 1034352512.0, + "3190": 1019221888.0, + "3195": 1028425408.0, + "3200": 1036080640.0, + "3205": 1016076160.0, + "3210": 1034109312.0, + "3215": 1031349312.0, + "3220": 1040833664.0, + "3225": 1022835008.0, + "3230": 1033255744.0, + "3235": 1019975488.0, + "3240": 1038131840.0, + "3245": 1031643136.0, + "3250": 1022390656.0, + "3255": 1032876672.0, + "3260": 1037751616.0, + "3265": 1021622656.0, + "3270": 1031242880.0, + "3275": 1038461184.0, + "3280": 1023236992.0, + "3285": 1031615424.0, + "3290": 1045247616.0, + "3295": 1043177536.0, + "3300": 1035084224.0, + "3305": 1042662400.0, + "3310": 1058092096.0, + "3315": 1024282880.0, + "3320": 1046015296.0, + "3325": 1023179008.0, + "3330": 1048037248.0, + "3335": 1036690560.0, + "3340": 1042123392.0, + "3345": 1030897920.0, + "3350": 1020621696.0, + "3355": 1025960576.0, + "3360": 1030305344.0, + "3365": 1031171520.0, + "3370": 1036454144.0, + "3375": 1023472384.0, + "3380": 1032383744.0, + "3385": 1038081536.0, + "3390": 
1052811072.0, + "3395": 1012090496.0, + "3400": 1019209600.0, + "3405": 1021780224.0, + "3410": 1028433728.0, + "3415": 1058222400.0, + "3420": 1033492480.0, + "3425": 1029580352.0, + "3430": 1021150976.0, + "3435": 1034991872.0, + "3440": 1017961600.0, + "3445": 1025537280.0, + "3450": 1032254336.0, + "3455": 1036261312.0, + "3460": 1052071808.0, + "3465": 1027114240.0, + "3470": 1043729536.0, + "3475": 1033265792.0, + "3480": 1026619776.0, + "3485": 1029215232.0, + "3490": 1041041408.0, + "3495": 1019252224.0, + "3500": 1032059904.0, + "3505": 1025753728.0, + "3510": 1044367616.0, + "3515": 1013817280.0, + "3520": 1021846400.0, + "3525": 1032175552.0, + "3530": 1029789056.0, + "3535": 1034568704.0, + "3540": 1017731456.0, + "3545": 1035658880.0, + "3550": 1024535296.0, + "3555": 1035866112.0, + "3560": 1029737600.0, + "3565": 1028900160.0, + "3570": 1046029888.0, + "3575": 1039186304.0, + "3580": 1010838336.0, + "3585": 1031737728.0, + "3590": 1041450688.0, + "3595": 1037636800.0, + "3600": 1032763584.0, + "3605": 1045822272.0, + "3610": 1039235200.0, + "3615": 1036870144.0, + "3620": 1026929664.0, + "3625": 1033931136.0, + "3630": 1017582464.0, + "3635": 1026629056.0, + "3640": 1039529088.0, + "3645": 1022655872.0, + "3650": 1036842624.0, + "3655": 1023990144.0, + "3660": 1014987456.0, + "3665": 1026118784.0, + "3670": 1041672448.0, + "3675": 1033250304.0, + "3680": 1015353984.0, + "3685": 1029122304.0, + "3690": 1026204416.0, + "3695": 1043800832.0, + "3700": 1028613504.0, + "3705": 1049485312.0, + "3710": 1027180672.0, + "3715": 1016134912.0, + "3720": 1040818560.0, + "3725": 1032763776.0, + "3730": 1030920960.0, + "3735": 1019008640.0, + "3740": 1023825600.0, + "3745": 1046289152.0, + "3750": 1034462336.0, + "3755": 1032090048.0, + "3760": 1019366912.0, + "3765": 1031916736.0, + "3770": 1026677120.0, + "3775": 1035708288.0, + "3780": 1030671104.0, + "3785": 1027208128.0, + "3790": 1019584064.0, + "3795": 1030306048.0, + "3800": 1035614976.0, + "3805": 
1035423360.0, + "3810": 1033294144.0, + "3815": 1033988608.0, + "3820": 1041105792.0, + "3825": 1024534976.0, + "3830": 1037630528.0, + "3835": 1040347968.0, + "3840": 1023445888.0, + "3845": 1048466688.0, + "3850": 1052489280.0, + "3855": 1028907264.0, + "3860": 1019532672.0, + "3865": 1035487744.0, + "3870": 1028491712.0, + "3875": 1041164800.0, + "3880": 1048854912.0, + "3885": 1027725248.0, + "3890": 1027487616.0, + "3895": 1034190592.0, + "3900": 1027645312.0, + "3905": 1027976128.0, + "3910": 1041572480.0, + "3915": 1043995392.0, + "3920": 1041063424.0, + "3925": 1030836160.0, + "3930": 1027072896.0, + "3935": 1033782016.0, + "3940": 1042275712.0, + "3945": 1036248064.0, + "3950": 1021430976.0, + "3955": 1036304128.0, + "3960": 1024184192.0, + "3965": 1027065856.0, + "3970": 1015984640.0, + "3975": 1041421632.0, + "3980": 1032455488.0, + "3985": 1037680640.0, + "3990": 1038684992.0, + "3995": 1023654528.0, + "4000": 1054410240.0, + "4005": 1029983424.0, + "4010": 1025138112.0, + "4015": 1030978560.0, + "4020": 1018472448.0, + "4025": 1027124352.0, + "4030": 1010306816.0, + "4035": 1038641088.0, + "4040": 1022256640.0, + "4045": 1025038208.0, + "4050": 1032348800.0, + "4055": 1022420864.0, + "4060": 1024520768.0, + "4065": 1032871168.0, + "4070": 1027791232.0, + "4075": 1025596928.0, + "4080": 1029366656.0, + "4085": 1020823552.0, + "4090": 1033322496.0, + "4095": 1024142656.0, + "4100": 1040948864.0, + "4105": 1027266496.0, + "4110": 1038791424.0, + "4115": 1023497088.0, + "4120": 1038943168.0, + "4125": 1048274176.0, + "4130": 1021490752.0, + "4135": 1034570880.0, + "4140": 1034613824.0, + "4145": 1044447232.0, + "4150": 1000353664.0, + "4155": 1028363392.0, + "4160": 1024242624.0, + "4165": 1033688704.0, + "4170": 1018888000.0, + "4175": 1026492608.0, + "4180": 1045409024.0, + "4185": 1033631616.0, + "4190": 1029574592.0, + "4195": 1038777984.0, + "4200": 1025102336.0, + "4205": 1019074816.0, + "4210": 1029560704.0, + "4215": 1032269184.0, + "4220": 
1026242048.0, + "4225": 1031925888.0, + "4230": 1030269824.0, + "4235": 1027603328.0, + "4240": 1031480832.0, + "4245": 1028765056.0, + "4250": 1026987008.0, + "4255": 1021240064.0, + "4260": 1042082432.0, + "4265": 1025411200.0, + "4270": 1030169984.0, + "4275": 1012472448.0, + "4280": 1044505600.0, + "4285": 1019898304.0, + "4290": 1033058560.0, + "4295": 1033596032.0, + "4300": 1031638912.0, + "4305": 1023847936.0, + "4310": 1021568512.0, + "4315": 1047221504.0, + "4320": 1026520576.0, + "4325": 1005865600.0, + "4330": 1037666688.0, + "4335": 1022006464.0, + "4340": 1029009920.0, + "4345": 1033474496.0, + "4350": 1036886144.0, + "4355": 1026808832.0, + "4360": 1022938240.0, + "4365": 1028779648.0, + "4370": 1029624704.0, + "4375": 1042196864.0, + "4380": 1016100096.0, + "4385": 1045551296.0, + "4390": 1026270848.0, + "4395": 1029796416.0, + "4400": 1047365760.0, + "4405": 1029297344.0, + "4410": 1033424256.0, + "4415": 1028298304.0, + "4420": 1028148928.0, + "4425": 1033575552.0, + "4430": 1031374592.0, + "4435": 1028571136.0, + "4440": 1033123328.0, + "4445": 1028293504.0, + "4450": 1052210944.0, + "4455": 1026286080.0, + "4460": 1034885888.0, + "4465": 1031725696.0, + "4470": 1035446528.0, + "4475": 1036971712.0, + "4480": 1025117824.0, + "4485": 1034104960.0, + "4490": 1024630912.0, + "4495": 1047974912.0, + "4500": 1024707840.0, + "4505": 1038850048.0, + "4510": 1043723776.0, + "4515": 1044276736.0, + "4520": 1036872320.0, + "4525": 1058073536.0, + "4530": 1030973568.0, + "4535": 1032592256.0, + "4540": 1036428160.0, + "4545": 1025726400.0, + "4550": 1021749312.0, + "4555": 1037546112.0, + "4560": 1020099200.0, + "4565": 1036055296.0, + "4570": 1020501120.0, + "4575": 1050412608.0, + "4580": 1010437888.0, + "4585": 1022960768.0, + "4590": 1039710272.0, + "4595": 1023274880.0, + "4600": 1042477824.0, + "4605": 1039746688.0, + "4610": 1046104192.0, + "4615": 1017999744.0, + "4620": 1044734592.0, + "4625": 1030479104.0, + "4630": 1027260800.0, + "4635": 
1026995200.0, + "4640": 1034901248.0, + "4645": 1036420352.0, + "4650": 1033711488.0, + "4655": 1035461056.0, + "4660": 1035324800.0, + "4665": 1020265664.0, + "4670": 1020057344.0, + "4675": 1054848768.0, + "4680": 1024895872.0, + "4685": 1027820160.0, + "4690": 1034449664.0, + "4695": 1039151744.0, + "4700": 1038865024.0, + "4705": 1027655808.0, + "4710": 1020522560.0, + "4715": 1031825536.0, + "4720": 1030300416.0, + "4725": 1030298368.0, + "4730": 1044096704.0, + "4735": 1046133376.0, + "4740": 1036178112.0, + "4745": 1039043840.0, + "4750": 1031790528.0, + "4755": 1047723392.0, + "4760": 1026178176.0, + "4765": 1034695040.0, + "4770": 1036521856.0, + "4775": 1029375168.0, + "4780": 1028543488.0, + "4785": 1028414976.0, + "4790": 1019620224.0, + "4795": 1033060160.0, + "4800": 1051866880.0, + "4805": 1015414400.0, + "4810": 1029454336.0, + "4815": 1009572096.0, + "4820": 1041051200.0, + "4825": 1026708608.0, + "4830": 1020450816.0, + "4835": 1051307840.0, + "4840": 1019456512.0, + "4845": 1032315008.0, + "4850": 1036794496.0, + "4855": 1031052736.0, + "4860": 1033131776.0, + "4865": 1032064384.0, + "4870": 1049832576.0, + "4875": 1025110528.0, + "4880": 1048476160.0, + "4885": 1016853056.0, + "4890": 1037317312.0, + "4895": 1024323136.0, + "4900": 1043374208.0, + "4905": 1033397120.0, + "4910": 1032830272.0, + "4915": 1016889856.0, + "4920": 1022294784.0, + "4925": 1034965888.0, + "4930": 1034630016.0, + "4935": 1025885312.0, + "4940": 1048398272.0, + "4945": 1025248576.0, + "4950": 1024208768.0, + "4955": 1007485952.0, + "4960": 1040213824.0, + "4965": 1018775296.0, + "4970": 1014274688.0, + "4975": 1038025472.0, + "4980": 1020917888.0, + "4985": 1029045888.0, + "4990": 1028394816.0, + "4995": 1032020480.0, + "5000": 1039791104.0, + "5005": 1024351552.0, + "5010": 1029147968.0, + "5015": 1021807296.0, + "5020": 1023506944.0, + "5025": 1037603456.0, + "5030": 1041947136.0, + "5035": 1047130304.0, + "5040": 1060956096.0, + "5045": 1032108544.0, + "5050": 
1029534336.0, + "5055": 1024552192.0, + "5060": 1035282304.0, + "5065": 1021205504.0, + "5070": 1035756288.0, + "5075": 1015771264.0, + "5080": 1027040064.0, + "5085": 1021792192.0, + "5090": 1034973568.0, + "5095": 1015499712.0, + "5100": 1032257600.0, + "5105": 1017981568.0, + "5110": 1019586304.0, + "5115": 1036063936.0, + "5120": 1032695040.0, + "5125": 1019076992.0, + "5130": 1033404672.0, + "5135": 1041203072.0, + "5140": 1026258752.0, + "5145": 1033705856.0, + "5150": 1022043520.0, + "5155": 1032265664.0, + "5160": 1039625984.0, + "5165": 1031576448.0, + "5170": 1035555328.0, + "5175": 1026116224.0, + "5180": 1030316032.0, + "5185": 1024495680.0, + "5190": 1019492608.0, + "5195": 1035626496.0, + "5200": 1016905344.0, + "5205": 1013435648.0, + "5210": 1049395456.0, + "5215": 1030833280.0, + "5220": 1025276800.0, + "5225": 1035239936.0, + "5230": 1025930624.0, + "5235": 1025120000.0, + "5240": 1046308224.0, + "5245": 1022740608.0, + "5250": 1027062336.0, + "5255": 1023887360.0, + "5260": 1033821440.0, + "5265": 1045733696.0, + "5270": 1052500480.0, + "5275": 1033018112.0, + "5280": 1030073920.0, + "5285": 1025212608.0, + "5290": 1026575616.0, + "5295": 1032653440.0, + "5300": 1024367872.0, + "5305": 1029634368.0, + "5310": 1033197312.0, + "5315": 1032988992.0, + "5320": 1019521664.0, + "5325": 1022718336.0, + "5330": 1021335168.0, + "5335": 1039275776.0, + "5340": 1037219648.0, + "5345": 1039188096.0, + "5350": 1023701888.0, + "5355": 1029935872.0, + "5360": 1047046080.0, + "5365": 1037426432.0, + "5370": 1024381568.0, + "5375": 1042070656.0, + "5380": 1020368384.0, + "5385": 1021765696.0, + "5390": 1035133184.0, + "5395": 1049653568.0, + "5400": 1026015744.0, + "5405": 1036453120.0, + "5410": 1027635776.0, + "5415": 1042285824.0, + "5420": 1039941888.0, + "5425": 1028381184.0, + "5430": 1043799808.0, + "5435": 1032653312.0, + "5440": 1033384448.0, + "5445": 1034144640.0, + "5450": 1025299328.0, + "5455": 1034079424.0, + "5460": 1026812416.0, + "5465": 
1027399552.0, + "5470": 1028969216.0, + "5475": 1037233920.0, + "5480": 1023830272.0, + "5485": 1019186752.0, + "5490": 1030891520.0, + "5495": 1029399424.0, + "5500": 1032681216.0, + "5505": 1018275200.0, + "5510": 1023987648.0, + "5515": 1025156032.0, + "5520": 1039527296.0, + "5525": 1018024576.0, + "5530": 1037663936.0, + "5535": 1031599232.0, + "5540": 1027564544.0, + "5545": 1033212160.0, + "5550": 1032115968.0, + "5555": 1044802304.0, + "5560": 1028511232.0, + "5565": 1029686016.0, + "5570": 1042027776.0, + "5575": 1025379392.0, + "5580": 1023716736.0, + "5585": 1044093696.0, + "5590": 1041319936.0, + "5595": 1031549824.0, + "5600": 1023400320.0, + "5605": 1040115456.0, + "5610": 1034087552.0, + "5615": 1021042816.0, + "5620": 1031004800.0, + "5625": 1030188544.0, + "5630": 1023502080.0, + "5635": 1026684096.0, + "5640": 1034589120.0, + "5645": 1018655744.0, + "5650": 1052378752.0, + "5655": 1048933504.0, + "5660": 1050077696.0, + "5665": 1033958144.0, + "5670": 1033750016.0, + "5675": 1025392640.0, + "5680": 1039378304.0, + "5685": 1033056576.0, + "5690": 1031464576.0, + "5695": 1021946368.0, + "5700": 1038065664.0, + "5705": 1043684736.0, + "5710": 1057231616.0, + "5715": 1014462848.0, + "5720": 1021258816.0, + "5725": 1041822272.0, + "5730": 1039454912.0, + "5735": 1025128576.0, + "5740": 1026045440.0, + "5745": 1036990208.0, + "5750": 1044552256.0, + "5755": 1011860416.0, + "5760": 1028389568.0, + "5765": 1028245504.0, + "5770": 1021530368.0, + "5775": 1051210240.0, + "5780": 1034984512.0, + "5785": 1037513920.0, + "5790": 1016957184.0, + "5795": 1027873536.0, + "5800": 1029780736.0, + "5805": 1050694912.0, + "5810": 1018478336.0, + "5815": 1036123520.0, + "5820": 1048408704.0, + "5825": 1030977920.0, + "5830": 1031572096.0, + "5835": 1034045440.0, + "5840": 1039843776.0, + "5845": 1021746048.0, + "5850": 1029807744.0, + "5855": 1038789376.0, + "5860": 1031436288.0, + "5865": 1026397568.0, + "5870": 1029861824.0, + "5875": 1032841856.0, + "5880": 
1032675968.0, + "5885": 1024576128.0, + "5890": 1026798976.0, + "5895": 1015796160.0, + "5900": 1049707008.0, + "5905": 1025653248.0, + "5910": 1019150720.0, + "5915": 1042739136.0, + "5920": 1028047232.0, + "5925": 1034016448.0, + "5930": 1030963328.0, + "5935": 1038102784.0, + "5940": 1019172864.0, + "5945": 1025130112.0, + "5950": 1035530240.0, + "5955": 1050437184.0, + "5960": 1024548736.0, + "5965": 1029923712.0, + "5970": 1016427776.0, + "5975": 1036682752.0, + "5980": 1024118464.0, + "5985": 1035386624.0, + "5990": 1010550784.0, + "5995": 1047019200.0, + "6000": 1021245568.0, + "6005": 1040460416.0, + "6010": 1025358720.0, + "6015": 1050179072.0, + "6020": 1039514496.0, + "6025": 1030254592.0, + "6030": 1025931968.0, + "6035": 1021745408.0, + "6040": 1034117056.0, + "6045": 1028282112.0, + "6050": 1020112320.0, + "6055": 1040397056.0, + "6060": 1026347008.0, + "6065": 1022198400.0, + "6070": 1040668416.0, + "6075": 1046037440.0, + "6080": 1038583168.0, + "6085": 1041485568.0, + "6090": 1037205888.0, + "6095": 1036282880.0, + "6100": 1030454720.0, + "6105": 1019216640.0, + "6110": 1035357824.0, + "6115": 1019452544.0, + "6120": 1032188800.0, + "6125": 1020922624.0, + "6130": 1012013952.0, + "6135": 1038733824.0, + "6140": 1041736896.0, + "6145": 1041917056.0, + "6150": 1018958208.0, + "6155": 1024649344.0, + "6160": 1047972160.0, + "6165": 1050408832.0, + "6170": 1032505344.0, + "6175": 1045793664.0, + "6180": 1040067072.0, + "6185": 1029710464.0, + "6190": 1023293760.0, + "6195": 1050897728.0, + "6200": 1035035776.0, + "6205": 1036275584.0, + "6210": 1039772736.0, + "6215": 1033200256.0, + "6220": 1026162432.0, + "6225": 1036741120.0, + "6230": 1025144192.0, + "6235": 1019352832.0, + "6240": 1057104384.0, + "6245": 1018413952.0, + "6250": 1035337344.0, + "6255": 1025380992.0, + "6260": 1034863744.0, + "6265": 1027703424.0, + "6270": 1042116480.0, + "6275": 1037659008.0, + "6280": 1018270208.0, + "6285": 1032642304.0, + "6290": 1038598592.0, + "6295": 
1031803456.0, + "6300": 1034635200.0, + "6305": 1011066624.0, + "6310": 1039458624.0, + "6315": 1030054272.0, + "6320": 1030534208.0, + "6325": 1038642496.0, + "6330": 1033908800.0, + "6335": 1032297856.0, + "6340": 1033544448.0, + "6345": 1031036416.0, + "6350": 1037451264.0, + "6355": 1028075968.0, + "6360": 1043313408.0, + "6365": 1025223808.0, + "6370": 1033939200.0, + "6375": 1036038720.0, + "6380": 1029108096.0, + "6385": 1025395072.0, + "6390": 1025517952.0, + "6395": 1048611584.0, + "6400": 1040734976.0, + "6405": 1024247936.0, + "6410": 1017489280.0, + "6415": 1042827072.0, + "6420": 1025202432.0, + "6425": 1027164928.0, + "6430": 1040568256.0, + "6435": 1022908800.0, + "6440": 1047994624.0, + "6445": 1036089088.0, + "6450": 1048532224.0, + "6455": 1037272320.0, + "6460": 1036750912.0, + "6465": 1033652032.0, + "6470": 1018135232.0, + "6475": 1034691648.0, + "6480": 1028994048.0, + "6485": 1033258880.0, + "6490": 1035638656.0, + "6495": 1024470016.0, + "6500": 1020572096.0, + "6505": 1059327104.0, + "6510": 1020472576.0, + "6515": 1018688064.0, + "6520": 1051470592.0, + "6525": 1035544512.0, + "6530": 1027897216.0, + "6535": 1022722240.0, + "6540": 1023273984.0, + "6545": 1033173120.0, + "6550": 1029488512.0, + "6555": 1029575296.0, + "6560": 1056438784.0, + "6565": 1054295040.0, + "6570": 1032319040.0, + "6575": 1041208320.0, + "6580": 1028134400.0, + "6585": 1036504832.0, + "6590": 1042456192.0, + "6595": 1038568832.0, + "6600": 1031388096.0, + "6605": 1045715456.0, + "6610": 1034713472.0, + "6615": 1015576448.0, + "6620": 1039115136.0, + "6625": 1054654208.0, + "6630": 1043092928.0, + "6635": 1032226304.0, + "6640": 1016738496.0, + "6645": 1016178816.0, + "6650": 1034692672.0, + "6655": 1031753472.0, + "6660": 1041401920.0, + "6665": 1024657984.0, + "6670": 1023820032.0, + "6675": 1038306176.0, + "6680": 1025624064.0, + "6685": 1045394048.0, + "6690": 1046390720.0, + "6695": 1027754368.0, + "6700": 1033473920.0, + "6705": 1038857152.0, + "6710": 
1047485888.0, + "6715": 1043229440.0, + "6720": 1022995456.0, + "6725": 1018910144.0, + "6730": 1027525504.0, + "6735": 1016937856.0, + "6740": 1027238016.0, + "6745": 1030263680.0, + "6750": 1006373760.0, + "6755": 1034765056.0, + "6760": 1040735296.0, + "6765": 1023827008.0, + "6770": 1036441344.0, + "6775": 1019627712.0, + "6780": 1043723904.0, + "6785": 1037409280.0, + "6790": 1029403072.0, + "6795": 1026349440.0, + "6800": 1036628224.0, + "6805": 1024579712.0, + "6810": 1042340544.0, + "6815": 1035274112.0, + "6820": 1022594880.0, + "6825": 1034793344.0, + "6830": 1029862400.0, + "6835": 1041609600.0, + "6840": 1042283776.0, + "6845": 1018954624.0, + "6850": 1032171136.0, + "6855": 1034434752.0, + "6860": 1042054848.0, + "6865": 1021813568.0, + "6870": 1037015424.0, + "6875": 1030379968.0, + "6880": 1029360768.0, + "6885": 1030435968.0, + "6890": 1039890432.0, + "6895": 1027267712.0, + "6900": 1035174016.0, + "6905": 1043975424.0, + "6910": 1019763072.0, + "6915": 1017476608.0, + "6920": 1017184256.0, + "6925": 1030650688.0, + "6930": 1036672384.0, + "6935": 1042835712.0, + "6940": 1040313216.0, + "6945": 1044196992.0, + "6950": 1040513472.0, + "6955": 1036112704.0, + "6960": 1036436224.0, + "6965": 1019161024.0, + "6970": 1034729088.0, + "6975": 1019134464.0, + "6980": 1028436160.0, + "6985": 1023240128.0, + "6990": 1026994688.0, + "6995": 1027547520.0, + "7000": 1058819840.0, + "7005": 1013737856.0, + "7010": 1028959488.0, + "7015": 1037288768.0, + "7020": 1011880576.0, + "7025": 1017313280.0, + "7030": 1028301440.0, + "7035": 1035955392.0, + "7040": 1042966016.0, + "7045": 1028185856.0, + "7050": 1017979584.0, + "7055": 1035088000.0, + "7060": 1051802624.0, + "7065": 1007664640.0, + "7070": 1035819008.0, + "7075": 1031039552.0, + "7080": 1026143296.0, + "7085": 1044906432.0, + "7090": 1046261760.0, + "7095": 1043760512.0, + "7100": 1035089024.0, + "7105": 1049143296.0, + "7110": 1010962944.0, + "7115": 1033869504.0, + "7120": 1031267456.0, + "7125": 
1037496832.0, + "7130": 1024881856.0, + "7135": 1031991808.0, + "7140": 1019090176.0, + "7145": 1033081088.0, + "7150": 1037554112.0, + "7155": 1015729728.0, + "7160": 1024724608.0, + "7165": 1030895808.0, + "7170": 1037367808.0, + "7175": 1028816896.0, + "7180": 1037633280.0, + "7185": 1016174080.0, + "7190": 1019808128.0, + "7195": 1040915392.0, + "7200": 1041375360.0, + "7205": 1026538240.0, + "7210": 1022638720.0, + "7215": 1041890560.0, + "7220": 1017742720.0, + "7225": 1027296640.0, + "7230": 1030200448.0, + "7235": 1035726848.0, + "7240": 1037854848.0, + "7245": 1023971008.0, + "7250": 1044708096.0, + "7255": 1031900480.0, + "7260": 1030128256.0, + "7265": 1036887104.0, + "7270": 1050097152.0, + "7275": 1029225216.0, + "7280": 1020231808.0, + "7285": 1029842048.0, + "7290": 1017219328.0, + "7295": 1029139584.0, + "7300": 1031533824.0, + "7305": 1027298176.0, + "7310": 1029089664.0, + "7315": 1022782272.0, + "7320": 1036458176.0, + "7325": 1036851840.0, + "7330": 1021706496.0, + "7335": 1030715904.0, + "7340": 1039382976.0, + "7345": 1040177664.0, + "7350": 1034973568.0, + "7355": 1033656320.0, + "7360": 1031254912.0, + "7365": 1048742016.0, + "7370": 1027298304.0, + "7375": 1041854848.0, + "7380": 1016725760.0, + "7385": 1017578368.0, + "7390": 1017234944.0, + "7395": 1046793600.0, + "7400": 1048441216.0, + "7405": 1013394304.0, + "7410": 1017386368.0, + "7415": 1017815360.0, + "7420": 1028043008.0, + "7425": 1012840576.0, + "7430": 1034042368.0, + "7435": 1032530432.0, + "7440": 1002692928.0, + "7445": 1034451200.0, + "7450": 1039304832.0, + "7455": 1019027008.0, + "7460": 1014740928.0, + "7465": 1027204736.0, + "7470": 1030422784.0, + "7475": 1033792064.0, + "7480": 1043317376.0, + "7485": 1038215168.0, + "7490": 1049000960.0, + "7495": 1028982720.0, + "7500": 1027426816.0, + "7505": 1028695936.0, + "7510": 1048886528.0, + "7515": 1035648704.0, + "7520": 1017198848.0, + "7525": 1036572736.0, + "7530": 1029261952.0, + "7535": 1027190144.0, + "7540": 
1028338048.0, + "7545": 1025986304.0, + "7550": 1023025856.0, + "7555": 1033025344.0, + "7560": 1031404672.0, + "7565": 1022710528.0, + "7570": 1037591552.0, + "7575": 1022603136.0, + "7580": 1018123584.0, + "7585": 1033054208.0, + "7590": 1010993280.0, + "7595": 1018260352.0, + "7600": 1049904448.0, + "7605": 1037361216.0, + "7610": 1040415744.0, + "7615": 1035247488.0, + "7620": 1024230912.0, + "7625": 1020317184.0, + "7630": 1034939584.0, + "7635": 1043224192.0, + "7640": 1033491520.0, + "7645": 1034444608.0, + "7650": 1039804800.0, + "7655": 1031240576.0, + "7660": 1056628096.0, + "7665": 1031076096.0, + "7670": 1033685120.0, + "7675": 1030681600.0, + "7680": 1035398720.0, + "7685": 1018661760.0, + "7690": 1031921024.0, + "7695": 1025858880.0, + "7700": 1017715200.0, + "7705": 1036531200.0, + "7710": 1029893248.0, + "7715": 1053230656.0, + "7720": 1019514240.0, + "7725": 1042193216.0, + "7730": 1035620992.0, + "7735": 1020726144.0, + "7740": 1045576128.0, + "7745": 1026932992.0, + "7750": 1048550208.0, + "7755": 1022539264.0, + "7760": 1049532032.0, + "7765": 1029370176.0, + "7770": 1018375296.0, + "7775": 1021364672.0, + "7780": 1039770624.0, + "7785": 1039914112.0, + "7790": 1030516992.0, + "7795": 1039353728.0, + "7800": 1028187904.0, + "7805": 1027635776.0, + "7810": 1020970368.0, + "7815": 1035878400.0, + "7820": 1017666240.0, + "7825": 1018067392.0, + "7830": 1035104128.0, + "7835": 1044507648.0, + "7840": 1027836224.0, + "7845": 1032101504.0, + "7850": 1034609408.0, + "7855": 1025464832.0, + "7860": 1059051648.0, + "7865": 1016626240.0, + "7870": 1033729408.0, + "7875": 1044185600.0, + "7880": 1029084352.0, + "7885": 1040308288.0, + "7890": 1029556480.0, + "7895": 1032947008.0, + "7900": 1021409216.0, + "7905": 1020955904.0, + "7910": 1008993856.0, + "7915": 1023120768.0, + "7920": 1023070976.0, + "7925": 1030094080.0, + "7930": 1020712704.0, + "7935": 1019443776.0, + "7940": 1017809152.0, + "7945": 1014447552.0, + "7950": 1026303616.0, + "7955": 
1034518272.0, + "7960": 1056026304.0, + "7965": 1031047872.0, + "7970": 1030417152.0, + "7975": 1022189888.0, + "7980": 1034474624.0, + "7985": 1047305024.0, + "7990": 1032066176.0, + "7995": 1044264704.0, + "8000": 1028876672.0, + "8005": 1028045440.0, + "8010": 1050665408.0, + "8015": 1019758976.0, + "8020": 1043297408.0, + "8025": 1039018560.0, + "8030": 1030868800.0, + "8035": 1045304192.0, + "8040": 1026310784.0, + "8045": 1024970368.0, + "8050": 1018405632.0, + "8055": 1033736960.0, + "8060": 1012986816.0, + "8065": 1022016640.0, + "8070": 1034776064.0, + "8075": 1042759616.0, + "8080": 1027758784.0, + "8085": 1037205376.0, + "8090": 1007008256.0, + "8095": 1030374528.0, + "8100": 1030726016.0, + "8105": 1027794944.0, + "8110": 1031557248.0, + "8115": 1037685248.0, + "8120": 1037692992.0, + "8125": 1031097472.0, + "8130": 1028627072.0, + "8135": 1029680256.0, + "8140": 1049904256.0, + "8145": 1043463552.0, + "8150": 1040087424.0, + "8155": 1046780288.0, + "8160": 1010199040.0, + "8165": 1031657728.0, + "8170": 1024483264.0, + "8175": 1035019648.0, + "8180": 1024460544.0, + "8185": 1021960448.0, + "8190": 1037125504.0, + "8195": 1022368384.0, + "8200": 1035635968.0, + "8205": 1026482496.0, + "8210": 1023888000.0, + "8215": 1014276416.0, + "8220": 1026756224.0, + "8225": 1028540160.0, + "8230": 1027163072.0, + "8235": 1037914048.0, + "8240": 1025909376.0, + "8245": 1024676608.0, + "8250": 1041635840.0, + "8255": 1031908224.0, + "8260": 1032424512.0, + "8265": 1023164800.0, + "8270": 1040172544.0, + "8275": 1038050688.0, + "8280": 1041849216.0, + "8285": 1038804352.0, + "8290": 1024074880.0, + "8295": 1028403648.0, + "8300": 1039341440.0, + "8305": 1012104192.0, + "8310": 1021882048.0, + "8315": 1027307200.0, + "8320": 1021636992.0, + "8325": 1048572160.0, + "8330": 1041039616.0, + "8335": 1037964928.0, + "8340": 1033019136.0, + "8345": 1043864192.0, + "8350": 1037713792.0, + "8355": 1029686400.0, + "8360": 1040667776.0, + "8365": 1027450304.0, + "8370": 
1037742848.0, + "8375": 1041986944.0, + "8380": 1037628416.0, + "8385": 1023436160.0, + "8390": 1026068224.0, + "8395": 1028913408.0, + "8400": 1046530560.0, + "8405": 1040179456.0, + "8410": 1034252672.0, + "8415": 1040258688.0, + "8420": 1054730752.0, + "8425": 1031514880.0, + "8430": 1030295680.0, + "8435": 1045707200.0, + "8440": 1026310784.0, + "8445": 1029027392.0, + "8450": 1034201920.0, + "8455": 1031794688.0, + "8460": 1016828032.0, + "8465": 1035163648.0, + "8470": 1035185152.0, + "8475": 1024712960.0, + "8480": 1035901184.0, + "8485": 1028948480.0, + "8490": 1023079168.0, + "8495": 1037393280.0, + "8500": 1025960064.0, + "8505": 1042724992.0, + "8510": 1028167936.0, + "8515": 1038101056.0, + "8520": 1023107328.0, + "8525": 1037987328.0, + "8530": 1027572800.0, + "8535": 1041656128.0, + "8540": 1033880960.0, + "8545": 1015116160.0, + "8550": 1040188160.0, + "8555": 1016340672.0, + "8560": 1019330048.0, + "8565": 1021410112.0, + "8570": 1032032320.0, + "8575": 1031880128.0, + "8580": 1016011264.0, + "8585": 1030017408.0, + "8590": 1031637248.0, + "8595": 1017776128.0, + "8600": 1002393216.0, + "8605": 1030238336.0, + "8610": 1017532288.0, + "8615": 1023989248.0, + "8620": 1047205696.0, + "8625": 1034231552.0, + "8630": 1030921280.0, + "8635": 1051992512.0, + "8640": 1041134208.0, + "8645": 1024870720.0, + "8650": 1025595392.0, + "8655": 1036904832.0, + "8660": 1031171200.0, + "8665": 1032904640.0, + "8670": 1037400576.0, + "8675": 1029157248.0, + "8680": 1031264704.0, + "8685": 1041197568.0, + "8690": 1035035392.0, + "8695": 1008508416.0, + "8700": 1027459072.0, + "8705": 1051504896.0, + "8710": 1041678016.0, + "8715": 1034152256.0, + "8720": 1017596544.0, + "8725": 1025187456.0, + "8730": 1036610816.0, + "8735": 1014829568.0, + "8740": 1036081536.0, + "8745": 1021252416.0, + "8750": 1027866496.0, + "8755": 1020742272.0, + "8760": 1036899712.0, + "8765": 1058672448.0, + "8770": 1020462464.0, + "8775": 1031773056.0, + "8780": 1030892544.0, + "8785": 
1032117504.0, + "8790": 1041034112.0, + "8795": 1019523968.0, + "8800": 1038245632.0, + "8805": 1035106752.0, + "8810": 1043257088.0, + "8815": 1026490496.0, + "8820": 1027666944.0, + "8825": 1043464064.0, + "8830": 1027480192.0, + "8835": 1038812928.0, + "8840": 1034490752.0, + "8845": 1033909760.0, + "8850": 1030491008.0, + "8855": 1042524992.0, + "8860": 1013002880.0, + "8865": 1038368128.0, + "8870": 1025187456.0, + "8875": 1012981760.0, + "8880": 1028376704.0, + "8885": 1046461056.0, + "8890": 1038603840.0, + "8895": 1037909504.0, + "8900": 1027294848.0, + "8905": 1032792064.0, + "8910": 1029795264.0, + "8915": 1030003968.0, + "8920": 1030339968.0, + "8925": 1028569984.0, + "8930": 1031637376.0, + "8935": 1022951424.0, + "8940": 1019847872.0, + "8945": 1031909248.0, + "8950": 1039951744.0, + "8955": 1041902720.0, + "8960": 1026878464.0, + "8965": 1022083968.0, + "8970": 1029559424.0, + "8975": 1038934400.0, + "8980": 1033860160.0, + "8985": 1030649472.0, + "8990": 1025014144.0, + "8995": 1013963648.0, + "9000": 1035286400.0, + "9005": 1028649280.0, + "9010": 1011913280.0, + "9015": 1038912128.0, + "9020": 1030153856.0, + "9025": 1024685056.0, + "9030": 1025861888.0, + "9035": 1054309248.0, + "9040": 1027293952.0, + "9045": 1036583040.0, + "9050": 1020929664.0, + "9055": 1043212800.0, + "9060": 1023159104.0, + "9065": 1023387520.0, + "9070": 1039364480.0, + "9075": 1026728320.0, + "9080": 1018873408.0, + "9085": 1015439104.0, + "9090": 1043764736.0, + "9095": 1014020224.0, + "9100": 1031975296.0, + "9105": 1026514304.0, + "9110": 1029229568.0, + "9115": 1024866432.0, + "9120": 999986240.0, + "9125": 1032842752.0, + "9130": 1038534336.0, + "9135": 1031037696.0, + "9140": 1025502208.0, + "9145": 1030405248.0, + "9150": 1029416576.0, + "9155": 1038268928.0, + "9160": 1046043904.0, + "9165": 1017948992.0, + "9170": 1040955520.0, + "9175": 1031287552.0, + "9180": 1037830656.0, + "9185": 1040684416.0, + "9190": 1028985728.0, + "9195": 1034312320.0, + "9200": 
1035551872.0, + "9205": 1029847040.0, + "9210": 1026535872.0, + "9215": 1030520448.0, + "9220": 1025732224.0, + "9225": 1048001408.0, + "9230": 1041601792.0, + "9235": 1027775104.0, + "9240": 1025245760.0, + "9245": 1036211584.0, + "9250": 1041192384.0, + "9255": 1020063872.0, + "9260": 1035337984.0, + "9265": 1023102208.0, + "9270": 1038332928.0, + "9275": 1036053568.0, + "9280": 1026541504.0, + "9285": 1014285184.0, + "9290": 1018866304.0, + "9295": 1026915264.0, + "9300": 1037085888.0, + "9305": 1045435392.0, + "9310": 1033242944.0, + "9315": 1039043840.0, + "9320": 1048495488.0, + "9325": 1023059840.0, + "9330": 1031724672.0, + "9335": 1035673472.0, + "9340": 1013719296.0, + "9345": 1022572032.0, + "9350": 1026585600.0, + "9355": 1034807104.0, + "9360": 1029839552.0, + "9365": 1019863296.0, + "9370": 1006904320.0, + "9375": 1036232960.0, + "9380": 1049012736.0, + "9385": 1015905344.0, + "9390": 1029208704.0, + "9395": 1008931968.0, + "9400": 1026893568.0, + "9405": 1027653312.0, + "9410": 1040913280.0, + "9415": 1035128576.0, + "9420": 1030792640.0, + "9425": 1027581056.0, + "9430": 1032727360.0, + "9435": 1031796288.0, + "9440": 1051730048.0, + "9445": 1019626752.0, + "9450": 1044505152.0, + "9455": 1035773696.0, + "9460": 1013828224.0, + "9465": 1023403904.0, + "9470": 1023576832.0, + "9475": 1039164416.0, + "9480": 1029597056.0, + "9485": 1032075200.0, + "9490": 1020994560.0, + "9495": 1021375616.0, + "9500": 1035594304.0, + "9505": 1034478464.0, + "9510": 1014286592.0, + "9515": 1031309312.0, + "9520": 1026563904.0, + "9525": 1035853184.0, + "9530": 1031624448.0, + "9535": 1025926720.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 9535, + "step_interval": 5, + "values": { + "1": 33307314176.0, + "5": 33307424768.0, + "10": 33307447296.0, + "15": 33307439104.0, + "20": 33307533312.0, + "25": 33307473920.0, + "30": 33307504640.0, + "35": 33307639808.0, + "40": 33307637760.0, + "45": 33307568128.0, + "50": 33307418624.0, + "55": 
33307326464.0, + "60": 33307346944.0, + "65": 33307490304.0, + "70": 33307312128.0, + "75": 33307308032.0, + "80": 33307404288.0, + "85": 33307314176.0, + "90": 33307285504.0, + "95": 33307392000.0, + "100": 33307260928.0, + "105": 33307129856.0, + "110": 33307037696.0, + "115": 33306703872.0, + "120": 33307355136.0, + "125": 33306873856.0, + "130": 33307017216.0, + "135": 33307305984.0, + "140": 33307004928.0, + "145": 33307121664.0, + "150": 33307312128.0, + "155": 33307176960.0, + "160": 33307103232.0, + "165": 33307174912.0, + "170": 33307832320.0, + "175": 33307199488.0, + "180": 33307355136.0, + "185": 33307355136.0, + "190": 33307131904.0, + "195": 33307256832.0, + "200": 33307326464.0, + "205": 33307492352.0, + "210": 33307500544.0, + "215": 33307086848.0, + "220": 33306857472.0, + "225": 33306933248.0, + "230": 33307092992.0, + "235": 33307183104.0, + "240": 33307303936.0, + "245": 33307426816.0, + "250": 33307308032.0, + "255": 33307295744.0, + "260": 33306767360.0, + "265": 33307461632.0, + "270": 33307467776.0, + "275": 33307469824.0, + "280": 33307254784.0, + "285": 33307947008.0, + "290": 33307191296.0, + "295": 33308014592.0, + "300": 33307856896.0, + "305": 33308340224.0, + "310": 33307815936.0, + "315": 33307181056.0, + "320": 33307512832.0, + "325": 33307488256.0, + "330": 33307977728.0, + "335": 33307947008.0, + "340": 33308606464.0, + "345": 33308037120.0, + "350": 33307693056.0, + "355": 33308000256.0, + "360": 33307348992.0, + "365": 33307451392.0, + "370": 33308000256.0, + "375": 33307283456.0, + "380": 33307570176.0, + "385": 33307860992.0, + "390": 33307416576.0, + "395": 33307031552.0, + "400": 33307246592.0, + "405": 33307676672.0, + "410": 33306935296.0, + "415": 33307752448.0, + "420": 33307529216.0, + "425": 33307314176.0, + "430": 33306988544.0, + "435": 33307455488.0, + "440": 33307369472.0, + "445": 33307709440.0, + "450": 33307588608.0, + "455": 33306963968.0, + "460": 33307193344.0, + "465": 33306845184.0, + "470": 33307766784.0, 
+ "475": 33306464256.0, + "480": 33307566080.0, + "485": 33307682816.0, + "490": 33307389952.0, + "495": 33307179008.0, + "500": 33307969536.0, + "505": 33307629568.0, + "510": 33308192768.0, + "515": 33307279360.0, + "520": 33306544128.0, + "525": 33307265024.0, + "530": 33307025408.0, + "535": 33307648000.0, + "540": 33307582464.0, + "545": 33307297792.0, + "550": 33307396096.0, + "555": 33307301888.0, + "560": 33307899904.0, + "565": 33307379712.0, + "570": 33307553792.0, + "575": 33307136000.0, + "580": 33305892864.0, + "585": 33306945536.0, + "590": 33307629568.0, + "595": 33307860992.0, + "600": 33306873856.0, + "605": 33307357184.0, + "610": 33306556416.0, + "615": 33306349568.0, + "620": 33307791360.0, + "625": 33306378240.0, + "630": 33307168768.0, + "635": 33306767360.0, + "640": 33306116096.0, + "645": 33308092416.0, + "650": 33307277312.0, + "655": 33307131904.0, + "660": 33308485632.0, + "665": 33307334656.0, + "670": 33307959296.0, + "675": 33307701248.0, + "680": 33306863616.0, + "685": 33306697728.0, + "690": 33307863040.0, + "695": 33307293696.0, + "700": 33306263552.0, + "705": 33306955776.0, + "710": 33308225536.0, + "715": 33307174912.0, + "720": 33307107328.0, + "725": 33307324416.0, + "730": 33308231680.0, + "735": 33307224064.0, + "740": 33307815936.0, + "745": 33307938816.0, + "750": 33307779072.0, + "755": 33308463104.0, + "760": 33306349568.0, + "765": 33308266496.0, + "770": 33306603520.0, + "775": 33307424768.0, + "780": 33308608512.0, + "785": 33307969536.0, + "790": 33308188672.0, + "795": 33307656192.0, + "800": 33307547648.0, + "805": 33307619328.0, + "810": 33307910144.0, + "815": 33307170816.0, + "820": 33307029504.0, + "825": 33307443200.0, + "830": 33307422720.0, + "835": 33307262976.0, + "840": 33307613184.0, + "845": 33307928576.0, + "850": 33306238976.0, + "855": 33307396096.0, + "860": 33307938816.0, + "865": 33307701248.0, + "870": 33307940864.0, + "875": 33307545600.0, + "880": 33307527168.0, + "885": 33307336704.0, + 
"890": 33308262400.0, + "895": 33307717632.0, + "900": 33306474496.0, + "905": 33307480064.0, + "910": 33307725824.0, + "915": 33308303360.0, + "920": 33307770880.0, + "925": 33307566080.0, + "930": 33307451392.0, + "935": 33307975680.0, + "940": 33306320896.0, + "945": 33306429440.0, + "950": 33307136000.0, + "955": 33307846656.0, + "960": 33307611136.0, + "965": 33307465728.0, + "970": 33308293120.0, + "975": 33307078656.0, + "980": 33307568128.0, + "985": 33307080704.0, + "990": 33307367424.0, + "995": 33306861568.0, + "1000": 33307889664.0, + "1005": 33305956352.0, + "1010": 33307508736.0, + "1015": 33306671104.0, + "1020": 33306669056.0, + "1025": 33306509312.0, + "1030": 33307117568.0, + "1035": 33308332032.0, + "1040": 33307353088.0, + "1045": 33308368896.0, + "1050": 33306615808.0, + "1055": 33306802176.0, + "1060": 33307103232.0, + "1065": 33307404288.0, + "1070": 33307070464.0, + "1075": 33308188672.0, + "1080": 33307011072.0, + "1085": 33307027456.0, + "1090": 33308086272.0, + "1095": 33307086848.0, + "1100": 33307287552.0, + "1105": 33308497920.0, + "1110": 33307461632.0, + "1115": 33307533312.0, + "1120": 33307777024.0, + "1125": 33307809792.0, + "1130": 33307484160.0, + "1135": 33308082176.0, + "1140": 33307029504.0, + "1145": 33307432960.0, + "1150": 33307574272.0, + "1155": 33307551744.0, + "1160": 33307561984.0, + "1165": 33307086848.0, + "1170": 33307856896.0, + "1175": 33306976256.0, + "1180": 33308237824.0, + "1185": 33307875328.0, + "1190": 33307369472.0, + "1195": 33308231680.0, + "1200": 33307197440.0, + "1205": 33307480064.0, + "1210": 33305866240.0, + "1215": 33308297216.0, + "1220": 33307451392.0, + "1225": 33307518976.0, + "1230": 33307688960.0, + "1235": 33307901952.0, + "1240": 33307394048.0, + "1245": 33307842560.0, + "1250": 33307281408.0, + "1255": 33306906624.0, + "1260": 33307301888.0, + "1265": 33307674624.0, + "1270": 33307150336.0, + "1275": 33307686912.0, + "1280": 33307430912.0, + "1285": 33306974208.0, + "1290": 
33307529216.0, + "1295": 33307901952.0, + "1300": 33307002880.0, + "1305": 33308059648.0, + "1310": 33306939392.0, + "1315": 33307336704.0, + "1320": 33307262976.0, + "1325": 33307011072.0, + "1330": 33306550272.0, + "1335": 33307181056.0, + "1340": 33307406336.0, + "1345": 33307463680.0, + "1350": 33308135424.0, + "1355": 33307480064.0, + "1360": 33307533312.0, + "1365": 33307066368.0, + "1370": 33306595328.0, + "1375": 33307891712.0, + "1380": 33307830272.0, + "1385": 33308487680.0, + "1390": 33306521600.0, + "1395": 33307338752.0, + "1400": 33308430336.0, + "1405": 33307768832.0, + "1410": 33308041216.0, + "1415": 33307797504.0, + "1420": 33306605568.0, + "1425": 33307240448.0, + "1430": 33307322368.0, + "1435": 33307559936.0, + "1440": 33306662912.0, + "1445": 33307058176.0, + "1450": 33307705344.0, + "1455": 33307291648.0, + "1460": 33306861568.0, + "1465": 33306312704.0, + "1470": 33307394048.0, + "1475": 33307211776.0, + "1480": 33306527744.0, + "1485": 33307361280.0, + "1490": 33307693056.0, + "1495": 33307271168.0, + "1500": 33306820608.0, + "1505": 33307092992.0, + "1510": 33306624000.0, + "1515": 33307097088.0, + "1520": 33306931200.0, + "1525": 33307635712.0, + "1530": 33307353088.0, + "1535": 33306468352.0, + "1540": 33307172864.0, + "1545": 33307693056.0, + "1550": 33307938816.0, + "1555": 33307832320.0, + "1560": 33308182528.0, + "1565": 33307099136.0, + "1570": 33306798080.0, + "1575": 33307492352.0, + "1580": 33307688960.0, + "1585": 33307326464.0, + "1590": 33306988544.0, + "1595": 33306818560.0, + "1600": 33307836416.0, + "1605": 33307590656.0, + "1610": 33307168768.0, + "1615": 33306931200.0, + "1620": 33306732544.0, + "1625": 33308260352.0, + "1630": 33308227584.0, + "1635": 33306957824.0, + "1640": 33306759168.0, + "1645": 33306021888.0, + "1650": 33306689536.0, + "1655": 33307332608.0, + "1660": 33307170816.0, + "1665": 33306583040.0, + "1670": 33307535360.0, + "1675": 33306912768.0, + "1680": 33306675200.0, + "1685": 33307774976.0, + "1690": 
33307783168.0, + "1695": 33307971584.0, + "1700": 33307623424.0, + "1705": 33307652096.0, + "1710": 33307731968.0, + "1715": 33308090368.0, + "1720": 33307172864.0, + "1725": 33307672576.0, + "1730": 33306355712.0, + "1735": 33308229632.0, + "1740": 33307142144.0, + "1745": 33308151808.0, + "1750": 33306898432.0, + "1755": 33307105280.0, + "1760": 33308000256.0, + "1765": 33307750400.0, + "1770": 33308450816.0, + "1775": 33308184576.0, + "1780": 33308129280.0, + "1785": 33307936768.0, + "1790": 33307238400.0, + "1795": 33307922432.0, + "1800": 33306900480.0, + "1805": 33307203584.0, + "1810": 33306923008.0, + "1815": 33307617280.0, + "1820": 33307664384.0, + "1825": 33308440576.0, + "1830": 33306843136.0, + "1835": 33307979776.0, + "1840": 33307588608.0, + "1845": 33307602944.0, + "1850": 33307774976.0, + "1855": 33307529216.0, + "1860": 33307054080.0, + "1865": 33307097088.0, + "1870": 33307373568.0, + "1875": 33306265600.0, + "1880": 33307275264.0, + "1885": 33307224064.0, + "1890": 33307324416.0, + "1895": 33307283456.0, + "1900": 33306810368.0, + "1905": 33307191296.0, + "1910": 33306884096.0, + "1915": 33308162048.0, + "1920": 33307664384.0, + "1925": 33305972736.0, + "1930": 33308504064.0, + "1935": 33307377664.0, + "1940": 33307119616.0, + "1945": 33307416576.0, + "1950": 33307746304.0, + "1955": 33307420672.0, + "1960": 33308073984.0, + "1965": 33307148288.0, + "1970": 33306775552.0, + "1975": 33308207104.0, + "1980": 33307473920.0, + "1985": 33307095040.0, + "1990": 33307527168.0, + "1995": 33307037696.0, + "2000": 33308801024.0, + "2005": 33307985920.0, + "2010": 33307516928.0, + "2015": 33307604992.0, + "2020": 33307406336.0, + "2025": 33307719680.0, + "2030": 33308381184.0, + "2035": 33307914240.0, + "2040": 33307324416.0, + "2045": 33306476544.0, + "2050": 33308246016.0, + "2055": 33307430912.0, + "2060": 33307912192.0, + "2065": 33307543552.0, + "2070": 33307670528.0, + "2075": 33307482112.0, + "2080": 33307871232.0, + "2085": 33306722304.0, + "2090": 
33307549696.0, + "2095": 33307260928.0, + "2100": 33306765312.0, + "2105": 33306847232.0, + "2110": 33307332608.0, + "2115": 33306480640.0, + "2120": 33307168768.0, + "2125": 33307277312.0, + "2130": 33307314176.0, + "2135": 33307752448.0, + "2140": 33306710016.0, + "2145": 33307478016.0, + "2150": 33307729920.0, + "2155": 33306943488.0, + "2160": 33307508736.0, + "2165": 33307049984.0, + "2170": 33307158528.0, + "2175": 33306599424.0, + "2180": 33307054080.0, + "2185": 33307017216.0, + "2190": 33307119616.0, + "2195": 33307289600.0, + "2200": 33306726400.0, + "2205": 33306636288.0, + "2210": 33307639808.0, + "2215": 33308215296.0, + "2220": 33307314176.0, + "2225": 33307437056.0, + "2230": 33306318848.0, + "2235": 33306941440.0, + "2240": 33308131328.0, + "2245": 33307707392.0, + "2250": 33307256832.0, + "2255": 33306845184.0, + "2260": 33307736064.0, + "2265": 33308620800.0, + "2270": 33307357184.0, + "2275": 33308151808.0, + "2280": 33307981824.0, + "2285": 33307922432.0, + "2290": 33306767360.0, + "2295": 33307670528.0, + "2300": 33307179008.0, + "2305": 33307545600.0, + "2310": 33307924480.0, + "2315": 33307396096.0, + "2320": 33307725824.0, + "2325": 33308024832.0, + "2330": 33307793408.0, + "2335": 33307019264.0, + "2340": 33307162624.0, + "2345": 33307934720.0, + "2350": 33306232832.0, + "2355": 33307719680.0, + "2360": 33307375616.0, + "2365": 33306537984.0, + "2370": 33307279360.0, + "2375": 33308131328.0, + "2380": 33307136000.0, + "2385": 33307490304.0, + "2390": 33307316224.0, + "2395": 33306587136.0, + "2400": 33307594752.0, + "2405": 33308393472.0, + "2410": 33306726400.0, + "2415": 33307506688.0, + "2420": 33308407808.0, + "2425": 33307942912.0, + "2430": 33308116992.0, + "2435": 33307308032.0, + "2440": 33308362752.0, + "2445": 33308071936.0, + "2450": 33307740160.0, + "2455": 33307959296.0, + "2460": 33308258304.0, + "2465": 33307299840.0, + "2470": 33307056128.0, + "2475": 33307224064.0, + "2480": 33307713536.0, + "2485": 33306550272.0, + "2490": 
33306992640.0, + "2495": 33307232256.0, + "2500": 33307095040.0, + "2505": 33307107328.0, + "2510": 33307488256.0, + "2515": 33308360704.0, + "2520": 33307369472.0, + "2525": 33306959872.0, + "2530": 33307258880.0, + "2535": 33307082752.0, + "2540": 33308633088.0, + "2545": 33308542976.0, + "2550": 33308002304.0, + "2555": 33307961344.0, + "2560": 33307328512.0, + "2565": 33308299264.0, + "2570": 33307770880.0, + "2575": 33307877376.0, + "2580": 33307990016.0, + "2585": 33308016640.0, + "2590": 33308135424.0, + "2595": 33307617280.0, + "2600": 33306667008.0, + "2605": 33307422720.0, + "2610": 33306683392.0, + "2615": 33308669952.0, + "2620": 33308616704.0, + "2625": 33308366848.0, + "2630": 33307574272.0, + "2635": 33308166144.0, + "2640": 33307983872.0, + "2645": 33307609088.0, + "2650": 33307807744.0, + "2655": 33306955776.0, + "2660": 33307273216.0, + "2665": 33307709440.0, + "2670": 33307693056.0, + "2675": 33307731968.0, + "2680": 33308227584.0, + "2685": 33307742208.0, + "2690": 33307734016.0, + "2695": 33307424768.0, + "2700": 33306644480.0, + "2705": 33306300416.0, + "2710": 33307881472.0, + "2715": 33307488256.0, + "2720": 33307318272.0, + "2725": 33307604992.0, + "2730": 33306710016.0, + "2735": 33308049408.0, + "2740": 33307437056.0, + "2745": 33307572224.0, + "2750": 33307136000.0, + "2755": 33307584512.0, + "2760": 33307355136.0, + "2765": 33307713536.0, + "2770": 33308000256.0, + "2775": 33306460160.0, + "2780": 33306923008.0, + "2785": 33307017216.0, + "2790": 33306720256.0, + "2795": 33307785216.0, + "2800": 33307234304.0, + "2805": 33306685440.0, + "2810": 33307469824.0, + "2815": 33308069888.0, + "2820": 33306460160.0, + "2825": 33307467776.0, + "2830": 33307666432.0, + "2835": 33307371520.0, + "2840": 33306904576.0, + "2845": 33308061696.0, + "2850": 33308520448.0, + "2855": 33307695104.0, + "2860": 33308487680.0, + "2865": 33307058176.0, + "2870": 33307303936.0, + "2875": 33307324416.0, + "2880": 33306968064.0, + "2885": 33307641856.0, + "2890": 
33307785216.0, + "2895": 33308221440.0, + "2900": 33307596800.0, + "2905": 33307533312.0, + "2910": 33307459584.0, + "2915": 33307799552.0, + "2920": 33308461056.0, + "2925": 33307938816.0, + "2930": 33308268544.0, + "2935": 33308594176.0, + "2940": 33308170240.0, + "2945": 33307578368.0, + "2950": 33307590656.0, + "2955": 33308131328.0, + "2960": 33306839040.0, + "2965": 33307111424.0, + "2970": 33307570176.0, + "2975": 33307766784.0, + "2980": 33307600896.0, + "2985": 33307123712.0, + "2990": 33307641856.0, + "2995": 33307527168.0, + "3000": 33307863040.0, + "3005": 33306927104.0, + "3010": 33307738112.0, + "3015": 33308217344.0, + "3020": 33306697728.0, + "3025": 33306970112.0, + "3030": 33308127232.0, + "3035": 33308213248.0, + "3040": 33307578368.0, + "3045": 33308327936.0, + "3050": 33306910720.0, + "3055": 33307004928.0, + "3060": 33307602944.0, + "3065": 33306970112.0, + "3070": 33307985920.0, + "3075": 33306945536.0, + "3080": 33307312128.0, + "3085": 33306533888.0, + "3090": 33306933248.0, + "3095": 33307906048.0, + "3100": 33306793984.0, + "3105": 33307127808.0, + "3110": 33308295168.0, + "3115": 33307295744.0, + "3120": 33307897856.0, + "3125": 33307066368.0, + "3130": 33307781120.0, + "3135": 33307762688.0, + "3140": 33308196864.0, + "3145": 33306904576.0, + "3150": 33307140096.0, + "3155": 33306660864.0, + "3160": 33307514880.0, + "3165": 33307246592.0, + "3170": 33307613184.0, + "3175": 33307375616.0, + "3180": 33307551744.0, + "3185": 33307842560.0, + "3190": 33308342272.0, + "3195": 33308350464.0, + "3200": 33307799552.0, + "3205": 33307099136.0, + "3210": 33306869760.0, + "3215": 33307678720.0, + "3220": 33307111424.0, + "3225": 33307146240.0, + "3230": 33306972160.0, + "3235": 33307387904.0, + "3240": 33307521024.0, + "3245": 33307287552.0, + "3250": 33307523072.0, + "3255": 33307639808.0, + "3260": 33307092992.0, + "3265": 33308338176.0, + "3270": 33307273216.0, + "3275": 33307713536.0, + "3280": 33307719680.0, + "3285": 33308049408.0, + "3290": 
33307484160.0, + "3295": 33307594752.0, + "3300": 33307228160.0, + "3305": 33306580992.0, + "3310": 33307541504.0, + "3315": 33307211776.0, + "3320": 33307324416.0, + "3325": 33306615808.0, + "3330": 33307777024.0, + "3335": 33308135424.0, + "3340": 33307351040.0, + "3345": 33307131904.0, + "3350": 33307031552.0, + "3355": 33307791360.0, + "3360": 33307410432.0, + "3365": 33307090944.0, + "3370": 33306187776.0, + "3375": 33307113472.0, + "3380": 33308071936.0, + "3385": 33307717632.0, + "3390": 33306648576.0, + "3395": 33306781696.0, + "3400": 33307734016.0, + "3405": 33307570176.0, + "3410": 33307750400.0, + "3415": 33307920384.0, + "3420": 33308157952.0, + "3425": 33307500544.0, + "3430": 33307168768.0, + "3435": 33307645952.0, + "3440": 33307185152.0, + "3445": 33307459584.0, + "3450": 33306804224.0, + "3455": 33307662336.0, + "3460": 33306748928.0, + "3465": 33306497024.0, + "3470": 33306796032.0, + "3475": 33307947008.0, + "3480": 33308039168.0, + "3485": 33307676672.0, + "3490": 33306728448.0, + "3495": 33307115520.0, + "3500": 33306628096.0, + "3505": 33307537408.0, + "3510": 33306945536.0, + "3515": 33306902528.0, + "3520": 33307553792.0, + "3525": 33307590656.0, + "3530": 33307852800.0, + "3535": 33306773504.0, + "3540": 33307953152.0, + "3545": 33307463680.0, + "3550": 33307123712.0, + "3555": 33307738112.0, + "3560": 33307766784.0, + "3565": 33307088896.0, + "3570": 33306882048.0, + "3575": 33307443200.0, + "3580": 33306951680.0, + "3585": 33306841088.0, + "3590": 33308293120.0, + "3595": 33307723776.0, + "3600": 33307756544.0, + "3605": 33307930624.0, + "3610": 33307985920.0, + "3615": 33307222016.0, + "3620": 33307430912.0, + "3625": 33307148288.0, + "3630": 33306388480.0, + "3635": 33307035648.0, + "3640": 33307455488.0, + "3645": 33306906624.0, + "3650": 33307545600.0, + "3655": 33307336704.0, + "3660": 33306910720.0, + "3665": 33307623424.0, + "3670": 33306824704.0, + "3675": 33307590656.0, + "3680": 33307373568.0, + "3685": 33306505216.0, + "3690": 
33307817984.0, + "3695": 33306890240.0, + "3700": 33306802176.0, + "3705": 33306945536.0, + "3710": 33306904576.0, + "3715": 33307754496.0, + "3720": 33308395520.0, + "3725": 33308112896.0, + "3730": 33307652096.0, + "3735": 33307867136.0, + "3740": 33307805696.0, + "3745": 33308069888.0, + "3750": 33307826176.0, + "3755": 33306439680.0, + "3760": 33306849280.0, + "3765": 33307471872.0, + "3770": 33307095040.0, + "3775": 33307492352.0, + "3780": 33308141568.0, + "3785": 33307910144.0, + "3790": 33307656192.0, + "3795": 33307727872.0, + "3800": 33307246592.0, + "3805": 33307848704.0, + "3810": 33307490304.0, + "3815": 33307357184.0, + "3820": 33307346944.0, + "3825": 33307619328.0, + "3830": 33308102656.0, + "3835": 33306849280.0, + "3840": 33307678720.0, + "3845": 33307258880.0, + "3850": 33307686912.0, + "3855": 33307467776.0, + "3860": 33307471872.0, + "3865": 33307439104.0, + "3870": 33307676672.0, + "3875": 33306865664.0, + "3880": 33307232256.0, + "3885": 33307099136.0, + "3890": 33307854848.0, + "3895": 33306370048.0, + "3900": 33306900480.0, + "3905": 33306824704.0, + "3910": 33307361280.0, + "3915": 33306591232.0, + "3920": 33307213824.0, + "3925": 33306980352.0, + "3930": 33308110848.0, + "3935": 33307179008.0, + "3940": 33307379712.0, + "3945": 33307813888.0, + "3950": 33307277312.0, + "3955": 33307203584.0, + "3960": 33307234304.0, + "3965": 33307121664.0, + "3970": 33307303936.0, + "3975": 33307144192.0, + "3980": 33307869184.0, + "3985": 33307660288.0, + "3990": 33307779072.0, + "3995": 33307795456.0, + "4000": 33307131904.0, + "4005": 33307238400.0, + "4010": 33307875328.0, + "4015": 33306726400.0, + "4020": 33308227584.0, + "4025": 33307799552.0, + "4030": 33307318272.0, + "4035": 33308190720.0, + "4040": 33307932672.0, + "4045": 33307291648.0, + "4050": 33307959296.0, + "4055": 33307447296.0, + "4060": 33307486208.0, + "4065": 33308088320.0, + "4070": 33307183104.0, + "4075": 33307201536.0, + "4080": 33308184576.0, + "4085": 33306406912.0, + "4090": 
33307891712.0, + "4095": 33307031552.0, + "4100": 33308100608.0, + "4105": 33307258880.0, + "4110": 33307492352.0, + "4115": 33308344320.0, + "4120": 33306552320.0, + "4125": 33307611136.0, + "4130": 33306083328.0, + "4135": 33308463104.0, + "4140": 33307611136.0, + "4145": 33307455488.0, + "4150": 33307658240.0, + "4155": 33307133952.0, + "4160": 33308233728.0, + "4165": 33307408384.0, + "4170": 33306888192.0, + "4175": 33307852800.0, + "4180": 33307150336.0, + "4185": 33307127808.0, + "4190": 33307582464.0, + "4195": 33308610560.0, + "4200": 33308231680.0, + "4205": 33307906048.0, + "4210": 33308307456.0, + "4215": 33306363904.0, + "4220": 33306980352.0, + "4225": 33306318848.0, + "4230": 33307731968.0, + "4235": 33307142144.0, + "4240": 33307432960.0, + "4245": 33307097088.0, + "4250": 33307783168.0, + "4255": 33307365376.0, + "4260": 33306947584.0, + "4265": 33306611712.0, + "4270": 33306347520.0, + "4275": 33306624000.0, + "4280": 33307185152.0, + "4285": 33307922432.0, + "4290": 33307508736.0, + "4295": 33307658240.0, + "4300": 33308405760.0, + "4305": 33306474496.0, + "4310": 33307557888.0, + "4315": 33308307456.0, + "4320": 33307719680.0, + "4325": 33306824704.0, + "4330": 33307594752.0, + "4335": 33306144768.0, + "4340": 33307852800.0, + "4345": 33307342848.0, + "4350": 33308139520.0, + "4355": 33307713536.0, + "4360": 33307373568.0, + "4365": 33308065792.0, + "4370": 33306681344.0, + "4375": 33307770880.0, + "4380": 33307361280.0, + "4385": 33307086848.0, + "4390": 33307019264.0, + "4395": 33306986496.0, + "4400": 33307103232.0, + "4405": 33307664384.0, + "4410": 33307996160.0, + "4415": 33306990592.0, + "4420": 33306546176.0, + "4425": 33306904576.0, + "4430": 33307303936.0, + "4435": 33306763264.0, + "4440": 33308063744.0, + "4445": 33307242496.0, + "4450": 33307283456.0, + "4455": 33306654720.0, + "4460": 33307205632.0, + "4465": 33306867712.0, + "4470": 33307916288.0, + "4475": 33307791360.0, + "4480": 33308450816.0, + "4485": 33307547648.0, + "4490": 
33307090944.0, + "4495": 33307000832.0, + "4500": 33306935296.0, + "4505": 33307099136.0, + "4510": 33307525120.0, + "4515": 33307367424.0, + "4520": 33307813888.0, + "4525": 33307715584.0, + "4530": 33307901952.0, + "4535": 33307174912.0, + "4540": 33306880000.0, + "4545": 33307138048.0, + "4550": 33306873856.0, + "4555": 33306316800.0, + "4560": 33305849856.0, + "4565": 33307187200.0, + "4570": 33307260928.0, + "4575": 33307410432.0, + "4580": 33307201536.0, + "4585": 33306920960.0, + "4590": 33307355136.0, + "4595": 33307346944.0, + "4600": 33307856896.0, + "4605": 33307752448.0, + "4610": 33307095040.0, + "4615": 33306286080.0, + "4620": 33306699776.0, + "4625": 33308069888.0, + "4630": 33307439104.0, + "4635": 33306900480.0, + "4640": 33307076608.0, + "4645": 33308160000.0, + "4650": 33307758592.0, + "4655": 33307865088.0, + "4660": 33306255360.0, + "4665": 33307641856.0, + "4670": 33307912192.0, + "4675": 33306603520.0, + "4680": 33307799552.0, + "4685": 33307488256.0, + "4690": 33307394048.0, + "4695": 33306763264.0, + "4700": 33307873280.0, + "4705": 33308106752.0, + "4710": 33307617280.0, + "4715": 33307047936.0, + "4720": 33307901952.0, + "4725": 33307793408.0, + "4730": 33308123136.0, + "4735": 33307451392.0, + "4740": 33307623424.0, + "4745": 33306857472.0, + "4750": 33308436480.0, + "4755": 33307260928.0, + "4760": 33307975680.0, + "4765": 33307965440.0, + "4770": 33306859520.0, + "4775": 33307922432.0, + "4780": 33306978304.0, + "4785": 33306869760.0, + "4790": 33307084800.0, + "4795": 33307226112.0, + "4800": 33307961344.0, + "4805": 33308334080.0, + "4810": 33305587712.0, + "4815": 33307928576.0, + "4820": 33307875328.0, + "4825": 33306957824.0, + "4830": 33307797504.0, + "4835": 33306116096.0, + "4840": 33307654144.0, + "4845": 33307131904.0, + "4850": 33308055552.0, + "4855": 33305792512.0, + "4860": 33307402240.0, + "4865": 33307086848.0, + "4870": 33307637760.0, + "4875": 33307789312.0, + "4880": 33307701248.0, + "4885": 33308010496.0, + "4890": 
33307039744.0, + "4895": 33307369472.0, + "4900": 33307127808.0, + "4905": 33306988544.0, + "4910": 33308276736.0, + "4915": 33307090944.0, + "4920": 33307015168.0, + "4925": 33308043264.0, + "4930": 33307607040.0, + "4935": 33308209152.0, + "4940": 33307725824.0, + "4945": 33307985920.0, + "4950": 33307582464.0, + "4955": 33307297792.0, + "4960": 33307639808.0, + "4965": 33307445248.0, + "4970": 33306869760.0, + "4975": 33306787840.0, + "4980": 33307099136.0, + "4985": 33307635712.0, + "4990": 33307406336.0, + "4995": 33307471872.0, + "5000": 33307375616.0, + "5005": 33307672576.0, + "5010": 33306970112.0, + "5015": 33307244544.0, + "5020": 33306966016.0, + "5025": 33307705344.0, + "5030": 33307463680.0, + "5035": 33306818560.0, + "5040": 33306972160.0, + "5045": 33308157952.0, + "5050": 33306376192.0, + "5055": 33307594752.0, + "5060": 33308471296.0, + "5065": 33307455488.0, + "5070": 33307301888.0, + "5075": 33307488256.0, + "5080": 33307910144.0, + "5085": 33307635712.0, + "5090": 33307406336.0, + "5095": 33307254784.0, + "5100": 33306828800.0, + "5105": 33307852800.0, + "5110": 33308258304.0, + "5115": 33307228160.0, + "5120": 33307955200.0, + "5125": 33305640960.0, + "5130": 33306683392.0, + "5135": 33307336704.0, + "5140": 33307834368.0, + "5145": 33307060224.0, + "5150": 33307023360.0, + "5155": 33307308032.0, + "5160": 33306664960.0, + "5165": 33307123712.0, + "5170": 33306935296.0, + "5175": 33308094464.0, + "5180": 33306566656.0, + "5185": 33306796032.0, + "5190": 33307545600.0, + "5195": 33308067840.0, + "5200": 33307754496.0, + "5205": 33307445248.0, + "5210": 33306785792.0, + "5215": 33307551744.0, + "5220": 33308188672.0, + "5225": 33307338752.0, + "5230": 33307283456.0, + "5235": 33306976256.0, + "5240": 33308041216.0, + "5245": 33308340224.0, + "5250": 33308153856.0, + "5255": 33307590656.0, + "5260": 33306896384.0, + "5265": 33308303360.0, + "5270": 33308796928.0, + "5275": 33307949056.0, + "5280": 33306157056.0, + "5285": 33307904000.0, + "5290": 
33308143616.0, + "5295": 33306533888.0, + "5300": 33307912192.0, + "5305": 33308338176.0, + "5310": 33308688384.0, + "5315": 33308045312.0, + "5320": 33306206208.0, + "5325": 33308219392.0, + "5330": 33308012544.0, + "5335": 33307602944.0, + "5340": 33306685440.0, + "5345": 33308209152.0, + "5350": 33307150336.0, + "5355": 33308176384.0, + "5360": 33307273216.0, + "5365": 33307850752.0, + "5370": 33307222016.0, + "5375": 33307803648.0, + "5380": 33307617280.0, + "5385": 33307179008.0, + "5390": 33307389952.0, + "5395": 33306927104.0, + "5400": 33307518976.0, + "5405": 33307400192.0, + "5410": 33307598848.0, + "5415": 33307846656.0, + "5420": 33307490304.0, + "5425": 33307459584.0, + "5430": 33307283456.0, + "5435": 33307453440.0, + "5440": 33307383808.0, + "5445": 33307117568.0, + "5450": 33307832320.0, + "5455": 33307582464.0, + "5460": 33306963968.0, + "5465": 33306947584.0, + "5470": 33307355136.0, + "5475": 33306748928.0, + "5480": 33306435584.0, + "5485": 33307590656.0, + "5490": 33307787264.0, + "5495": 33307568128.0, + "5500": 33307351040.0, + "5505": 33307568128.0, + "5510": 33307426816.0, + "5515": 33307451392.0, + "5520": 33307549696.0, + "5525": 33307000832.0, + "5530": 33307566080.0, + "5535": 33307664384.0, + "5540": 33306966016.0, + "5545": 33307781120.0, + "5550": 33307275264.0, + "5555": 33307269120.0, + "5560": 33307576320.0, + "5565": 33307377664.0, + "5570": 33307052032.0, + "5575": 33306978304.0, + "5580": 33307965440.0, + "5585": 33307494400.0, + "5590": 33308055552.0, + "5595": 33306943488.0, + "5600": 33306542080.0, + "5605": 33307680768.0, + "5610": 33308542976.0, + "5615": 33307826176.0, + "5620": 33308108800.0, + "5625": 33308225536.0, + "5630": 33308069888.0, + "5635": 33307760640.0, + "5640": 33307500544.0, + "5645": 33307930624.0, + "5650": 33306755072.0, + "5655": 33308192768.0, + "5660": 33308631040.0, + "5665": 33307418624.0, + "5670": 33307504640.0, + "5675": 33307715584.0, + "5680": 33307910144.0, + "5685": 33307996160.0, + "5690": 
33307478016.0, + "5695": 33308164096.0, + "5700": 33307906048.0, + "5705": 33307750400.0, + "5710": 33306779648.0, + "5715": 33307219968.0, + "5720": 33307750400.0, + "5725": 33307537408.0, + "5730": 33307262976.0, + "5735": 33306767360.0, + "5740": 33307508736.0, + "5745": 33306753024.0, + "5750": 33306636288.0, + "5755": 33306943488.0, + "5760": 33307553792.0, + "5765": 33307842560.0, + "5770": 33307047936.0, + "5775": 33307348992.0, + "5780": 33306361856.0, + "5785": 33307709440.0, + "5790": 33307832320.0, + "5795": 33307406336.0, + "5800": 33307056128.0, + "5805": 33307631616.0, + "5810": 33307766784.0, + "5815": 33307971584.0, + "5820": 33307447296.0, + "5825": 33307084800.0, + "5830": 33307324416.0, + "5835": 33307127808.0, + "5840": 33307729920.0, + "5845": 33307088896.0, + "5850": 33307635712.0, + "5855": 33307119616.0, + "5860": 33306703872.0, + "5865": 33307291648.0, + "5870": 33307613184.0, + "5875": 33307893760.0, + "5880": 33307893760.0, + "5885": 33307301888.0, + "5890": 33307830272.0, + "5895": 33306671104.0, + "5900": 33306488832.0, + "5905": 33308141568.0, + "5910": 33307373568.0, + "5915": 33307330560.0, + "5920": 33307656192.0, + "5925": 33307533312.0, + "5930": 33307848704.0, + "5935": 33307586560.0, + "5940": 33307602944.0, + "5945": 33307631616.0, + "5950": 33306615808.0, + "5955": 33307719680.0, + "5960": 33308553216.0, + "5965": 33308676096.0, + "5970": 33308313600.0, + "5975": 33306810368.0, + "5980": 33307222016.0, + "5985": 33307367424.0, + "5990": 33307119616.0, + "5995": 33307166720.0, + "6000": 33307822080.0, + "6005": 33307553792.0, + "6010": 33307756544.0, + "6015": 33306392576.0, + "6020": 33308116992.0, + "6025": 33307738112.0, + "6030": 33307459584.0, + "6035": 33306920960.0, + "6040": 33307701248.0, + "6045": 33307932672.0, + "6050": 33307496448.0, + "6055": 33307133952.0, + "6060": 33306370048.0, + "6065": 33307521024.0, + "6070": 33307244544.0, + "6075": 33306447872.0, + "6080": 33306963968.0, + "6085": 33307932672.0, + "6090": 
33307293696.0, + "6095": 33307058176.0, + "6100": 33307449344.0, + "6105": 33307613184.0, + "6110": 33307779072.0, + "6115": 33306832896.0, + "6120": 33306732544.0, + "6125": 33306488832.0, + "6130": 33308866560.0, + "6135": 33308000256.0, + "6140": 33307906048.0, + "6145": 33308504064.0, + "6150": 33307826176.0, + "6155": 33306906624.0, + "6160": 33307533312.0, + "6165": 33307578368.0, + "6170": 33307891712.0, + "6175": 33307537408.0, + "6180": 33307803648.0, + "6185": 33308125184.0, + "6190": 33307342848.0, + "6195": 33308135424.0, + "6200": 33306468352.0, + "6205": 33308026880.0, + "6210": 33308028928.0, + "6215": 33308157952.0, + "6220": 33307662336.0, + "6225": 33307344896.0, + "6230": 33308231680.0, + "6235": 33307148288.0, + "6240": 33308809216.0, + "6245": 33307017216.0, + "6250": 33307234304.0, + "6255": 33308430336.0, + "6260": 33307246592.0, + "6265": 33307418624.0, + "6270": 33308319744.0, + "6275": 33307090944.0, + "6280": 33307404288.0, + "6285": 33308227584.0, + "6290": 33307656192.0, + "6295": 33306865664.0, + "6300": 33307596800.0, + "6305": 33308192768.0, + "6310": 33307695104.0, + "6315": 33307361280.0, + "6320": 33306775552.0, + "6325": 33307557888.0, + "6330": 33307639808.0, + "6335": 33307820032.0, + "6340": 33307410432.0, + "6345": 33307410432.0, + "6350": 33308256256.0, + "6355": 33307082752.0, + "6360": 33306855424.0, + "6365": 33307418624.0, + "6370": 33307066368.0, + "6375": 33307891712.0, + "6380": 33307779072.0, + "6385": 33306128384.0, + "6390": 33306884096.0, + "6395": 33307060224.0, + "6400": 33307250688.0, + "6405": 33308135424.0, + "6410": 33308155904.0, + "6415": 33307101184.0, + "6420": 33306318848.0, + "6425": 33308065792.0, + "6430": 33307813888.0, + "6435": 33307842560.0, + "6440": 33308571648.0, + "6445": 33306138624.0, + "6450": 33307762688.0, + "6455": 33308119040.0, + "6460": 33308037120.0, + "6465": 33308467200.0, + "6470": 33307181056.0, + "6475": 33307246592.0, + "6480": 33306855424.0, + "6485": 33308440576.0, + "6490": 
33307863040.0, + "6495": 33306857472.0, + "6500": 33306529792.0, + "6505": 33307097088.0, + "6510": 33307842560.0, + "6515": 33307095040.0, + "6520": 33307848704.0, + "6525": 33307596800.0, + "6530": 33307117568.0, + "6535": 33307811840.0, + "6540": 33307645952.0, + "6545": 33307211776.0, + "6550": 33308196864.0, + "6555": 33307213824.0, + "6560": 33307326464.0, + "6565": 33306490880.0, + "6570": 33306877952.0, + "6575": 33307199488.0, + "6580": 33308370944.0, + "6585": 33307828224.0, + "6590": 33307871232.0, + "6595": 33307590656.0, + "6600": 33306578944.0, + "6605": 33307496448.0, + "6610": 33307912192.0, + "6615": 33307521024.0, + "6620": 33307189248.0, + "6625": 33306961920.0, + "6630": 33306800128.0, + "6635": 33306957824.0, + "6640": 33307762688.0, + "6645": 33306427392.0, + "6650": 33307672576.0, + "6655": 33305133056.0, + "6660": 33307598848.0, + "6665": 33306884096.0, + "6670": 33307500544.0, + "6675": 33307592704.0, + "6680": 33306923008.0, + "6685": 33307084800.0, + "6690": 33307402240.0, + "6695": 33307963392.0, + "6700": 33307336704.0, + "6705": 33306845184.0, + "6710": 33307230208.0, + "6715": 33306310656.0, + "6720": 33307834368.0, + "6725": 33308094464.0, + "6730": 33308327936.0, + "6735": 33308092416.0, + "6740": 33306873856.0, + "6745": 33308082176.0, + "6750": 33306112000.0, + "6755": 33306810368.0, + "6760": 33307394048.0, + "6765": 33307414528.0, + "6770": 33308286976.0, + "6775": 33308618752.0, + "6780": 33306904576.0, + "6785": 33308182528.0, + "6790": 33308057600.0, + "6795": 33307049984.0, + "6800": 33306744832.0, + "6805": 33307242496.0, + "6810": 33307176960.0, + "6815": 33307779072.0, + "6820": 33306849280.0, + "6825": 33307623424.0, + "6830": 33307887616.0, + "6835": 33307670528.0, + "6840": 33308348416.0, + "6845": 33308184576.0, + "6850": 33307727872.0, + "6855": 33307252736.0, + "6860": 33307680768.0, + "6865": 33306963968.0, + "6870": 33307099136.0, + "6875": 33307037696.0, + "6880": 33307635712.0, + "6885": 33307615232.0, + "6890": 
33307652096.0, + "6895": 33307369472.0, + "6900": 33307947008.0, + "6905": 33307334656.0, + "6910": 33306824704.0, + "6915": 33307537408.0, + "6920": 33306619904.0, + "6925": 33306408960.0, + "6930": 33306765312.0, + "6935": 33306609664.0, + "6940": 33307623424.0, + "6945": 33307160576.0, + "6950": 33307463680.0, + "6955": 33306507264.0, + "6960": 33307185152.0, + "6965": 33307019264.0, + "6970": 33307598848.0, + "6975": 33307435008.0, + "6980": 33307238400.0, + "6985": 33306222592.0, + "6990": 33308581888.0, + "6995": 33307254784.0, + "7000": 33308035072.0, + "7005": 33308233728.0, + "7010": 33307092992.0, + "7015": 33307193344.0, + "7020": 33307643904.0, + "7025": 33308274688.0, + "7030": 33307019264.0, + "7035": 33308454912.0, + "7040": 33308086272.0, + "7045": 33307277312.0, + "7050": 33307172864.0, + "7055": 33306599424.0, + "7060": 33307613184.0, + "7065": 33307031552.0, + "7070": 33306243072.0, + "7075": 33308037120.0, + "7080": 33306759168.0, + "7085": 33308033024.0, + "7090": 33307971584.0, + "7095": 33306873856.0, + "7100": 33308522496.0, + "7105": 33307363328.0, + "7110": 33308063744.0, + "7115": 33307770880.0, + "7120": 33307906048.0, + "7125": 33307443200.0, + "7130": 33307574272.0, + "7135": 33307541504.0, + "7140": 33306765312.0, + "7145": 33307854848.0, + "7150": 33306853376.0, + "7155": 33307856896.0, + "7160": 33307906048.0, + "7165": 33308184576.0, + "7170": 33308272640.0, + "7175": 33306417152.0, + "7180": 33307107328.0, + "7185": 33307860992.0, + "7190": 33307078656.0, + "7195": 33307494400.0, + "7200": 33307613184.0, + "7205": 33307680768.0, + "7210": 33307990016.0, + "7215": 33306822656.0, + "7220": 33306730496.0, + "7225": 33307539456.0, + "7230": 33307744256.0, + "7235": 33306136576.0, + "7240": 33307189248.0, + "7245": 33307236352.0, + "7250": 33306980352.0, + "7255": 33307832320.0, + "7260": 33307426816.0, + "7265": 33307340800.0, + "7270": 33307844608.0, + "7275": 33308094464.0, + "7280": 33308602368.0, + "7285": 33307498496.0, + "7290": 
33307920384.0, + "7295": 33307426816.0, + "7300": 33306392576.0, + "7305": 33306718208.0, + "7310": 33307260928.0, + "7315": 33307527168.0, + "7320": 33306963968.0, + "7325": 33308188672.0, + "7330": 33307799552.0, + "7335": 33307717632.0, + "7340": 33307238400.0, + "7345": 33307365376.0, + "7350": 33307314176.0, + "7355": 33307940864.0, + "7360": 33306284032.0, + "7365": 33307893760.0, + "7370": 33306275840.0, + "7375": 33307873280.0, + "7380": 33309245440.0, + "7385": 33306730496.0, + "7390": 33307758592.0, + "7395": 33306609664.0, + "7400": 33307652096.0, + "7405": 33306427392.0, + "7410": 33308524544.0, + "7415": 33307961344.0, + "7420": 33307242496.0, + "7425": 33307811840.0, + "7430": 33307119616.0, + "7435": 33307428864.0, + "7440": 33307709440.0, + "7445": 33308342272.0, + "7450": 33306980352.0, + "7455": 33307351040.0, + "7460": 33306730496.0, + "7465": 33306537984.0, + "7470": 33307664384.0, + "7475": 33308037120.0, + "7480": 33307179008.0, + "7485": 33308467200.0, + "7490": 33307822080.0, + "7495": 33306638336.0, + "7500": 33306689536.0, + "7505": 33307717632.0, + "7510": 33306789888.0, + "7515": 33307518976.0, + "7520": 33307260928.0, + "7525": 33307676672.0, + "7530": 33306916864.0, + "7535": 33306996736.0, + "7540": 33306566656.0, + "7545": 33306720256.0, + "7550": 33307584512.0, + "7555": 33307471872.0, + "7560": 33306736640.0, + "7565": 33306292224.0, + "7570": 33307066368.0, + "7575": 33306871808.0, + "7580": 33307324416.0, + "7585": 33307115520.0, + "7590": 33306341376.0, + "7595": 33307744256.0, + "7600": 33307482112.0, + "7605": 33308149760.0, + "7610": 33307525120.0, + "7615": 33307656192.0, + "7620": 33307224064.0, + "7625": 33307158528.0, + "7630": 33307742208.0, + "7635": 33308012544.0, + "7640": 33307049984.0, + "7645": 33308631040.0, + "7650": 33307865088.0, + "7655": 33308229632.0, + "7660": 33307043840.0, + "7665": 33307037696.0, + "7670": 33306791936.0, + "7675": 33307320320.0, + "7680": 33307293696.0, + "7685": 33307432960.0, + "7690": 
33307103232.0, + "7695": 33307568128.0, + "7700": 33306312704.0, + "7705": 33307795456.0, + "7710": 33307996160.0, + "7715": 33307133952.0, + "7720": 33308164096.0, + "7725": 33307254784.0, + "7730": 33307830272.0, + "7735": 33307721728.0, + "7740": 33307492352.0, + "7745": 33307783168.0, + "7750": 33306728448.0, + "7755": 33307734016.0, + "7760": 33308614656.0, + "7765": 33306791936.0, + "7770": 33308278784.0, + "7775": 33307873280.0, + "7780": 33307078656.0, + "7785": 33306990592.0, + "7790": 33307062272.0, + "7795": 33307680768.0, + "7800": 33306982400.0, + "7805": 33308090368.0, + "7810": 33307308032.0, + "7815": 33307078656.0, + "7820": 33307951104.0, + "7825": 33306480640.0, + "7830": 33307258880.0, + "7835": 33307891712.0, + "7840": 33307432960.0, + "7845": 33307066368.0, + "7850": 33306910720.0, + "7855": 33307938816.0, + "7860": 33307308032.0, + "7865": 33308264448.0, + "7870": 33307729920.0, + "7875": 33308129280.0, + "7880": 33308352512.0, + "7885": 33307398144.0, + "7890": 33306920960.0, + "7895": 33307156480.0, + "7900": 33308221440.0, + "7905": 33308047360.0, + "7910": 33306146816.0, + "7915": 33306910720.0, + "7920": 33307090944.0, + "7925": 33308264448.0, + "7930": 33307908096.0, + "7935": 33307465728.0, + "7940": 33307375616.0, + "7945": 33307848704.0, + "7950": 33308090368.0, + "7955": 33307043840.0, + "7960": 33307168768.0, + "7965": 33307846656.0, + "7970": 33306454016.0, + "7975": 33307635712.0, + "7980": 33307555840.0, + "7985": 33307131904.0, + "7990": 33306732544.0, + "7995": 33307430912.0, + "8000": 33307674624.0, + "8005": 33307746304.0, + "8010": 33308002304.0, + "8015": 33306906624.0, + "8020": 33307895808.0, + "8025": 33308231680.0, + "8030": 33307664384.0, + "8035": 33306888192.0, + "8040": 33308024832.0, + "8045": 33307693056.0, + "8050": 33306583040.0, + "8055": 33307201536.0, + "8060": 33307594752.0, + "8065": 33308260352.0, + "8070": 33307426816.0, + "8075": 33308108800.0, + "8080": 33308178432.0, + "8085": 33307308032.0, + "8090": 
33306513408.0, + "8095": 33306968064.0, + "8100": 33308413952.0, + "8105": 33308241920.0, + "8110": 33307471872.0, + "8115": 33307832320.0, + "8120": 33307193344.0, + "8125": 33307295744.0, + "8130": 33306775552.0, + "8135": 33307097088.0, + "8140": 33307865088.0, + "8145": 33306746880.0, + "8150": 33307023360.0, + "8155": 33306806272.0, + "8160": 33307373568.0, + "8165": 33307631616.0, + "8170": 33306769408.0, + "8175": 33308239872.0, + "8180": 33307240448.0, + "8185": 33307471872.0, + "8190": 33308184576.0, + "8195": 33307754496.0, + "8200": 33307459584.0, + "8205": 33307850752.0, + "8210": 33306810368.0, + "8215": 33306222592.0, + "8220": 33307795456.0, + "8225": 33308078080.0, + "8230": 33306132480.0, + "8235": 33308764160.0, + "8240": 33307432960.0, + "8245": 33307867136.0, + "8250": 33308260352.0, + "8255": 33308334080.0, + "8260": 33308233728.0, + "8265": 33308528640.0, + "8270": 33307699200.0, + "8275": 33306748928.0, + "8280": 33307635712.0, + "8285": 33308008448.0, + "8290": 33307590656.0, + "8295": 33308041216.0, + "8300": 33307516928.0, + "8305": 33307879424.0, + "8310": 33307576320.0, + "8315": 33308366848.0, + "8320": 33307496448.0, + "8325": 33307256832.0, + "8330": 33307680768.0, + "8335": 33306669056.0, + "8340": 33306990592.0, + "8345": 33307936768.0, + "8350": 33307955200.0, + "8355": 33307791360.0, + "8360": 33306640384.0, + "8365": 33307586560.0, + "8370": 33307648000.0, + "8375": 33306890240.0, + "8380": 33307764736.0, + "8385": 33307871232.0, + "8390": 33307023360.0, + "8395": 33307664384.0, + "8400": 33307510784.0, + "8405": 33307338752.0, + "8410": 33307316224.0, + "8415": 33307566080.0, + "8420": 33307891712.0, + "8425": 33307676672.0, + "8430": 33307693056.0, + "8435": 33306812416.0, + "8440": 33307762688.0, + "8445": 33307447296.0, + "8450": 33307426816.0, + "8455": 33306660864.0, + "8460": 33307385856.0, + "8465": 33308121088.0, + "8470": 33307664384.0, + "8475": 33307023360.0, + "8480": 33308082176.0, + "8485": 33307346944.0, + "8490": 
33307471872.0, + "8495": 33307889664.0, + "8500": 33307492352.0, + "8505": 33307502592.0, + "8510": 33307815936.0, + "8515": 33307983872.0, + "8520": 33306431488.0, + "8525": 33306537984.0, + "8530": 33307199488.0, + "8535": 33307848704.0, + "8540": 33307459584.0, + "8545": 33307432960.0, + "8550": 33307600896.0, + "8555": 33308553216.0, + "8560": 33307701248.0, + "8565": 33307799552.0, + "8570": 33307934720.0, + "8575": 33306324992.0, + "8580": 33307648000.0, + "8585": 33307951104.0, + "8590": 33308108800.0, + "8595": 33308037120.0, + "8600": 33308182528.0, + "8605": 33307410432.0, + "8610": 33308102656.0, + "8615": 33307342848.0, + "8620": 33306077184.0, + "8625": 33308153856.0, + "8630": 33307807744.0, + "8635": 33306734592.0, + "8640": 33307867136.0, + "8645": 33307129856.0, + "8650": 33307430912.0, + "8655": 33307545600.0, + "8660": 33307975680.0, + "8665": 33307822080.0, + "8670": 33307156480.0, + "8675": 33307758592.0, + "8680": 33308340224.0, + "8685": 33307357184.0, + "8690": 33308479488.0, + "8695": 33306523648.0, + "8700": 33307404288.0, + "8705": 33307791360.0, + "8710": 33308004352.0, + "8715": 33308108800.0, + "8720": 33307424768.0, + "8725": 33307564032.0, + "8730": 33306877952.0, + "8735": 33307199488.0, + "8740": 33307734016.0, + "8745": 33307248640.0, + "8750": 33307912192.0, + "8755": 33307215872.0, + "8760": 33308012544.0, + "8765": 33306640384.0, + "8770": 33307977728.0, + "8775": 33306624000.0, + "8780": 33307357184.0, + "8785": 33306353664.0, + "8790": 33307518976.0, + "8795": 33308178432.0, + "8800": 33307113472.0, + "8805": 33307045888.0, + "8810": 33307252736.0, + "8815": 33307430912.0, + "8820": 33307568128.0, + "8825": 33306791936.0, + "8830": 33307529216.0, + "8835": 33306691584.0, + "8840": 33306529792.0, + "8845": 33307303936.0, + "8850": 33307901952.0, + "8855": 33308196864.0, + "8860": 33307965440.0, + "8865": 33307971584.0, + "8870": 33306595328.0, + "8875": 33306419200.0, + "8880": 33307508736.0, + "8885": 33306345472.0, + "8890": 
33307373568.0, + "8895": 33307631616.0, + "8900": 33307330560.0, + "8905": 33308209152.0, + "8910": 33308155904.0, + "8915": 33306943488.0, + "8920": 33307381760.0, + "8925": 33307437056.0, + "8930": 33308041216.0, + "8935": 33307142144.0, + "8940": 33307768832.0, + "8945": 33308551168.0, + "8950": 33307682816.0, + "8955": 33307656192.0, + "8960": 33307787264.0, + "8965": 33306220544.0, + "8970": 33307693056.0, + "8975": 33307529216.0, + "8980": 33307027456.0, + "8985": 33308442624.0, + "8990": 33307588608.0, + "8995": 33308315648.0, + "9000": 33307787264.0, + "9005": 33307951104.0, + "9010": 33305649152.0, + "9015": 33307592704.0, + "9020": 33307033600.0, + "9025": 33307232256.0, + "9030": 33307793408.0, + "9035": 33307385856.0, + "9040": 33308012544.0, + "9045": 33307287552.0, + "9050": 33307701248.0, + "9055": 33306814464.0, + "9060": 33307975680.0, + "9065": 33307693056.0, + "9070": 33306888192.0, + "9075": 33307168768.0, + "9080": 33306818560.0, + "9085": 33307557888.0, + "9090": 33308200960.0, + "9095": 33306867712.0, + "9100": 33308563456.0, + "9105": 33306994688.0, + "9110": 33307004928.0, + "9115": 33307439104.0, + "9120": 33307340800.0, + "9125": 33307295744.0, + "9130": 33306771456.0, + "9135": 33307031552.0, + "9140": 33306497024.0, + "9145": 33307629568.0, + "9150": 33308002304.0, + "9155": 33307484160.0, + "9160": 33308100608.0, + "9165": 33307611136.0, + "9170": 33307897856.0, + "9175": 33307473920.0, + "9180": 33307977728.0, + "9185": 33307203584.0, + "9190": 33306693632.0, + "9195": 33306931200.0, + "9200": 33307779072.0, + "9205": 33307205632.0, + "9210": 33307637760.0, + "9215": 33307090944.0, + "9220": 33308454912.0, + "9225": 33307471872.0, + "9230": 33307322368.0, + "9235": 33307422720.0, + "9240": 33307242496.0, + "9245": 33308026880.0, + "9250": 33308203008.0, + "9255": 33307389952.0, + "9260": 33308825600.0, + "9265": 33306505216.0, + "9270": 33307426816.0, + "9275": 33307865088.0, + "9280": 33307435008.0, + "9285": 33307258880.0, + "9290": 
33308000256.0, + "9295": 33307498496.0, + "9300": 33307301888.0, + "9305": 33307674624.0, + "9310": 33307031552.0, + "9315": 33306327040.0, + "9320": 33306834944.0, + "9325": 33307971584.0, + "9330": 33307910144.0, + "9335": 33307213824.0, + "9340": 33307385856.0, + "9345": 33307385856.0, + "9350": 33308127232.0, + "9355": 33306615808.0, + "9360": 33306697728.0, + "9365": 33307463680.0, + "9370": 33306355712.0, + "9375": 33307219968.0, + "9380": 33307224064.0, + "9385": 33308024832.0, + "9390": 33307830272.0, + "9395": 33307535360.0, + "9400": 33307031552.0, + "9405": 33307418624.0, + "9410": 33306822656.0, + "9415": 33307267072.0, + "9420": 33306994688.0, + "9425": 33306892288.0, + "9430": 33307199488.0, + "9435": 33306980352.0, + "9440": 33306451968.0, + "9445": 33308420096.0, + "9450": 33306755072.0, + "9455": 33306341376.0, + "9460": 33308131328.0, + "9465": 33307023360.0, + "9470": 33308307456.0, + "9475": 33308221440.0, + "9480": 33308037120.0, + "9485": 33308055552.0, + "9490": 33307908096.0, + "9495": 33306486784.0, + "9500": 33306490880.0, + "9505": 33307967488.0, + "9510": 33307125760.0, + "9515": 33307242496.0, + "9520": 33307670528.0, + "9525": 33307496448.0, + "9530": 33307731968.0, + "9535": 33307435008.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 9535, + "step_interval": 5, + "values": { + "1": 36905754624.0, + "5": 45014786048.0, + "10": 45173362688.0, + "15": 45173362688.0, + "20": 45251878912.0, + "25": 45286207488.0, + "30": 45286207488.0, + "35": 45288939520.0, + "40": 45288939520.0, + "45": 45288939520.0, + "50": 45288939520.0, + "55": 45288939520.0, + "60": 45288939520.0, + "65": 45288939520.0, + "70": 45288939520.0, + "75": 45288939520.0, + "80": 45288939520.0, + "85": 45288939520.0, + "90": 45288939520.0, + "95": 45288939520.0, + "100": 45288939520.0, + "105": 45288939520.0, + "110": 45299392512.0, + "115": 45314936832.0, + "120": 45378736128.0, + "125": 45428596736.0, + "130": 45428596736.0, + "135": 
45445640192.0, + "140": 45445640192.0, + "145": 45445640192.0, + "150": 45445640192.0, + "155": 45445640192.0, + "160": 45445640192.0, + "165": 45445640192.0, + "170": 45445640192.0, + "175": 45445640192.0, + "180": 45445640192.0, + "185": 45445640192.0, + "190": 45445640192.0, + "195": 45445640192.0, + "200": 45536641024.0, + "205": 45638885376.0, + "210": 45638885376.0, + "215": 45638885376.0, + "220": 45638885376.0, + "225": 45638885376.0, + "230": 45638885376.0, + "235": 45713887232.0, + "240": 45932376064.0, + "245": 45982269440.0, + "250": 45982269440.0, + "255": 45982269440.0, + "260": 46039670784.0, + "265": 46039670784.0, + "270": 46039670784.0, + "275": 46039670784.0, + "280": 46293884928.0, + "285": 46293884928.0, + "290": 46293884928.0, + "295": 46293884928.0, + "300": 46293884928.0, + "305": 46319267840.0, + "310": 46319267840.0, + "315": 46319267840.0, + "320": 46319267840.0, + "325": 46319267840.0, + "330": 46319267840.0, + "335": 46319267840.0, + "340": 46319267840.0, + "345": 46451261440.0, + "350": 46451261440.0, + "355": 46451261440.0, + "360": 46451261440.0, + "365": 46451261440.0, + "370": 46451261440.0, + "375": 46451261440.0, + "380": 46451261440.0, + "385": 46451261440.0, + "390": 46451261440.0, + "395": 46451261440.0, + "400": 46451261440.0, + "405": 46451261440.0, + "410": 46451261440.0, + "415": 46451261440.0, + "420": 46451261440.0, + "425": 46451261440.0, + "430": 46451261440.0, + "435": 46451261440.0, + "440": 46451261440.0, + "445": 46451261440.0, + "450": 46451261440.0, + "455": 46451261440.0, + "460": 46451261440.0, + "465": 46451261440.0, + "470": 46451261440.0, + "475": 46451261440.0, + "480": 46451261440.0, + "485": 46451261440.0, + "490": 46451261440.0, + "495": 46451261440.0, + "500": 46451261440.0, + "505": 46451261440.0, + "510": 46451261440.0, + "515": 46451261440.0, + "520": 46451261440.0, + "525": 46451261440.0, + "530": 46451261440.0, + "535": 46451261440.0, + "540": 46451261440.0, + "545": 46451261440.0, + "550": 
46451261440.0, + "555": 46451261440.0, + "560": 46451261440.0, + "565": 46451261440.0, + "570": 46451261440.0, + "575": 46451261440.0, + "580": 46451261440.0, + "585": 46451261440.0, + "590": 46451261440.0, + "595": 46451261440.0, + "600": 46451261440.0, + "605": 46451261440.0, + "610": 46451261440.0, + "615": 46451261440.0, + "620": 46451261440.0, + "625": 46451261440.0, + "630": 46451261440.0, + "635": 46451261440.0, + "640": 46451261440.0, + "645": 46451261440.0, + "650": 46451261440.0, + "655": 46451261440.0, + "660": 46451261440.0, + "665": 46451261440.0, + "670": 46451261440.0, + "675": 46451261440.0, + "680": 46451261440.0, + "685": 46451261440.0, + "690": 46451261440.0, + "695": 46451261440.0, + "700": 46451261440.0, + "705": 46451261440.0, + "710": 46451261440.0, + "715": 46451261440.0, + "720": 46451261440.0, + "725": 46451261440.0, + "730": 46451261440.0, + "735": 46451261440.0, + "740": 46451261440.0, + "745": 46451261440.0, + "750": 46451261440.0, + "755": 46451261440.0, + "760": 46451261440.0, + "765": 46451261440.0, + "770": 46451261440.0, + "775": 46451261440.0, + "780": 46451261440.0, + "785": 46451261440.0, + "790": 46451261440.0, + "795": 46451261440.0, + "800": 46451261440.0, + "805": 46451261440.0, + "810": 46451261440.0, + "815": 46451261440.0, + "820": 46451261440.0, + "825": 46451261440.0, + "830": 46451261440.0, + "835": 46451261440.0, + "840": 46451261440.0, + "845": 46451261440.0, + "850": 46451261440.0, + "855": 46451261440.0, + "860": 46451261440.0, + "865": 46451261440.0, + "870": 46451261440.0, + "875": 46451261440.0, + "880": 46451261440.0, + "885": 46451261440.0, + "890": 46451261440.0, + "895": 46451261440.0, + "900": 46451261440.0, + "905": 46451261440.0, + "910": 46451261440.0, + "915": 46451261440.0, + "920": 46451261440.0, + "925": 46451261440.0, + "930": 46451261440.0, + "935": 46451261440.0, + "940": 46451261440.0, + "945": 46451261440.0, + "950": 46451261440.0, + "955": 46451261440.0, + "960": 45564735488.0, + "965": 
45952081920.0, + "970": 45952081920.0, + "975": 46005657600.0, + "980": 46005657600.0, + "985": 46005657600.0, + "990": 46005657600.0, + "995": 46169923584.0, + "1000": 46169923584.0, + "1005": 46169923584.0, + "1010": 46169923584.0, + "1015": 46169923584.0, + "1020": 46169923584.0, + "1025": 46169923584.0, + "1030": 46169923584.0, + "1035": 46169923584.0, + "1040": 46169923584.0, + "1045": 46169923584.0, + "1050": 46169923584.0, + "1055": 46169923584.0, + "1060": 46169923584.0, + "1065": 46169923584.0, + "1070": 46169923584.0, + "1075": 46169923584.0, + "1080": 46169923584.0, + "1085": 46169923584.0, + "1090": 46169923584.0, + "1095": 46169923584.0, + "1100": 46169923584.0, + "1105": 46169923584.0, + "1110": 46169923584.0, + "1115": 46169923584.0, + "1120": 46169923584.0, + "1125": 46169923584.0, + "1130": 46169923584.0, + "1135": 46169923584.0, + "1140": 46169923584.0, + "1145": 46169923584.0, + "1150": 46169923584.0, + "1155": 46169923584.0, + "1160": 46169923584.0, + "1165": 46169923584.0, + "1170": 46169923584.0, + "1175": 46169923584.0, + "1180": 46192005120.0, + "1185": 46192005120.0, + "1190": 46192005120.0, + "1195": 46192005120.0, + "1200": 46192005120.0, + "1205": 46192005120.0, + "1210": 46192005120.0, + "1215": 46192005120.0, + "1220": 46192005120.0, + "1225": 46192005120.0, + "1230": 46192005120.0, + "1235": 46192005120.0, + "1240": 46192005120.0, + "1245": 46192005120.0, + "1250": 46192005120.0, + "1255": 46192005120.0, + "1260": 46192005120.0, + "1265": 46192005120.0, + "1270": 46192005120.0, + "1275": 46192005120.0, + "1280": 46192005120.0, + "1285": 46192005120.0, + "1290": 46192005120.0, + "1295": 46192005120.0, + "1300": 46192005120.0, + "1305": 46192005120.0, + "1310": 46192005120.0, + "1315": 46192005120.0, + "1320": 46192005120.0, + "1325": 46192005120.0, + "1330": 46192005120.0, + "1335": 46192005120.0, + "1340": 46192005120.0, + "1345": 46192005120.0, + "1350": 46192005120.0, + "1355": 46192005120.0, + "1360": 46192005120.0, + "1365": 
46192005120.0, + "1370": 46192005120.0, + "1375": 46192005120.0, + "1380": 46192005120.0, + "1385": 46192005120.0, + "1390": 46192005120.0, + "1395": 46192005120.0, + "1400": 46192005120.0, + "1405": 46192005120.0, + "1410": 46192005120.0, + "1415": 46192005120.0, + "1420": 46192005120.0, + "1425": 46192005120.0, + "1430": 46192005120.0, + "1435": 46192005120.0, + "1440": 46192005120.0, + "1445": 46192005120.0, + "1450": 46192005120.0, + "1455": 46192005120.0, + "1460": 46192005120.0, + "1465": 46192005120.0, + "1470": 46192005120.0, + "1475": 46192005120.0, + "1480": 46192005120.0, + "1485": 46192005120.0, + "1490": 46192005120.0, + "1495": 46192005120.0, + "1500": 46192005120.0, + "1505": 46192005120.0, + "1510": 46192005120.0, + "1515": 46192005120.0, + "1520": 46192005120.0, + "1525": 46192005120.0, + "1530": 46192005120.0, + "1535": 46192005120.0, + "1540": 46192005120.0, + "1545": 46192005120.0, + "1550": 46260322304.0, + "1555": 46260322304.0, + "1560": 46260322304.0, + "1565": 46260322304.0, + "1570": 46260322304.0, + "1575": 46260322304.0, + "1580": 46260322304.0, + "1585": 46260322304.0, + "1590": 46260322304.0, + "1595": 46260322304.0, + "1600": 46260322304.0, + "1605": 46260322304.0, + "1610": 46260322304.0, + "1615": 46260322304.0, + "1620": 46260322304.0, + "1625": 46260322304.0, + "1630": 46260322304.0, + "1635": 46260322304.0, + "1640": 46260322304.0, + "1645": 46260322304.0, + "1650": 46260322304.0, + "1655": 46260322304.0, + "1660": 46260322304.0, + "1665": 46260322304.0, + "1670": 46260322304.0, + "1675": 46260322304.0, + "1680": 46260322304.0, + "1685": 46260322304.0, + "1690": 46260322304.0, + "1695": 46260322304.0, + "1700": 46260322304.0, + "1705": 46260322304.0, + "1710": 46260322304.0, + "1715": 46260322304.0, + "1720": 46260322304.0, + "1725": 46260322304.0, + "1730": 46260322304.0, + "1735": 46260322304.0, + "1740": 46260322304.0, + "1745": 46260322304.0, + "1750": 46260322304.0, + "1755": 46260322304.0, + "1760": 46260322304.0, + "1765": 
46260322304.0, + "1770": 46260322304.0, + "1775": 46260322304.0, + "1780": 46260322304.0, + "1785": 46260322304.0, + "1790": 46260322304.0, + "1795": 46260322304.0, + "1800": 46260322304.0, + "1805": 46260322304.0, + "1810": 46260322304.0, + "1815": 46260322304.0, + "1820": 46260322304.0, + "1825": 46260322304.0, + "1830": 46260322304.0, + "1835": 46260322304.0, + "1840": 46260322304.0, + "1845": 46260322304.0, + "1850": 46260322304.0, + "1855": 46260322304.0, + "1860": 46260322304.0, + "1865": 46260322304.0, + "1870": 46260322304.0, + "1875": 46260322304.0, + "1880": 46260322304.0, + "1885": 46260322304.0, + "1890": 46260322304.0, + "1895": 46260322304.0, + "1900": 46260322304.0, + "1905": 46260322304.0, + "1910": 46260322304.0, + "1915": 46260322304.0, + "1920": 46260322304.0, + "1925": 46260322304.0, + "1930": 46260322304.0, + "1935": 46260322304.0, + "1940": 46260322304.0, + "1945": 46260322304.0, + "1950": 46260322304.0, + "1955": 46260322304.0, + "1960": 46260322304.0, + "1965": 46260322304.0, + "1970": 46260322304.0, + "1975": 46261714944.0, + "1980": 46261714944.0, + "1985": 46261714944.0, + "1990": 46261714944.0, + "1995": 46261714944.0, + "2000": 46261714944.0, + "2005": 46261714944.0, + "2010": 46261714944.0, + "2015": 46261714944.0, + "2020": 46261714944.0, + "2025": 46261714944.0, + "2030": 46261714944.0, + "2035": 46261714944.0, + "2040": 46261714944.0, + "2045": 46261714944.0, + "2050": 46261714944.0, + "2055": 46261714944.0, + "2060": 46261714944.0, + "2065": 46261714944.0, + "2070": 46261714944.0, + "2075": 46261714944.0, + "2080": 46261714944.0, + "2085": 46261714944.0, + "2090": 46261714944.0, + "2095": 46261714944.0, + "2100": 46261714944.0, + "2105": 46261714944.0, + "2110": 46261714944.0, + "2115": 46261714944.0, + "2120": 46261714944.0, + "2125": 46261714944.0, + "2130": 46261714944.0, + "2135": 46261714944.0, + "2140": 46261714944.0, + "2145": 46261714944.0, + "2150": 46261714944.0, + "2155": 46261714944.0, + "2160": 46261714944.0, + "2165": 
46261714944.0, + "2170": 46261714944.0, + "2175": 46261714944.0, + "2180": 46261714944.0, + "2185": 46261714944.0, + "2190": 46261714944.0, + "2195": 46261714944.0, + "2200": 46261714944.0, + "2205": 46261714944.0, + "2210": 46261714944.0, + "2215": 46261714944.0, + "2220": 46261714944.0, + "2225": 46261714944.0, + "2230": 46261714944.0, + "2235": 46261714944.0, + "2240": 46261714944.0, + "2245": 46261714944.0, + "2250": 46261714944.0, + "2255": 46261714944.0, + "2260": 46261714944.0, + "2265": 46261714944.0, + "2270": 46261714944.0, + "2275": 46261714944.0, + "2280": 46261714944.0, + "2285": 46261714944.0, + "2290": 46261714944.0, + "2295": 46261714944.0, + "2300": 46261714944.0, + "2305": 46261714944.0, + "2310": 46261714944.0, + "2315": 46261714944.0, + "2320": 46261714944.0, + "2325": 46261714944.0, + "2330": 46261714944.0, + "2335": 46261714944.0, + "2340": 46261714944.0, + "2345": 46261714944.0, + "2350": 46261714944.0, + "2355": 46261714944.0, + "2360": 46261714944.0, + "2365": 46261714944.0, + "2370": 46261714944.0, + "2375": 46261714944.0, + "2380": 46261714944.0, + "2385": 46261714944.0, + "2390": 46261714944.0, + "2395": 46261714944.0, + "2400": 46261714944.0, + "2405": 46261714944.0, + "2410": 46261714944.0, + "2415": 46261714944.0, + "2420": 46261714944.0, + "2425": 46261714944.0, + "2430": 46261714944.0, + "2435": 46261714944.0, + "2440": 46261714944.0, + "2445": 46261714944.0, + "2450": 46261714944.0, + "2455": 46261714944.0, + "2460": 46261714944.0, + "2465": 46261714944.0, + "2470": 46261714944.0, + "2475": 46261714944.0, + "2480": 46261714944.0, + "2485": 46261714944.0, + "2490": 46261714944.0, + "2495": 46261714944.0, + "2500": 46261714944.0, + "2505": 46261714944.0, + "2510": 46261714944.0, + "2515": 46261714944.0, + "2520": 46261714944.0, + "2525": 46261714944.0, + "2530": 46261714944.0, + "2535": 46261714944.0, + "2540": 46261714944.0, + "2545": 46261714944.0, + "2550": 46261714944.0, + "2555": 46261714944.0, + "2560": 46261714944.0, + "2565": 
46261714944.0, + "2570": 46261714944.0, + "2575": 46261714944.0, + "2580": 46261714944.0, + "2585": 46261714944.0, + "2590": 46261714944.0, + "2595": 46261714944.0, + "2600": 46261714944.0, + "2605": 46261714944.0, + "2610": 46261714944.0, + "2615": 46261714944.0, + "2620": 46261714944.0, + "2625": 46261714944.0, + "2630": 46261714944.0, + "2635": 46261714944.0, + "2640": 46261714944.0, + "2645": 46261714944.0, + "2650": 46261714944.0, + "2655": 46261714944.0, + "2660": 46261714944.0, + "2665": 46261714944.0, + "2670": 46261714944.0, + "2675": 46261714944.0, + "2680": 46261714944.0, + "2685": 46261714944.0, + "2690": 46261714944.0, + "2695": 46261714944.0, + "2700": 46261714944.0, + "2705": 46261714944.0, + "2710": 46261714944.0, + "2715": 46261714944.0, + "2720": 46261714944.0, + "2725": 46261714944.0, + "2730": 46261714944.0, + "2735": 46261714944.0, + "2740": 46261714944.0, + "2745": 46261714944.0, + "2750": 46261714944.0, + "2755": 46261714944.0, + "2760": 46261714944.0, + "2765": 46261714944.0, + "2770": 46261714944.0, + "2775": 46261714944.0, + "2780": 46261714944.0, + "2785": 46261714944.0, + "2790": 46261714944.0, + "2795": 46261714944.0, + "2800": 46261714944.0, + "2805": 46261714944.0, + "2810": 46261714944.0, + "2815": 46261714944.0, + "2820": 46261714944.0, + "2825": 46261714944.0, + "2830": 46261714944.0, + "2835": 46261714944.0, + "2840": 46261714944.0, + "2845": 46261714944.0, + "2850": 46261714944.0, + "2855": 46261714944.0, + "2860": 46261714944.0, + "2865": 46261714944.0, + "2870": 46261714944.0, + "2875": 46261714944.0, + "2880": 46261714944.0, + "2885": 46261714944.0, + "2890": 46261714944.0, + "2895": 46261714944.0, + "2900": 46261714944.0, + "2905": 46261714944.0, + "2910": 46261714944.0, + "2915": 46261714944.0, + "2920": 46261714944.0, + "2925": 46261714944.0, + "2930": 46261714944.0, + "2935": 46261714944.0, + "2940": 46261714944.0, + "2945": 46261714944.0, + "2950": 46261714944.0, + "2955": 46261714944.0, + "2960": 46261714944.0, + "2965": 
46261714944.0, + "2970": 46261714944.0, + "2975": 46261714944.0, + "2980": 46261714944.0, + "2985": 45706711040.0, + "2990": 45883699200.0, + "2995": 46072287232.0, + "3000": 46072287232.0, + "3005": 46072287232.0, + "3010": 46072287232.0, + "3015": 46072287232.0, + "3020": 46072287232.0, + "3025": 46072287232.0, + "3030": 46072287232.0, + "3035": 46072287232.0, + "3040": 46072287232.0, + "3045": 46072287232.0, + "3050": 46072287232.0, + "3055": 46072287232.0, + "3060": 46072287232.0, + "3065": 46072287232.0, + "3070": 46072287232.0, + "3075": 46072287232.0, + "3080": 46072287232.0, + "3085": 46072287232.0, + "3090": 46072287232.0, + "3095": 46072287232.0, + "3100": 46072287232.0, + "3105": 46072287232.0, + "3110": 46072287232.0, + "3115": 46072287232.0, + "3120": 46072287232.0, + "3125": 46072287232.0, + "3130": 46072287232.0, + "3135": 46072287232.0, + "3140": 46072287232.0, + "3145": 46072287232.0, + "3150": 46072287232.0, + "3155": 46072287232.0, + "3160": 46072287232.0, + "3165": 46072287232.0, + "3170": 46072287232.0, + "3175": 46072287232.0, + "3180": 46072287232.0, + "3185": 46072287232.0, + "3190": 46072287232.0, + "3195": 46072287232.0, + "3200": 46072287232.0, + "3205": 46072287232.0, + "3210": 46072287232.0, + "3215": 46072287232.0, + "3220": 46072287232.0, + "3225": 46072287232.0, + "3230": 46072287232.0, + "3235": 46072287232.0, + "3240": 46072287232.0, + "3245": 46072287232.0, + "3250": 46072287232.0, + "3255": 46072287232.0, + "3260": 46072287232.0, + "3265": 46072287232.0, + "3270": 46072287232.0, + "3275": 46072287232.0, + "3280": 46072287232.0, + "3285": 46072287232.0, + "3290": 46072287232.0, + "3295": 46072287232.0, + "3300": 46072287232.0, + "3305": 46072287232.0, + "3310": 46072287232.0, + "3315": 46072287232.0, + "3320": 46072287232.0, + "3325": 46072287232.0, + "3330": 46072287232.0, + "3335": 46072287232.0, + "3340": 46072287232.0, + "3345": 46072287232.0, + "3350": 46072287232.0, + "3355": 46072287232.0, + "3360": 46072287232.0, + "3365": 
46072287232.0, + "3370": 46072287232.0, + "3375": 46072287232.0, + "3380": 46072287232.0, + "3385": 46072287232.0, + "3390": 46072287232.0, + "3395": 46072287232.0, + "3400": 46072287232.0, + "3405": 46072287232.0, + "3410": 46072287232.0, + "3415": 46072287232.0, + "3420": 46072287232.0, + "3425": 46072672256.0, + "3430": 46072672256.0, + "3435": 46072672256.0, + "3440": 46072672256.0, + "3445": 46072672256.0, + "3450": 46072672256.0, + "3455": 46072672256.0, + "3460": 46072672256.0, + "3465": 46072672256.0, + "3470": 46072672256.0, + "3475": 46072672256.0, + "3480": 46072672256.0, + "3485": 46095564800.0, + "3490": 46095564800.0, + "3495": 46095564800.0, + "3500": 46095564800.0, + "3505": 46095564800.0, + "3510": 46095564800.0, + "3515": 46095564800.0, + "3520": 46095564800.0, + "3525": 46095564800.0, + "3530": 46095564800.0, + "3535": 46095564800.0, + "3540": 46095564800.0, + "3545": 46095564800.0, + "3550": 46191697920.0, + "3555": 46191697920.0, + "3560": 46191697920.0, + "3565": 46191697920.0, + "3570": 46191697920.0, + "3575": 46191697920.0, + "3580": 46191697920.0, + "3585": 46191697920.0, + "3590": 46191697920.0, + "3595": 46191697920.0, + "3600": 46191697920.0, + "3605": 46191697920.0, + "3610": 46191697920.0, + "3615": 46191697920.0, + "3620": 46191697920.0, + "3625": 46191697920.0, + "3630": 46191697920.0, + "3635": 46191697920.0, + "3640": 46191697920.0, + "3645": 46191697920.0, + "3650": 46191697920.0, + "3655": 46191697920.0, + "3660": 46191697920.0, + "3665": 46191697920.0, + "3670": 46191697920.0, + "3675": 46191697920.0, + "3680": 46191697920.0, + "3685": 46191697920.0, + "3690": 46191697920.0, + "3695": 46191697920.0, + "3700": 46191697920.0, + "3705": 46191697920.0, + "3710": 46191697920.0, + "3715": 46191697920.0, + "3720": 46191697920.0, + "3725": 46191697920.0, + "3730": 46191697920.0, + "3735": 46191697920.0, + "3740": 46191697920.0, + "3745": 46191697920.0, + "3750": 46191697920.0, + "3755": 46191697920.0, + "3760": 46191697920.0, + "3765": 
46191697920.0, + "3770": 46191697920.0, + "3775": 46191697920.0, + "3780": 46191697920.0, + "3785": 46191697920.0, + "3790": 46191697920.0, + "3795": 46191697920.0, + "3800": 46191697920.0, + "3805": 46191697920.0, + "3810": 46191697920.0, + "3815": 46191697920.0, + "3820": 46191697920.0, + "3825": 46191697920.0, + "3830": 46191697920.0, + "3835": 46191697920.0, + "3840": 46191697920.0, + "3845": 46191697920.0, + "3850": 46191697920.0, + "3855": 46191697920.0, + "3860": 46191697920.0, + "3865": 46191697920.0, + "3870": 46191697920.0, + "3875": 46191697920.0, + "3880": 46191697920.0, + "3885": 46191697920.0, + "3890": 46191697920.0, + "3895": 46191697920.0, + "3900": 46191697920.0, + "3905": 46191697920.0, + "3910": 46191697920.0, + "3915": 46191697920.0, + "3920": 46191697920.0, + "3925": 46191697920.0, + "3930": 46191697920.0, + "3935": 46191697920.0, + "3940": 46191697920.0, + "3945": 46191697920.0, + "3950": 46191697920.0, + "3955": 46191697920.0, + "3960": 46191697920.0, + "3965": 46191697920.0, + "3970": 46191697920.0, + "3975": 46191697920.0, + "3980": 46191697920.0, + "3985": 46191697920.0, + "3990": 46191697920.0, + "3995": 46191697920.0, + "4000": 45840449536.0, + "4005": 45869191168.0, + "4010": 45897973760.0, + "4015": 45897973760.0, + "4020": 45940301824.0, + "4025": 45940301824.0, + "4030": 45940301824.0, + "4035": 45940301824.0, + "4040": 45940301824.0, + "4045": 45940301824.0, + "4050": 45940301824.0, + "4055": 45940301824.0, + "4060": 45940301824.0, + "4065": 45940301824.0, + "4070": 45940301824.0, + "4075": 45940301824.0, + "4080": 45940301824.0, + "4085": 46009651200.0, + "4090": 46009651200.0, + "4095": 46009651200.0, + "4100": 46009651200.0, + "4105": 46009651200.0, + "4110": 46009651200.0, + "4115": 46009651200.0, + "4120": 46009651200.0, + "4125": 46009651200.0, + "4130": 46009651200.0, + "4135": 46009651200.0, + "4140": 46009651200.0, + "4145": 46009651200.0, + "4150": 46009651200.0, + "4155": 46009651200.0, + "4160": 46009651200.0, + "4165": 
46009651200.0, + "4170": 46009651200.0, + "4175": 46009651200.0, + "4180": 46009651200.0, + "4185": 46009651200.0, + "4190": 46009651200.0, + "4195": 46009651200.0, + "4200": 46009651200.0, + "4205": 46009651200.0, + "4210": 46009651200.0, + "4215": 46009651200.0, + "4220": 46009651200.0, + "4225": 46064635904.0, + "4230": 46064635904.0, + "4235": 46064635904.0, + "4240": 46064635904.0, + "4245": 46064635904.0, + "4250": 46064635904.0, + "4255": 46064635904.0, + "4260": 46064635904.0, + "4265": 46064635904.0, + "4270": 46064635904.0, + "4275": 46064635904.0, + "4280": 46064635904.0, + "4285": 46064635904.0, + "4290": 46064635904.0, + "4295": 46064635904.0, + "4300": 46064635904.0, + "4305": 46064635904.0, + "4310": 46064635904.0, + "4315": 46064635904.0, + "4320": 46064635904.0, + "4325": 46064635904.0, + "4330": 46064635904.0, + "4335": 46064635904.0, + "4340": 46064635904.0, + "4345": 46064635904.0, + "4350": 46064635904.0, + "4355": 46064635904.0, + "4360": 46064635904.0, + "4365": 46064635904.0, + "4370": 46064635904.0, + "4375": 46064635904.0, + "4380": 46064635904.0, + "4385": 46064635904.0, + "4390": 46064635904.0, + "4395": 46064635904.0, + "4400": 46064635904.0, + "4405": 46064635904.0, + "4410": 46064635904.0, + "4415": 46064635904.0, + "4420": 46064635904.0, + "4425": 46064635904.0, + "4430": 46064635904.0, + "4435": 46064635904.0, + "4440": 46064635904.0, + "4445": 46064635904.0, + "4450": 46064635904.0, + "4455": 46064635904.0, + "4460": 46080573440.0, + "4465": 46080573440.0, + "4470": 46080573440.0, + "4475": 46080573440.0, + "4480": 46080573440.0, + "4485": 46080573440.0, + "4490": 46080573440.0, + "4495": 46080573440.0, + "4500": 46080573440.0, + "4505": 46080573440.0, + "4510": 46080573440.0, + "4515": 46080573440.0, + "4520": 46080573440.0, + "4525": 46080573440.0, + "4530": 46080573440.0, + "4535": 46080573440.0, + "4540": 46080573440.0, + "4545": 46080573440.0, + "4550": 46080573440.0, + "4555": 46080573440.0, + "4560": 46080573440.0, + "4565": 
46080573440.0, + "4570": 46080573440.0, + "4575": 46080573440.0, + "4580": 46080573440.0, + "4585": 46080573440.0, + "4590": 46080573440.0, + "4595": 46080573440.0, + "4600": 46080573440.0, + "4605": 46080573440.0, + "4610": 46080573440.0, + "4615": 46343888896.0, + "4620": 46343888896.0, + "4625": 46343888896.0, + "4630": 46343888896.0, + "4635": 46343888896.0, + "4640": 46343888896.0, + "4645": 46343888896.0, + "4650": 46343888896.0, + "4655": 46343888896.0, + "4660": 46343888896.0, + "4665": 46343888896.0, + "4670": 46343888896.0, + "4675": 46343888896.0, + "4680": 46343888896.0, + "4685": 46343888896.0, + "4690": 46343888896.0, + "4695": 46343888896.0, + "4700": 46343888896.0, + "4705": 46343888896.0, + "4710": 46343888896.0, + "4715": 46343888896.0, + "4720": 46343888896.0, + "4725": 46343888896.0, + "4730": 46343888896.0, + "4735": 46343888896.0, + "4740": 46343888896.0, + "4745": 46343888896.0, + "4750": 46343888896.0, + "4755": 46343888896.0, + "4760": 46343888896.0, + "4765": 46343888896.0, + "4770": 46343888896.0, + "4775": 46343888896.0, + "4780": 46343888896.0, + "4785": 46343888896.0, + "4790": 46343888896.0, + "4795": 46343888896.0, + "4800": 46343888896.0, + "4805": 46343888896.0, + "4810": 46343888896.0, + "4815": 46343888896.0, + "4820": 46343888896.0, + "4825": 46343888896.0, + "4830": 46343888896.0, + "4835": 46343888896.0, + "4840": 46343888896.0, + "4845": 46343888896.0, + "4850": 46343888896.0, + "4855": 46343888896.0, + "4860": 46343888896.0, + "4865": 46343888896.0, + "4870": 46343888896.0, + "4875": 46343888896.0, + "4880": 46343888896.0, + "4885": 46343888896.0, + "4890": 46343888896.0, + "4895": 46343888896.0, + "4900": 46343888896.0, + "4905": 46343888896.0, + "4910": 46343888896.0, + "4915": 46343888896.0, + "4920": 46343888896.0, + "4925": 46343888896.0, + "4930": 46343888896.0, + "4935": 46343888896.0, + "4940": 46343888896.0, + "4945": 46343888896.0, + "4950": 46343888896.0, + "4955": 46343888896.0, + "4960": 46343888896.0, + "4965": 
46343888896.0, + "4970": 46343888896.0, + "4975": 46343888896.0, + "4980": 46343888896.0, + "4985": 46343888896.0, + "4990": 46343888896.0, + "4995": 46343888896.0, + "5000": 46343888896.0, + "5005": 46199529472.0, + "5010": 46199529472.0, + "5015": 45764182016.0, + "5020": 45878784000.0, + "5025": 45878784000.0, + "5030": 45878784000.0, + "5035": 45878784000.0, + "5040": 45992685568.0, + "5045": 45992685568.0, + "5050": 45992685568.0, + "5055": 45992685568.0, + "5060": 45992685568.0, + "5065": 45992685568.0, + "5070": 45992685568.0, + "5075": 45992685568.0, + "5080": 45992685568.0, + "5085": 45992685568.0, + "5090": 45992685568.0, + "5095": 46014451712.0, + "5100": 46014451712.0, + "5105": 46014451712.0, + "5110": 46014451712.0, + "5115": 46014451712.0, + "5120": 46014451712.0, + "5125": 46014451712.0, + "5130": 46014451712.0, + "5135": 46014451712.0, + "5140": 46014451712.0, + "5145": 46014451712.0, + "5150": 46014451712.0, + "5155": 46014451712.0, + "5160": 46014451712.0, + "5165": 46014451712.0, + "5170": 46014451712.0, + "5175": 46014451712.0, + "5180": 46014451712.0, + "5185": 46014451712.0, + "5190": 46014451712.0, + "5195": 46014451712.0, + "5200": 46139572224.0, + "5205": 46139572224.0, + "5210": 46139572224.0, + "5215": 46139572224.0, + "5220": 46168403968.0, + "5225": 46168403968.0, + "5230": 46168403968.0, + "5235": 46168403968.0, + "5240": 46168403968.0, + "5245": 46168403968.0, + "5250": 46168403968.0, + "5255": 46168403968.0, + "5260": 46168403968.0, + "5265": 46168403968.0, + "5270": 46168403968.0, + "5275": 46168403968.0, + "5280": 46168403968.0, + "5285": 46168403968.0, + "5290": 46168403968.0, + "5295": 46168403968.0, + "5300": 46168403968.0, + "5305": 46168403968.0, + "5310": 46168403968.0, + "5315": 46168403968.0, + "5320": 46168403968.0, + "5325": 46168403968.0, + "5330": 46168403968.0, + "5335": 46168403968.0, + "5340": 46168403968.0, + "5345": 46168403968.0, + "5350": 46168403968.0, + "5355": 46168403968.0, + "5360": 46168403968.0, + "5365": 
46168403968.0, + "5370": 46168403968.0, + "5375": 46168403968.0, + "5380": 46168403968.0, + "5385": 46168403968.0, + "5390": 46168403968.0, + "5395": 46168403968.0, + "5400": 46168403968.0, + "5405": 46168403968.0, + "5410": 46168403968.0, + "5415": 46168403968.0, + "5420": 46168403968.0, + "5425": 46168403968.0, + "5430": 46168403968.0, + "5435": 46168403968.0, + "5440": 46168403968.0, + "5445": 46168403968.0, + "5450": 46168403968.0, + "5455": 46168403968.0, + "5460": 46168403968.0, + "5465": 46168403968.0, + "5470": 46168403968.0, + "5475": 46168403968.0, + "5480": 46168403968.0, + "5485": 46168403968.0, + "5490": 46168403968.0, + "5495": 46168403968.0, + "5500": 46168403968.0, + "5505": 46168403968.0, + "5510": 46168403968.0, + "5515": 46168403968.0, + "5520": 46168403968.0, + "5525": 46168403968.0, + "5530": 46168403968.0, + "5535": 46168403968.0, + "5540": 46168403968.0, + "5545": 46168403968.0, + "5550": 46168403968.0, + "5555": 46168403968.0, + "5560": 46168403968.0, + "5565": 46168403968.0, + "5570": 46168403968.0, + "5575": 46168403968.0, + "5580": 46168403968.0, + "5585": 46168403968.0, + "5590": 46168403968.0, + "5595": 46168403968.0, + "5600": 46168403968.0, + "5605": 46226247680.0, + "5610": 46226247680.0, + "5615": 46226247680.0, + "5620": 46226247680.0, + "5625": 46226247680.0, + "5630": 46226247680.0, + "5635": 46226247680.0, + "5640": 46226247680.0, + "5645": 46226247680.0, + "5650": 46226247680.0, + "5655": 46226247680.0, + "5660": 46226247680.0, + "5665": 46226247680.0, + "5670": 46226247680.0, + "5675": 46226247680.0, + "5680": 46226247680.0, + "5685": 46226247680.0, + "5690": 46226247680.0, + "5695": 46226247680.0, + "5700": 46226247680.0, + "5705": 46226247680.0, + "5710": 46226247680.0, + "5715": 46226247680.0, + "5720": 46226247680.0, + "5725": 46226247680.0, + "5730": 46226247680.0, + "5735": 46226247680.0, + "5740": 46226247680.0, + "5745": 46226247680.0, + "5750": 46226247680.0, + "5755": 46226247680.0, + "5760": 46226247680.0, + "5765": 
46226247680.0, + "5770": 46226247680.0, + "5775": 46226247680.0, + "5780": 46226247680.0, + "5785": 46226247680.0, + "5790": 46226247680.0, + "5795": 46226247680.0, + "5800": 46226247680.0, + "5805": 46226247680.0, + "5810": 46226247680.0, + "5815": 46226247680.0, + "5820": 46226247680.0, + "5825": 46226247680.0, + "5830": 46226247680.0, + "5835": 46226247680.0, + "5840": 46226247680.0, + "5845": 46226247680.0, + "5850": 46226247680.0, + "5855": 46226247680.0, + "5860": 46226247680.0, + "5865": 46226247680.0, + "5870": 46226247680.0, + "5875": 46226247680.0, + "5880": 46226247680.0, + "5885": 46226247680.0, + "5890": 46226247680.0, + "5895": 46226247680.0, + "5900": 46226247680.0, + "5905": 46226247680.0, + "5910": 46226247680.0, + "5915": 46226247680.0, + "5920": 46226247680.0, + "5925": 46226247680.0, + "5930": 46226247680.0, + "5935": 46226247680.0, + "5940": 46226247680.0, + "5945": 46226247680.0, + "5950": 46226247680.0, + "5955": 46226247680.0, + "5960": 46226247680.0, + "5965": 46226247680.0, + "5970": 46226247680.0, + "5975": 46226247680.0, + "5980": 46226247680.0, + "5985": 46226247680.0, + "5990": 46226247680.0, + "5995": 46226247680.0, + "6000": 46226247680.0, + "6005": 46226247680.0, + "6010": 46226247680.0, + "6015": 46226247680.0, + "6020": 46226247680.0, + "6025": 46226247680.0, + "6030": 45912186880.0, + "6035": 45912186880.0, + "6040": 45995683840.0, + "6045": 45995683840.0, + "6050": 45995683840.0, + "6055": 45995683840.0, + "6060": 45995683840.0, + "6065": 45995683840.0, + "6070": 45995683840.0, + "6075": 46014836736.0, + "6080": 46014836736.0, + "6085": 46014836736.0, + "6090": 46014836736.0, + "6095": 46014836736.0, + "6100": 46014836736.0, + "6105": 46014836736.0, + "6110": 46014836736.0, + "6115": 46014836736.0, + "6120": 46014836736.0, + "6125": 46014836736.0, + "6130": 46014836736.0, + "6135": 46014836736.0, + "6140": 46014836736.0, + "6145": 46014836736.0, + "6150": 46014836736.0, + "6155": 46014836736.0, + "6160": 46014836736.0, + "6165": 
46025334784.0, + "6170": 46025334784.0, + "6175": 46025334784.0, + "6180": 46025334784.0, + "6185": 46035255296.0, + "6190": 46035255296.0, + "6195": 46035255296.0, + "6200": 46035255296.0, + "6205": 46035255296.0, + "6210": 46035255296.0, + "6215": 46035255296.0, + "6220": 46035255296.0, + "6225": 46035255296.0, + "6230": 46035255296.0, + "6235": 46035255296.0, + "6240": 46035255296.0, + "6245": 46035255296.0, + "6250": 46035255296.0, + "6255": 46035255296.0, + "6260": 46035255296.0, + "6265": 46035255296.0, + "6270": 46035255296.0, + "6275": 46035255296.0, + "6280": 46035255296.0, + "6285": 46035255296.0, + "6290": 46035255296.0, + "6295": 46035255296.0, + "6300": 46035255296.0, + "6305": 46035255296.0, + "6310": 46035255296.0, + "6315": 46035255296.0, + "6320": 46035255296.0, + "6325": 46035255296.0, + "6330": 46035255296.0, + "6335": 46035255296.0, + "6340": 46035255296.0, + "6345": 46035255296.0, + "6350": 46035255296.0, + "6355": 46035255296.0, + "6360": 46035255296.0, + "6365": 46035255296.0, + "6370": 46035255296.0, + "6375": 46035255296.0, + "6380": 46035255296.0, + "6385": 46035255296.0, + "6390": 46035255296.0, + "6395": 46035255296.0, + "6400": 46035255296.0, + "6405": 46035255296.0, + "6410": 46035255296.0, + "6415": 46035255296.0, + "6420": 46035255296.0, + "6425": 46035255296.0, + "6430": 46035255296.0, + "6435": 46035255296.0, + "6440": 46035255296.0, + "6445": 46035255296.0, + "6450": 46035255296.0, + "6455": 46035255296.0, + "6460": 46035255296.0, + "6465": 46035255296.0, + "6470": 46035255296.0, + "6475": 46035255296.0, + "6480": 46035255296.0, + "6485": 46035255296.0, + "6490": 46035255296.0, + "6495": 46035255296.0, + "6500": 46035255296.0, + "6505": 46064041984.0, + "6510": 46064041984.0, + "6515": 46064041984.0, + "6520": 46064041984.0, + "6525": 46064041984.0, + "6530": 46064041984.0, + "6535": 46064041984.0, + "6540": 46064041984.0, + "6545": 46064041984.0, + "6550": 46064041984.0, + "6555": 46064041984.0, + "6560": 46064041984.0, + "6565": 
46064041984.0, + "6570": 46064041984.0, + "6575": 46064041984.0, + "6580": 46064041984.0, + "6585": 46064041984.0, + "6590": 46064041984.0, + "6595": 46064041984.0, + "6600": 46064041984.0, + "6605": 46064041984.0, + "6610": 46064041984.0, + "6615": 46064041984.0, + "6620": 46064041984.0, + "6625": 46064041984.0, + "6630": 46064041984.0, + "6635": 46064041984.0, + "6640": 46064041984.0, + "6645": 46064041984.0, + "6650": 46064041984.0, + "6655": 46064041984.0, + "6660": 46064041984.0, + "6665": 46064041984.0, + "6670": 46064041984.0, + "6675": 46064041984.0, + "6680": 46064041984.0, + "6685": 46064041984.0, + "6690": 46064041984.0, + "6695": 46064041984.0, + "6700": 46064041984.0, + "6705": 46064041984.0, + "6710": 46064041984.0, + "6715": 46064041984.0, + "6720": 46064041984.0, + "6725": 46064041984.0, + "6730": 46064041984.0, + "6735": 46064041984.0, + "6740": 46064041984.0, + "6745": 46064041984.0, + "6750": 46064041984.0, + "6755": 46064041984.0, + "6760": 46064041984.0, + "6765": 46064041984.0, + "6770": 46064041984.0, + "6775": 46064041984.0, + "6780": 46064041984.0, + "6785": 46064041984.0, + "6790": 46064041984.0, + "6795": 46064041984.0, + "6800": 46064041984.0, + "6805": 46064041984.0, + "6810": 46064041984.0, + "6815": 46064041984.0, + "6820": 46064041984.0, + "6825": 46064041984.0, + "6830": 46064041984.0, + "6835": 46064041984.0, + "6840": 46064041984.0, + "6845": 46064041984.0, + "6850": 46064041984.0, + "6855": 46064041984.0, + "6860": 46064041984.0, + "6865": 46064041984.0, + "6870": 46064041984.0, + "6875": 46064041984.0, + "6880": 46064041984.0, + "6885": 46064041984.0, + "6890": 46064041984.0, + "6895": 46064041984.0, + "6900": 46064041984.0, + "6905": 46064041984.0, + "6910": 46064041984.0, + "6915": 46064041984.0, + "6920": 46064041984.0, + "6925": 46064041984.0, + "6930": 46064041984.0, + "6935": 46064041984.0, + "6940": 46064041984.0, + "6945": 46064041984.0, + "6950": 46064041984.0, + "6955": 46064041984.0, + "6960": 46064041984.0, + "6965": 
46064041984.0, + "6970": 46064041984.0, + "6975": 46064041984.0, + "6980": 46064041984.0, + "6985": 46064041984.0, + "6990": 46064041984.0, + "6995": 46064041984.0, + "7000": 46064041984.0, + "7005": 46064041984.0, + "7010": 46064041984.0, + "7015": 46064041984.0, + "7020": 46064041984.0, + "7025": 46064041984.0, + "7030": 46108979200.0, + "7035": 46108979200.0, + "7040": 46108979200.0, + "7045": 46108979200.0, + "7050": 46065532928.0, + "7055": 46065532928.0, + "7060": 46065532928.0, + "7065": 46065532928.0, + "7070": 46065532928.0, + "7075": 46065532928.0, + "7080": 46065532928.0, + "7085": 46065532928.0, + "7090": 46065532928.0, + "7095": 46065532928.0, + "7100": 46065532928.0, + "7105": 46065532928.0, + "7110": 46065532928.0, + "7115": 46065532928.0, + "7120": 46065532928.0, + "7125": 46065532928.0, + "7130": 46065532928.0, + "7135": 46065532928.0, + "7140": 46065532928.0, + "7145": 46065532928.0, + "7150": 46065532928.0, + "7155": 46065532928.0, + "7160": 46065532928.0, + "7165": 46065532928.0, + "7170": 46065532928.0, + "7175": 46065532928.0, + "7180": 46065532928.0, + "7185": 46065532928.0, + "7190": 46065532928.0, + "7195": 46065532928.0, + "7200": 46065532928.0, + "7205": 46065532928.0, + "7210": 46065532928.0, + "7215": 46065532928.0, + "7220": 46065532928.0, + "7225": 46065532928.0, + "7230": 46065532928.0, + "7235": 46065532928.0, + "7240": 46065532928.0, + "7245": 46065532928.0, + "7250": 46065532928.0, + "7255": 46065532928.0, + "7260": 46065532928.0, + "7265": 46065532928.0, + "7270": 46065532928.0, + "7275": 46065532928.0, + "7280": 46065532928.0, + "7285": 46065532928.0, + "7290": 46065532928.0, + "7295": 46065532928.0, + "7300": 46065532928.0, + "7305": 46065532928.0, + "7310": 46065532928.0, + "7315": 46065532928.0, + "7320": 46065532928.0, + "7325": 46065532928.0, + "7330": 46065532928.0, + "7335": 46065532928.0, + "7340": 46065532928.0, + "7345": 46065532928.0, + "7350": 46065532928.0, + "7355": 46065532928.0, + "7360": 46065532928.0, + "7365": 
46065532928.0, + "7370": 46065532928.0, + "7375": 46065532928.0, + "7380": 46065532928.0, + "7385": 46065532928.0, + "7390": 46065532928.0, + "7395": 46065532928.0, + "7400": 46065532928.0, + "7405": 46065532928.0, + "7410": 46065532928.0, + "7415": 46065532928.0, + "7420": 46065532928.0, + "7425": 46065532928.0, + "7430": 46065532928.0, + "7435": 46065532928.0, + "7440": 46065532928.0, + "7445": 46065532928.0, + "7450": 46065532928.0, + "7455": 46065532928.0, + "7460": 46065532928.0, + "7465": 46065532928.0, + "7470": 46065532928.0, + "7475": 46065532928.0, + "7480": 46065532928.0, + "7485": 46065532928.0, + "7490": 46065532928.0, + "7495": 46065532928.0, + "7500": 46065532928.0, + "7505": 46065532928.0, + "7510": 46065532928.0, + "7515": 46065532928.0, + "7520": 45618061312.0, + "7525": 45747933184.0, + "7530": 45825024000.0, + "7535": 45825024000.0, + "7540": 45825024000.0, + "7545": 45910597632.0, + "7550": 45910597632.0, + "7555": 45910597632.0, + "7560": 45910597632.0, + "7565": 45910597632.0, + "7570": 45910597632.0, + "7575": 45910597632.0, + "7580": 45910597632.0, + "7585": 45910597632.0, + "7590": 45910597632.0, + "7595": 45916950528.0, + "7600": 45924253696.0, + "7605": 45924253696.0, + "7610": 45924253696.0, + "7615": 45924253696.0, + "7620": 45924253696.0, + "7625": 45924253696.0, + "7630": 45924253696.0, + "7635": 45924253696.0, + "7640": 45924253696.0, + "7645": 45944950784.0, + "7650": 45944950784.0, + "7655": 45944950784.0, + "7660": 45944950784.0, + "7665": 45944950784.0, + "7670": 45944950784.0, + "7675": 45944950784.0, + "7680": 45944950784.0, + "7685": 45944950784.0, + "7690": 45944950784.0, + "7695": 45944950784.0, + "7700": 45944950784.0, + "7705": 45944950784.0, + "7710": 45944950784.0, + "7715": 45944950784.0, + "7720": 45944950784.0, + "7725": 45944950784.0, + "7730": 45944950784.0, + "7735": 45944950784.0, + "7740": 45944950784.0, + "7745": 45944950784.0, + "7750": 45944950784.0, + "7755": 45944950784.0, + "7760": 45944950784.0, + "7765": 
45944950784.0, + "7770": 45944950784.0, + "7775": 45944950784.0, + "7780": 45944950784.0, + "7785": 45944950784.0, + "7790": 45944950784.0, + "7795": 45944950784.0, + "7800": 45944950784.0, + "7805": 45944950784.0, + "7810": 45944950784.0, + "7815": 45944950784.0, + "7820": 45944950784.0, + "7825": 45944950784.0, + "7830": 45944950784.0, + "7835": 45944950784.0, + "7840": 45973135360.0, + "7845": 45973135360.0, + "7850": 46089904128.0, + "7855": 46089904128.0, + "7860": 46089904128.0, + "7865": 46089904128.0, + "7870": 46089904128.0, + "7875": 46089904128.0, + "7880": 46089904128.0, + "7885": 46089904128.0, + "7890": 46089904128.0, + "7895": 46089904128.0, + "7900": 46089904128.0, + "7905": 46089904128.0, + "7910": 46089904128.0, + "7915": 46089904128.0, + "7920": 46089904128.0, + "7925": 46089904128.0, + "7930": 46089904128.0, + "7935": 46089904128.0, + "7940": 46089904128.0, + "7945": 46089904128.0, + "7950": 46089904128.0, + "7955": 46089904128.0, + "7960": 46089904128.0, + "7965": 46089904128.0, + "7970": 46089904128.0, + "7975": 46089904128.0, + "7980": 46089904128.0, + "7985": 46089904128.0, + "7990": 46089904128.0, + "7995": 46089904128.0, + "8000": 46089904128.0, + "8005": 46089904128.0, + "8010": 46089904128.0, + "8015": 46089904128.0, + "8020": 46089904128.0, + "8025": 46089904128.0, + "8030": 46089904128.0, + "8035": 46089904128.0, + "8040": 46089904128.0, + "8045": 46089904128.0, + "8050": 46089904128.0, + "8055": 46089904128.0, + "8060": 46089904128.0, + "8065": 46089904128.0, + "8070": 46089904128.0, + "8075": 46089904128.0, + "8080": 46089904128.0, + "8085": 46089904128.0, + "8090": 46089904128.0, + "8095": 46089904128.0, + "8100": 46089904128.0, + "8105": 46089904128.0, + "8110": 46089904128.0, + "8115": 46089904128.0, + "8120": 46089904128.0, + "8125": 46089904128.0, + "8130": 46089904128.0, + "8135": 46089904128.0, + "8140": 46089904128.0, + "8145": 46089904128.0, + "8150": 46089904128.0, + "8155": 46089904128.0, + "8160": 46089904128.0, + "8165": 
46089904128.0, + "8170": 46089904128.0, + "8175": 46089904128.0, + "8180": 46089904128.0, + "8185": 46089904128.0, + "8190": 46089904128.0, + "8195": 46089904128.0, + "8200": 46089904128.0, + "8205": 46089904128.0, + "8210": 46089904128.0, + "8215": 46089904128.0, + "8220": 46089904128.0, + "8225": 46089904128.0, + "8230": 46089904128.0, + "8235": 46089904128.0, + "8240": 46089904128.0, + "8245": 46089904128.0, + "8250": 46089904128.0, + "8255": 46089904128.0, + "8260": 46089904128.0, + "8265": 46089904128.0, + "8270": 46089904128.0, + "8275": 46089904128.0, + "8280": 46089904128.0, + "8285": 46089904128.0, + "8290": 46089904128.0, + "8295": 46089904128.0, + "8300": 46089904128.0, + "8305": 46089904128.0, + "8310": 46089904128.0, + "8315": 46089904128.0, + "8320": 46089904128.0, + "8325": 46089904128.0, + "8330": 46089904128.0, + "8335": 46089904128.0, + "8340": 46089904128.0, + "8345": 46089904128.0, + "8350": 46089904128.0, + "8355": 46089904128.0, + "8360": 46089904128.0, + "8365": 46089904128.0, + "8370": 46089904128.0, + "8375": 46089904128.0, + "8380": 46089904128.0, + "8385": 46089904128.0, + "8390": 46089904128.0, + "8395": 46089904128.0, + "8400": 46089904128.0, + "8405": 46089904128.0, + "8410": 46089904128.0, + "8415": 46089904128.0, + "8420": 46089904128.0, + "8425": 46089904128.0, + "8430": 46089904128.0, + "8435": 46089904128.0, + "8440": 46089904128.0, + "8445": 46089904128.0, + "8450": 46089904128.0, + "8455": 46089904128.0, + "8460": 46089904128.0, + "8465": 46089904128.0, + "8470": 46089904128.0, + "8475": 46089904128.0, + "8480": 46089904128.0, + "8485": 46089904128.0, + "8490": 46089904128.0, + "8495": 46089904128.0, + "8500": 46089904128.0, + "8505": 46089904128.0, + "8510": 46089904128.0, + "8515": 46089904128.0, + "8520": 46089904128.0, + "8525": 46089904128.0, + "8530": 45938114560.0, + "8535": 45938114560.0, + "8540": 45938114560.0, + "8545": 45938114560.0, + "8550": 45938114560.0, + "8555": 45938114560.0, + "8560": 45938114560.0, + "8565": 
45938114560.0, + "8570": 45938114560.0, + "8575": 45938114560.0, + "8580": 45938114560.0, + "8585": 45938114560.0, + "8590": 45950377984.0, + "8595": 45950377984.0, + "8600": 45950377984.0, + "8605": 45950377984.0, + "8610": 45950377984.0, + "8615": 45950377984.0, + "8620": 45950377984.0, + "8625": 45950377984.0, + "8630": 45950377984.0, + "8635": 45950377984.0, + "8640": 45950377984.0, + "8645": 45950377984.0, + "8650": 45950377984.0, + "8655": 45950377984.0, + "8660": 45950377984.0, + "8665": 45950377984.0, + "8670": 45955510272.0, + "8675": 45955510272.0, + "8680": 45955510272.0, + "8685": 45955510272.0, + "8690": 45991550976.0, + "8695": 45991550976.0, + "8700": 45991550976.0, + "8705": 45991550976.0, + "8710": 45991550976.0, + "8715": 45991550976.0, + "8720": 45991550976.0, + "8725": 45991550976.0, + "8730": 45991550976.0, + "8735": 45991550976.0, + "8740": 46068584448.0, + "8745": 46068584448.0, + "8750": 46068584448.0, + "8755": 46068584448.0, + "8760": 46068584448.0, + "8765": 46068584448.0, + "8770": 46068584448.0, + "8775": 46068584448.0, + "8780": 46068584448.0, + "8785": 46068584448.0, + "8790": 46068584448.0, + "8795": 46068584448.0, + "8800": 46068584448.0, + "8805": 46068584448.0, + "8810": 46068584448.0, + "8815": 46068584448.0, + "8820": 46068584448.0, + "8825": 46068584448.0, + "8830": 46068584448.0, + "8835": 46068584448.0, + "8840": 46068584448.0, + "8845": 46068584448.0, + "8850": 46068584448.0, + "8855": 46184767488.0, + "8860": 46184767488.0, + "8865": 46184767488.0, + "8870": 46184767488.0, + "8875": 46184767488.0, + "8880": 46184767488.0, + "8885": 46184767488.0, + "8890": 46184767488.0, + "8895": 46184767488.0, + "8900": 46184767488.0, + "8905": 46184767488.0, + "8910": 46184767488.0, + "8915": 46184767488.0, + "8920": 46184767488.0, + "8925": 46184767488.0, + "8930": 46184767488.0, + "8935": 46184767488.0, + "8940": 46184767488.0, + "8945": 46184767488.0, + "8950": 46184767488.0, + "8955": 46184767488.0, + "8960": 46184767488.0, + "8965": 
46184767488.0, + "8970": 46184767488.0, + "8975": 46184767488.0, + "8980": 46184767488.0, + "8985": 46184767488.0, + "8990": 46184767488.0, + "8995": 46184767488.0, + "9000": 46184767488.0, + "9005": 46184767488.0, + "9010": 46184767488.0, + "9015": 46184767488.0, + "9020": 46184767488.0, + "9025": 46184767488.0, + "9030": 46184767488.0, + "9035": 46184767488.0, + "9040": 46184767488.0, + "9045": 46184767488.0, + "9050": 46184767488.0, + "9055": 46184767488.0, + "9060": 46184767488.0, + "9065": 46184767488.0, + "9070": 46184767488.0, + "9075": 46184767488.0, + "9080": 46184767488.0, + "9085": 46184767488.0, + "9090": 46184767488.0, + "9095": 46184767488.0, + "9100": 46184767488.0, + "9105": 46184767488.0, + "9110": 46184767488.0, + "9115": 46184767488.0, + "9120": 46184767488.0, + "9125": 46184767488.0, + "9130": 46184767488.0, + "9135": 46184767488.0, + "9140": 46184767488.0, + "9145": 46184767488.0, + "9150": 46184767488.0, + "9155": 46184767488.0, + "9160": 46184767488.0, + "9165": 46184767488.0, + "9170": 46184767488.0, + "9175": 46184767488.0, + "9180": 46184767488.0, + "9185": 46184767488.0, + "9190": 46184767488.0, + "9195": 46184767488.0, + "9200": 46184767488.0, + "9205": 46184767488.0, + "9210": 46184767488.0, + "9215": 46184767488.0, + "9220": 46184767488.0, + "9225": 46184767488.0, + "9230": 46184767488.0, + "9235": 46184767488.0, + "9240": 46184767488.0, + "9245": 46184767488.0, + "9250": 46184767488.0, + "9255": 46184767488.0, + "9260": 46184767488.0, + "9265": 46184767488.0, + "9270": 46184767488.0, + "9275": 46184767488.0, + "9280": 46184767488.0, + "9285": 46184767488.0, + "9290": 46184767488.0, + "9295": 46184767488.0, + "9300": 46184767488.0, + "9305": 46184767488.0, + "9310": 46184767488.0, + "9315": 46184767488.0, + "9320": 46184767488.0, + "9325": 46184767488.0, + "9330": 46184767488.0, + "9335": 46184767488.0, + "9340": 46184767488.0, + "9345": 46184767488.0, + "9350": 46184767488.0, + "9355": 46184767488.0, + "9360": 46184767488.0, + "9365": 
46184767488.0, + "9370": 46184767488.0, + "9375": 46184767488.0, + "9380": 46184767488.0, + "9385": 46184767488.0, + "9390": 46184767488.0, + "9395": 46184767488.0, + "9400": 46184767488.0, + "9405": 46184767488.0, + "9410": 46184767488.0, + "9415": 46184767488.0, + "9420": 46184767488.0, + "9425": 46184767488.0, + "9430": 46184767488.0, + "9435": 46184767488.0, + "9440": 46184767488.0, + "9445": 46184767488.0, + "9450": 46184767488.0, + "9455": 46184767488.0, + "9460": 46184767488.0, + "9465": 46184767488.0, + "9470": 46184767488.0, + "9475": 46184767488.0, + "9480": 46184767488.0, + "9485": 46184767488.0, + "9490": 46184767488.0, + "9495": 46184767488.0, + "9500": 46184767488.0, + "9505": 46184767488.0, + "9510": 46184767488.0, + "9515": 46184767488.0, + "9520": 46184767488.0, + "9525": 46184767488.0, + "9530": 46184767488.0, + "9535": 46184767488.0 + } + }, + "mtp_1 loss": { + "start_step": 1, + "end_step": 9535, + "step_interval": 5, + "values": { + "1": 13.88878, + "5": 13.88979, + "10": 13.88767, + "15": 13.88576, + "20": 13.88068, + "25": 13.87774, + "30": 13.85566, + "35": 13.84855, + "40": 13.84546, + "45": 13.82693, + "50": 13.74828, + "55": 13.7249, + "60": 13.70841, + "65": 13.67571, + "70": 13.63981, + "75": 13.44327, + "80": 13.36054, + "85": 13.2835, + "90": 13.18641, + "95": 13.0505, + "100": 12.90733, + "105": 12.74689, + "110": 12.48525, + "115": 12.26801, + "120": 12.04358, + "125": 11.87011, + "130": 11.74911, + "135": 11.5841, + "140": 11.3494, + "145": 11.26997, + "150": 11.11919, + "155": 11.0211, + "160": 10.88133, + "165": 10.75162, + "170": 10.65694, + "175": 10.59566, + "180": 10.43546, + "185": 10.42441, + "190": 10.27183, + "195": 10.2539, + "200": 10.12718, + "205": 9.97472, + "210": 9.94271, + "215": 9.92122, + "220": 9.78944, + "225": 9.77014, + "230": 9.73, + "235": 9.64372, + "240": 9.57366, + "245": 9.50499, + "250": 9.43776, + "255": 9.37037, + "260": 9.29579, + "265": 9.2411, + "270": 9.15629, + "275": 9.12851, + "280": 9.10516, 
+ "285": 9.09815, + "290": 9.01068, + "295": 8.94828, + "300": 8.83207, + "305": 8.80663, + "310": 8.74389, + "315": 8.71813, + "320": 8.68425, + "325": 8.58706, + "330": 8.56208, + "335": 8.53307, + "340": 8.52937, + "345": 8.41091, + "350": 8.39973, + "355": 8.29759, + "360": 8.38348, + "365": 8.28981, + "370": 8.2833, + "375": 8.22588, + "380": 8.18359, + "385": 8.16998, + "390": 8.1467, + "395": 8.09789, + "400": 8.01583, + "405": 8.01349, + "410": 8.00377, + "415": 7.95012, + "420": 7.93109, + "425": 7.88677, + "430": 7.81895, + "435": 7.82989, + "440": 7.77278, + "445": 7.7493, + "450": 7.67877, + "455": 7.7063, + "460": 7.6532, + "465": 7.6329, + "470": 7.59885, + "475": 7.61277, + "480": 7.48436, + "485": 7.53153, + "490": 7.48574, + "495": 7.4714, + "500": 7.41282, + "505": 7.41932, + "510": 7.38698, + "515": 7.35645, + "520": 7.35102, + "525": 7.32559, + "530": 7.32588, + "535": 7.30357, + "540": 7.2179, + "545": 7.24022, + "550": 7.27618, + "555": 7.30238, + "560": 7.23984, + "565": 7.16321, + "570": 7.17228, + "575": 7.18898, + "580": 7.11497, + "585": 7.11901, + "590": 7.06121, + "595": 7.04317, + "600": 7.06682, + "605": 7.06137, + "610": 7.01939, + "615": 7.078, + "620": 6.98113, + "625": 6.95612, + "630": 6.96104, + "635": 6.98871, + "640": 6.96819, + "645": 6.95817, + "650": 7.00625, + "655": 7.00242, + "660": 6.89823, + "665": 6.88159, + "670": 6.84888, + "675": 6.93827, + "680": 6.89638, + "685": 6.85679, + "690": 6.83445, + "695": 6.79719, + "700": 6.79183, + "705": 6.78625, + "710": 6.82275, + "715": 6.82665, + "720": 6.71137, + "725": 6.76643, + "730": 6.75579, + "735": 6.75515, + "740": 6.70045, + "745": 6.67565, + "750": 6.73564, + "755": 6.65767, + "760": 6.66496, + "765": 6.65951, + "770": 6.68075, + "775": 6.65453, + "780": 6.62427, + "785": 6.64321, + "790": 6.59399, + "795": 6.59812, + "800": 6.5878, + "805": 6.65391, + "810": 6.51946, + "815": 6.5419, + "820": 6.55134, + "825": 6.55855, + "830": 6.57041, + "835": 6.52603, + "840": 
6.49033, + "845": 6.54438, + "850": 6.49874, + "855": 6.49335, + "860": 6.49024, + "865": 6.49642, + "870": 6.46222, + "875": 6.51054, + "880": 6.4748, + "885": 6.43786, + "890": 6.51246, + "895": 6.39629, + "900": 6.41895, + "905": 6.44341, + "910": 6.40617, + "915": 6.38978, + "920": 6.38772, + "925": 6.37391, + "930": 6.40825, + "935": 6.39755, + "940": 6.34172, + "945": 6.36869, + "950": 6.3953, + "955": 6.34893, + "960": 6.35406, + "965": 6.25416, + "970": 6.32381, + "975": 6.31262, + "980": 6.28797, + "985": 6.29222, + "990": 6.34527, + "995": 6.26326, + "1000": 6.28434, + "1005": 6.23155, + "1010": 6.26712, + "1015": 6.29352, + "1020": 6.20454, + "1025": 6.21082, + "1030": 6.20913, + "1035": 6.29924, + "1040": 6.22531, + "1045": 6.19943, + "1050": 6.2267, + "1055": 6.21777, + "1060": 6.1673, + "1065": 6.15758, + "1070": 6.19281, + "1075": 6.19093, + "1080": 6.19319, + "1085": 6.19606, + "1090": 6.17796, + "1095": 6.181, + "1100": 6.1397, + "1105": 6.11513, + "1110": 6.17787, + "1115": 6.11231, + "1120": 6.05286, + "1125": 6.08699, + "1130": 6.14167, + "1135": 6.09531, + "1140": 6.08221, + "1145": 6.06731, + "1150": 6.09458, + "1155": 6.06298, + "1160": 6.04607, + "1165": 6.09676, + "1170": 6.07336, + "1175": 6.04568, + "1180": 6.05058, + "1185": 6.04124, + "1190": 6.04961, + "1195": 6.02949, + "1200": 5.97329, + "1205": 6.07601, + "1210": 5.93751, + "1215": 5.98403, + "1220": 6.06306, + "1225": 5.95152, + "1230": 5.99877, + "1235": 5.95912, + "1240": 5.99322, + "1245": 5.97187, + "1250": 5.95299, + "1255": 5.94742, + "1260": 5.95227, + "1265": 5.93352, + "1270": 5.90818, + "1275": 5.96805, + "1280": 5.90416, + "1285": 5.92308, + "1290": 5.90725, + "1295": 5.92, + "1300": 5.9267, + "1305": 5.90057, + "1310": 5.83908, + "1315": 5.8992, + "1320": 5.89614, + "1325": 5.8271, + "1330": 5.88462, + "1335": 5.8531, + "1340": 5.91994, + "1345": 5.86667, + "1350": 5.84738, + "1355": 5.84415, + "1360": 5.85216, + "1365": 5.84478, + "1370": 5.79663, + "1375": 5.80667, + 
"1380": 5.86219, + "1385": 5.81826, + "1390": 5.81231, + "1395": 5.8299, + "1400": 5.83135, + "1405": 5.82032, + "1410": 5.78518, + "1415": 5.77017, + "1420": 5.8049, + "1425": 5.79565, + "1430": 5.83189, + "1435": 5.74562, + "1440": 5.76408, + "1445": 5.8071, + "1450": 5.78859, + "1455": 5.80534, + "1460": 5.75975, + "1465": 5.76379, + "1470": 5.8044, + "1475": 5.76985, + "1480": 5.77563, + "1485": 5.72396, + "1490": 5.72354, + "1495": 5.74538, + "1500": 5.75109, + "1505": 5.72321, + "1510": 5.74832, + "1515": 5.67052, + "1520": 5.70302, + "1525": 5.67385, + "1530": 5.69497, + "1535": 5.68565, + "1540": 5.672, + "1545": 5.7178, + "1550": 5.72274, + "1555": 5.70942, + "1560": 5.65211, + "1565": 5.69926, + "1570": 5.71179, + "1575": 5.6613, + "1580": 5.69275, + "1585": 5.67221, + "1590": 5.66087, + "1595": 5.63673, + "1600": 5.70849, + "1605": 5.64113, + "1610": 5.64353, + "1615": 5.63334, + "1620": 5.65496, + "1625": 5.64982, + "1630": 5.62727, + "1635": 5.67706, + "1640": 5.62761, + "1645": 5.6449, + "1650": 5.63803, + "1655": 5.62499, + "1660": 5.61278, + "1665": 5.60116, + "1670": 5.61214, + "1675": 5.62193, + "1680": 5.56155, + "1685": 5.57098, + "1690": 5.55098, + "1695": 5.55521, + "1700": 5.60178, + "1705": 5.57706, + "1710": 5.58407, + "1715": 5.54721, + "1720": 5.52704, + "1725": 5.56718, + "1730": 5.53148, + "1735": 5.58307, + "1740": 5.52337, + "1745": 5.55772, + "1750": 5.53213, + "1755": 5.5301, + "1760": 5.55304, + "1765": 5.5132, + "1770": 5.522, + "1775": 5.52704, + "1780": 5.53997, + "1785": 5.48896, + "1790": 5.52187, + "1795": 5.52448, + "1800": 5.4698, + "1805": 5.46326, + "1810": 5.47869, + "1815": 5.48464, + "1820": 5.48466, + "1825": 5.48352, + "1830": 5.46909, + "1835": 5.46355, + "1840": 5.46633, + "1845": 5.44723, + "1850": 5.42996, + "1855": 5.4834, + "1860": 5.43502, + "1865": 5.44302, + "1870": 5.43258, + "1875": 5.42823, + "1880": 5.491, + "1885": 5.45039, + "1890": 5.44132, + "1895": 5.38084, + "1900": 5.42123, + "1905": 5.41299, + 
"1910": 5.43539, + "1915": 5.4013, + "1920": 5.37729, + "1925": 5.4085, + "1930": 5.37579, + "1935": 5.39731, + "1940": 5.3727, + "1945": 5.4174, + "1950": 5.45899, + "1955": 5.39197, + "1960": 5.39342, + "1965": 5.34213, + "1970": 5.34023, + "1975": 5.40413, + "1980": 5.35398, + "1985": 5.37376, + "1990": 5.39658, + "1995": 5.37398, + "2000": 5.38469, + "2005": 5.42838, + "2010": 5.32884, + "2015": 5.32047, + "2020": 5.32991, + "2025": 5.37403, + "2030": 5.31228, + "2035": 5.33119, + "2040": 5.29466, + "2045": 5.38332, + "2050": 5.35716, + "2055": 5.33062, + "2060": 5.32903, + "2065": 5.29751, + "2070": 5.29985, + "2075": 5.32708, + "2080": 5.29709, + "2085": 5.32918, + "2090": 5.24905, + "2095": 5.29587, + "2100": 5.25777, + "2105": 5.28625, + "2110": 5.28042, + "2115": 5.28102, + "2120": 5.2839, + "2125": 5.24699, + "2130": 5.25602, + "2135": 5.25599, + "2140": 5.26607, + "2145": 5.22772, + "2150": 5.24774, + "2155": 5.22588, + "2160": 5.24123, + "2165": 5.22937, + "2170": 5.26626, + "2175": 5.2603, + "2180": 5.24294, + "2185": 5.24675, + "2190": 5.22691, + "2195": 5.20127, + "2200": 5.20409, + "2205": 5.2127, + "2210": 5.25738, + "2215": 5.30103, + "2220": 5.24446, + "2225": 5.2194, + "2230": 5.21789, + "2235": 5.25766, + "2240": 5.16329, + "2245": 5.1607, + "2250": 5.18607, + "2255": 5.19635, + "2260": 5.13701, + "2265": 5.21276, + "2270": 5.14278, + "2275": 5.19722, + "2280": 5.17159, + "2285": 5.18798, + "2290": 5.17456, + "2295": 5.18141, + "2300": 5.17912, + "2305": 5.15551, + "2310": 5.1834, + "2315": 5.12144, + "2320": 5.17039, + "2325": 5.14984, + "2330": 5.15156, + "2335": 5.13195, + "2340": 5.13852, + "2345": 5.18732, + "2350": 5.12945, + "2355": 5.11891, + "2360": 5.10445, + "2365": 5.11898, + "2370": 5.10258, + "2375": 5.11122, + "2380": 5.05395, + "2385": 5.09747, + "2390": 5.11702, + "2395": 5.1322, + "2400": 5.07944, + "2405": 5.06236, + "2410": 5.11554, + "2415": 5.09106, + "2420": 5.10878, + "2425": 5.06863, + "2430": 5.09273, + "2435": 
5.08666, + "2440": 5.07515, + "2445": 5.08608, + "2450": 5.04943, + "2455": 5.09523, + "2460": 5.04536, + "2465": 5.08334, + "2470": 5.07644, + "2475": 5.11246, + "2480": 5.02872, + "2485": 5.05906, + "2490": 5.05297, + "2495": 5.04377, + "2500": 5.04447, + "2505": 5.05124, + "2510": 5.0909, + "2515": 5.08005, + "2520": 5.02414, + "2525": 5.03617, + "2530": 5.05281, + "2535": 5.04127, + "2540": 5.04342, + "2545": 5.05498, + "2550": 4.99288, + "2555": 5.05988, + "2560": 5.03403, + "2565": 5.00279, + "2570": 5.02524, + "2575": 4.98811, + "2580": 5.00235, + "2585": 4.98259, + "2590": 5.00195, + "2595": 4.95577, + "2600": 4.99616, + "2605": 5.01565, + "2610": 5.00846, + "2615": 4.9779, + "2620": 4.96, + "2625": 4.99167, + "2630": 4.92069, + "2635": 5.00179, + "2640": 5.00217, + "2645": 4.95857, + "2650": 4.98056, + "2655": 4.97276, + "2660": 4.91658, + "2665": 5.00931, + "2670": 4.95271, + "2675": 4.92627, + "2680": 4.95939, + "2685": 4.9606, + "2690": 4.92299, + "2695": 4.99925, + "2700": 4.90798, + "2705": 4.92161, + "2710": 4.9625, + "2715": 4.94083, + "2720": 4.97062, + "2725": 4.91977, + "2730": 4.9445, + "2735": 4.9369, + "2740": 4.92939, + "2745": 4.89678, + "2750": 4.93832, + "2755": 4.94144, + "2760": 4.94244, + "2765": 4.91315, + "2770": 4.95527, + "2775": 4.90029, + "2780": 4.93753, + "2785": 4.91159, + "2790": 4.93952, + "2795": 4.89812, + "2800": 4.84327, + "2805": 4.89103, + "2810": 4.88284, + "2815": 4.89434, + "2820": 4.93504, + "2825": 4.92479, + "2830": 4.90086, + "2835": 4.90451, + "2840": 4.89553, + "2845": 4.87238, + "2850": 4.90777, + "2855": 4.83628, + "2860": 4.89239, + "2865": 4.90134, + "2870": 4.89048, + "2875": 4.90822, + "2880": 4.82774, + "2885": 4.8758, + "2890": 4.84909, + "2895": 4.88906, + "2900": 4.84436, + "2905": 4.85096, + "2910": 4.84745, + "2915": 4.89554, + "2920": 4.87192, + "2925": 4.84408, + "2930": 4.83304, + "2935": 4.83856, + "2940": 4.8364, + "2945": 4.80087, + "2950": 4.79094, + "2955": 4.79257, + "2960": 4.81394, + 
"2965": 4.82244, + "2970": 4.83033, + "2975": 4.843, + "2980": 4.78708, + "2985": 4.83546, + "2990": 4.84632, + "2995": 4.79479, + "3000": 4.79957, + "3005": 4.7852, + "3010": 4.81747, + "3015": 4.77707, + "3020": 4.79613, + "3025": 4.80689, + "3030": 4.81521, + "3035": 4.81107, + "3040": 4.83014, + "3045": 4.81253, + "3050": 4.78854, + "3055": 4.79109, + "3060": 4.77291, + "3065": 4.80026, + "3070": 4.82011, + "3075": 4.75177, + "3080": 4.78059, + "3085": 4.7825, + "3090": 4.76596, + "3095": 4.80833, + "3100": 4.79656, + "3105": 4.77177, + "3110": 4.76085, + "3115": 4.71609, + "3120": 4.78235, + "3125": 4.74714, + "3130": 4.75497, + "3135": 4.75435, + "3140": 4.7318, + "3145": 4.71606, + "3150": 4.74842, + "3155": 4.78313, + "3160": 4.765, + "3165": 4.75911, + "3170": 4.7541, + "3175": 4.746, + "3180": 4.73371, + "3185": 4.70655, + "3190": 4.70906, + "3195": 4.70876, + "3200": 4.67795, + "3205": 4.72527, + "3210": 4.67973, + "3215": 4.71138, + "3220": 4.67941, + "3225": 4.71501, + "3230": 4.698, + "3235": 4.73415, + "3240": 4.68214, + "3245": 4.6954, + "3250": 4.64543, + "3255": 4.69551, + "3260": 4.67926, + "3265": 4.72582, + "3270": 4.70744, + "3275": 4.65457, + "3280": 4.68021, + "3285": 4.69583, + "3290": 4.66845, + "3295": 4.67202, + "3300": 4.66858, + "3305": 4.67172, + "3310": 4.66314, + "3315": 4.70829, + "3320": 4.64885, + "3325": 4.65812, + "3330": 4.64245, + "3335": 4.65293, + "3340": 4.62608, + "3345": 4.64548, + "3350": 4.65071, + "3355": 4.65765, + "3360": 4.64823, + "3365": 4.66194, + "3370": 4.63984, + "3375": 4.67722, + "3380": 4.61449, + "3385": 4.62869, + "3390": 4.60608, + "3395": 4.6967, + "3400": 4.64188, + "3405": 4.6721, + "3410": 4.60581, + "3415": 4.55337, + "3420": 4.61467, + "3425": 4.63228, + "3430": 4.66874, + "3435": 4.63419, + "3440": 4.65338, + "3445": 4.60093, + "3450": 4.59889, + "3455": 4.62429, + "3460": 4.58089, + "3465": 4.57689, + "3470": 4.59454, + "3475": 4.60079, + "3480": 4.59374, + "3485": 4.62356, + "3490": 4.60917, + 
"3495": 4.63221, + "3500": 4.59027, + "3505": 4.59844, + "3510": 4.59797, + "3515": 4.648, + "3520": 4.62554, + "3525": 4.57245, + "3530": 4.58587, + "3535": 4.58174, + "3540": 4.63653, + "3545": 4.56212, + "3550": 4.62056, + "3555": 4.55332, + "3560": 4.62414, + "3565": 4.55473, + "3570": 4.56696, + "3575": 4.53468, + "3580": 4.59878, + "3585": 4.58068, + "3590": 4.51872, + "3595": 4.58848, + "3600": 4.55395, + "3605": 4.53571, + "3610": 4.54008, + "3615": 4.56874, + "3620": 4.61691, + "3625": 4.55023, + "3630": 4.59867, + "3635": 4.50879, + "3640": 4.52782, + "3645": 4.56947, + "3650": 4.53552, + "3655": 4.54665, + "3660": 4.55228, + "3665": 4.58643, + "3670": 4.54047, + "3675": 4.55594, + "3680": 4.57348, + "3685": 4.49418, + "3690": 4.54299, + "3695": 4.49297, + "3700": 4.52866, + "3705": 4.50654, + "3710": 4.51966, + "3715": 4.53, + "3720": 4.50118, + "3725": 4.47886, + "3730": 4.4879, + "3735": 4.50546, + "3740": 4.49399, + "3745": 4.48041, + "3750": 4.51288, + "3755": 4.48915, + "3760": 4.50004, + "3765": 4.47669, + "3770": 4.48984, + "3775": 4.46969, + "3780": 4.45476, + "3785": 4.50898, + "3790": 4.42336, + "3795": 4.4846, + "3800": 4.46028, + "3805": 4.46023, + "3810": 4.42629, + "3815": 4.4806, + "3820": 4.4736, + "3825": 4.4803, + "3830": 4.46747, + "3835": 4.42638, + "3840": 4.52349, + "3845": 4.48225, + "3850": 4.42266, + "3855": 4.46223, + "3860": 4.48001, + "3865": 4.44144, + "3870": 4.50523, + "3875": 4.41439, + "3880": 4.42672, + "3885": 4.44983, + "3890": 4.43819, + "3895": 4.38007, + "3900": 4.43434, + "3905": 4.41283, + "3910": 4.42081, + "3915": 4.42082, + "3920": 4.41329, + "3925": 4.39336, + "3930": 4.41243, + "3935": 4.41903, + "3940": 4.41848, + "3945": 4.39397, + "3950": 4.46098, + "3955": 4.39087, + "3960": 4.43851, + "3965": 4.44901, + "3970": 4.39272, + "3975": 4.40242, + "3980": 4.37236, + "3985": 4.40832, + "3990": 4.40208, + "3995": 4.44335, + "4000": 4.38322, + "4005": 4.37255, + "4010": 4.40982, + "4015": 4.39813, + "4020": 
4.43488, + "4025": 4.39111, + "4030": 4.44761, + "4035": 4.40548, + "4040": 4.43553, + "4045": 4.41155, + "4050": 4.40643, + "4055": 4.41393, + "4060": 4.40665, + "4065": 4.41291, + "4070": 4.34904, + "4075": 4.37708, + "4080": 4.35797, + "4085": 4.39736, + "4090": 4.37437, + "4095": 4.35826, + "4100": 4.37323, + "4105": 4.36208, + "4110": 4.32609, + "4115": 4.39421, + "4120": 4.31057, + "4125": 4.31168, + "4130": 4.39302, + "4135": 4.37289, + "4140": 4.31616, + "4145": 4.32788, + "4150": 4.37558, + "4155": 4.29766, + "4160": 4.35633, + "4165": 4.38157, + "4170": 4.32646, + "4175": 4.33285, + "4180": 4.32735, + "4185": 4.31953, + "4190": 4.31017, + "4195": 4.31525, + "4200": 4.31406, + "4205": 4.37, + "4210": 4.32695, + "4215": 4.3562, + "4220": 4.33701, + "4225": 4.32036, + "4230": 4.30579, + "4235": 4.35051, + "4240": 4.30872, + "4245": 4.31564, + "4250": 4.29999, + "4255": 4.31166, + "4260": 4.29019, + "4265": 4.30554, + "4270": 4.29954, + "4275": 4.36276, + "4280": 4.29798, + "4285": 4.33284, + "4290": 4.27741, + "4295": 4.30368, + "4300": 4.32594, + "4305": 4.29066, + "4310": 4.33408, + "4315": 4.3163, + "4320": 4.30571, + "4325": 4.32764, + "4330": 4.26525, + "4335": 4.30418, + "4340": 4.28838, + "4345": 4.23753, + "4350": 4.25927, + "4355": 4.33009, + "4360": 4.30543, + "4365": 4.30411, + "4370": 4.28149, + "4375": 4.24372, + "4380": 4.25559, + "4385": 4.23331, + "4390": 4.30895, + "4395": 4.27518, + "4400": 4.26254, + "4405": 4.23007, + "4410": 4.28048, + "4415": 4.26816, + "4420": 4.24916, + "4425": 4.29252, + "4430": 4.24244, + "4435": 4.29049, + "4440": 4.28601, + "4445": 4.24232, + "4450": 4.20719, + "4455": 4.26016, + "4460": 4.23459, + "4465": 4.25243, + "4470": 4.23841, + "4475": 4.2641, + "4480": 4.24909, + "4485": 4.23389, + "4490": 4.23593, + "4495": 4.17962, + "4500": 4.25444, + "4505": 4.22942, + "4510": 4.23965, + "4515": 4.19566, + "4520": 4.23113, + "4525": 4.19456, + "4530": 4.24001, + "4535": 4.20166, + "4540": 4.21127, + "4545": 4.23188, + 
"4550": 4.27088, + "4555": 4.2072, + "4560": 4.22378, + "4565": 4.15426, + "4570": 4.21606, + "4575": 4.1941, + "4580": 4.25747, + "4585": 4.22428, + "4590": 4.21266, + "4595": 4.17399, + "4600": 4.16313, + "4605": 4.2045, + "4610": 4.19939, + "4615": 4.24443, + "4620": 4.16447, + "4625": 4.19099, + "4630": 4.20991, + "4635": 4.18208, + "4640": 4.21078, + "4645": 4.20652, + "4650": 4.22758, + "4655": 4.19246, + "4660": 4.18248, + "4665": 4.193, + "4670": 4.23574, + "4675": 4.17989, + "4680": 4.20859, + "4685": 4.19688, + "4690": 4.1723, + "4695": 4.18485, + "4700": 4.16546, + "4705": 4.14067, + "4710": 4.20305, + "4715": 4.19002, + "4720": 4.14737, + "4725": 4.12216, + "4730": 4.17809, + "4735": 4.10178, + "4740": 4.14697, + "4745": 4.18779, + "4750": 4.13615, + "4755": 4.19424, + "4760": 4.1984, + "4765": 4.1461, + "4770": 4.14849, + "4775": 4.14773, + "4780": 4.15523, + "4785": 4.13664, + "4790": 4.19224, + "4795": 4.17628, + "4800": 4.13942, + "4805": 4.17839, + "4810": 4.1375, + "4815": 4.17167, + "4820": 4.12226, + "4825": 4.17474, + "4830": 4.16985, + "4835": 4.14976, + "4840": 4.15298, + "4845": 4.10968, + "4850": 4.17354, + "4855": 4.17639, + "4860": 4.11236, + "4865": 4.13759, + "4870": 4.13215, + "4875": 4.17643, + "4880": 4.1702, + "4885": 4.13029, + "4890": 4.1249, + "4895": 4.12403, + "4900": 4.09958, + "4905": 4.09173, + "4910": 4.09074, + "4915": 4.14665, + "4920": 4.12021, + "4925": 4.08814, + "4930": 4.09778, + "4935": 4.12094, + "4940": 4.04981, + "4945": 4.13369, + "4950": 4.07708, + "4955": 4.15684, + "4960": 4.11652, + "4965": 4.1151, + "4970": 4.09971, + "4975": 4.11736, + "4980": 4.12585, + "4985": 4.12754, + "4990": 4.09005, + "4995": 4.12916, + "5000": 4.05682, + "5005": 4.11701, + "5010": 4.10942, + "5015": 4.07584, + "5020": 4.05201, + "5025": 4.06082, + "5030": 4.10005, + "5035": 4.08177, + "5040": 4.0418, + "5045": 4.11064, + "5050": 4.06425, + "5055": 4.08995, + "5060": 4.03143, + "5065": 4.09666, + "5070": 4.07056, + "5075": 4.12386, 
+ "5080": 4.07795, + "5085": 4.09595, + "5090": 4.07748, + "5095": 4.0424, + "5100": 4.0782, + "5105": 4.0809, + "5110": 4.08612, + "5115": 4.07663, + "5120": 4.09438, + "5125": 4.05976, + "5130": 4.06327, + "5135": 4.0488, + "5140": 4.06922, + "5145": 4.05942, + "5150": 4.07092, + "5155": 4.07553, + "5160": 4.05549, + "5165": 4.09766, + "5170": 3.96642, + "5175": 4.07515, + "5180": 4.03531, + "5185": 4.05861, + "5190": 4.08092, + "5195": 4.04601, + "5200": 4.06577, + "5205": 4.09747, + "5210": 4.01055, + "5215": 4.02373, + "5220": 4.02621, + "5225": 4.02349, + "5230": 4.06271, + "5235": 4.03585, + "5240": 4.02422, + "5245": 4.04177, + "5250": 4.04544, + "5255": 4.03173, + "5260": 4.04798, + "5265": 4.01495, + "5270": 3.98673, + "5275": 4.00519, + "5280": 4.02024, + "5285": 4.04277, + "5290": 4.00304, + "5295": 4.00093, + "5300": 4.02323, + "5305": 4.01012, + "5310": 4.0478, + "5315": 3.99571, + "5320": 4.03864, + "5325": 4.06497, + "5330": 3.99981, + "5335": 4.02122, + "5340": 3.9739, + "5345": 4.01424, + "5350": 4.0246, + "5355": 4.01714, + "5360": 3.9668, + "5365": 3.98455, + "5370": 4.02892, + "5375": 3.99384, + "5380": 3.98952, + "5385": 4.00787, + "5390": 3.99585, + "5395": 3.932, + "5400": 4.02192, + "5405": 3.94401, + "5410": 4.03103, + "5415": 3.94954, + "5420": 3.98108, + "5425": 3.96619, + "5430": 3.97462, + "5435": 4.00917, + "5440": 3.96082, + "5445": 3.96843, + "5450": 3.98078, + "5455": 3.96312, + "5460": 3.97781, + "5465": 4.03343, + "5470": 3.99301, + "5475": 3.92634, + "5480": 4.0001, + "5485": 3.96789, + "5490": 3.99381, + "5495": 3.99755, + "5500": 3.95394, + "5505": 3.9702, + "5510": 4.00139, + "5515": 3.97886, + "5520": 3.95723, + "5525": 4.01089, + "5530": 3.95723, + "5535": 3.99058, + "5540": 3.95888, + "5545": 3.97704, + "5550": 3.97005, + "5555": 3.93134, + "5560": 3.94203, + "5565": 3.98688, + "5570": 3.94409, + "5575": 3.97691, + "5580": 3.95423, + "5585": 3.89232, + "5590": 3.96662, + "5595": 3.91996, + "5600": 3.97099, + "5605": 
3.87423, + "5610": 3.96509, + "5615": 3.9629, + "5620": 3.97882, + "5625": 3.95843, + "5630": 3.94884, + "5635": 3.92989, + "5640": 3.95308, + "5645": 3.91537, + "5650": 3.88759, + "5655": 3.91914, + "5660": 3.9101, + "5665": 3.92739, + "5670": 3.91107, + "5675": 3.94487, + "5680": 3.91238, + "5685": 3.92365, + "5690": 3.92517, + "5695": 3.953, + "5700": 3.88996, + "5705": 3.88995, + "5710": 3.87532, + "5715": 3.99623, + "5720": 3.94505, + "5725": 3.89527, + "5730": 3.94792, + "5735": 3.92817, + "5740": 3.92171, + "5745": 3.89897, + "5750": 3.92176, + "5755": 3.94672, + "5760": 3.92632, + "5765": 3.92024, + "5770": 3.95286, + "5775": 3.86965, + "5780": 3.91041, + "5785": 3.91605, + "5790": 3.9236, + "5795": 3.93068, + "5800": 3.86954, + "5805": 3.8764, + "5810": 3.92692, + "5815": 3.89083, + "5820": 3.84021, + "5825": 3.89285, + "5830": 3.85163, + "5835": 3.88292, + "5840": 3.89361, + "5845": 3.91293, + "5850": 3.90508, + "5855": 3.84956, + "5860": 3.87018, + "5865": 3.8979, + "5870": 3.85816, + "5875": 3.89604, + "5880": 3.88075, + "5885": 3.89965, + "5890": 3.90395, + "5895": 3.92339, + "5900": 3.85618, + "5905": 3.92033, + "5910": 3.88782, + "5915": 3.85158, + "5920": 3.88999, + "5925": 3.82174, + "5930": 3.88478, + "5935": 3.86887, + "5940": 3.89924, + "5945": 3.90324, + "5950": 3.88472, + "5955": 3.83758, + "5960": 3.91077, + "5965": 3.85295, + "5970": 3.90592, + "5975": 3.87131, + "5980": 3.94635, + "5985": 3.81828, + "5990": 3.91445, + "5995": 3.82666, + "6000": 3.86389, + "6005": 3.82737, + "6010": 3.84638, + "6015": 3.82528, + "6020": 3.84213, + "6025": 3.8812, + "6030": 3.82864, + "6035": 3.87549, + "6040": 3.85371, + "6045": 3.88892, + "6050": 3.86125, + "6055": 3.84398, + "6060": 3.86538, + "6065": 3.8955, + "6070": 3.844, + "6075": 3.79156, + "6080": 3.86497, + "6085": 3.82767, + "6090": 3.86054, + "6095": 3.85995, + "6100": 3.82399, + "6105": 3.87238, + "6110": 3.80525, + "6115": 3.87931, + "6120": 3.85374, + "6125": 3.85469, + "6130": 3.85122, + 
"6135": 3.82709, + "6140": 3.8225, + "6145": 3.81264, + "6150": 3.85853, + "6155": 3.83605, + "6160": 3.80232, + "6165": 3.82292, + "6170": 3.81513, + "6175": 3.80691, + "6180": 3.8071, + "6185": 3.84448, + "6190": 3.81178, + "6195": 3.78014, + "6200": 3.80543, + "6205": 3.81219, + "6210": 3.77002, + "6215": 3.82559, + "6220": 3.822, + "6225": 3.82598, + "6230": 3.76955, + "6235": 3.8072, + "6240": 3.73374, + "6245": 3.84624, + "6250": 3.80845, + "6255": 3.8223, + "6260": 3.7948, + "6265": 3.82819, + "6270": 3.75673, + "6275": 3.78492, + "6280": 3.80313, + "6285": 3.78154, + "6290": 3.79976, + "6295": 3.80168, + "6300": 3.80756, + "6305": 3.88253, + "6310": 3.7702, + "6315": 3.7633, + "6320": 3.81817, + "6325": 3.75526, + "6330": 3.82862, + "6335": 3.81943, + "6340": 3.76721, + "6345": 3.82391, + "6350": 3.76718, + "6355": 3.77414, + "6360": 3.75111, + "6365": 3.80986, + "6370": 3.81014, + "6375": 3.78548, + "6380": 3.8065, + "6385": 3.82336, + "6390": 3.78289, + "6395": 3.75935, + "6400": 3.76038, + "6405": 3.83749, + "6410": 3.83127, + "6415": 3.7623, + "6420": 3.82306, + "6425": 3.83219, + "6430": 3.81048, + "6435": 3.77764, + "6440": 3.76108, + "6445": 3.80173, + "6450": 3.73884, + "6455": 3.75156, + "6460": 3.77352, + "6465": 3.80905, + "6470": 3.78701, + "6475": 3.78176, + "6480": 3.81548, + "6485": 3.76414, + "6490": 3.71291, + "6495": 3.81407, + "6500": 3.79809, + "6505": 3.72741, + "6510": 3.7976, + "6515": 3.81938, + "6520": 3.73166, + "6525": 3.80464, + "6530": 3.76853, + "6535": 3.76159, + "6540": 3.82675, + "6545": 3.76261, + "6550": 3.76963, + "6555": 3.75505, + "6560": 3.71108, + "6565": 3.70887, + "6570": 3.7465, + "6575": 3.69338, + "6580": 3.81517, + "6585": 3.76239, + "6590": 3.72546, + "6595": 3.74461, + "6600": 3.73687, + "6605": 3.71668, + "6610": 3.72679, + "6615": 3.76079, + "6620": 3.70966, + "6625": 3.72313, + "6630": 3.72114, + "6635": 3.76232, + "6640": 3.73374, + "6645": 3.75061, + "6650": 3.77922, + "6655": 3.70627, + "6660": 3.73531, 
+ "6665": 3.7573, + "6670": 3.71979, + "6675": 3.74124, + "6680": 3.73477, + "6685": 3.76436, + "6690": 3.74256, + "6695": 3.75545, + "6700": 3.74559, + "6705": 3.72882, + "6710": 3.72913, + "6715": 3.69291, + "6720": 3.77736, + "6725": 3.75737, + "6730": 3.73993, + "6735": 3.74082, + "6740": 3.73806, + "6745": 3.72041, + "6750": 3.74412, + "6755": 3.69337, + "6760": 3.68122, + "6765": 3.74232, + "6770": 3.69625, + "6775": 3.74604, + "6780": 3.70485, + "6785": 3.70942, + "6790": 3.73683, + "6795": 3.69846, + "6800": 3.71752, + "6805": 3.72172, + "6810": 3.73628, + "6815": 3.65876, + "6820": 3.70229, + "6825": 3.72745, + "6830": 3.70872, + "6835": 3.68623, + "6840": 3.67517, + "6845": 3.74818, + "6850": 3.70405, + "6855": 3.73713, + "6860": 3.6695, + "6865": 3.73585, + "6870": 3.6953, + "6875": 3.69781, + "6880": 3.70324, + "6885": 3.67727, + "6890": 3.69236, + "6895": 3.67848, + "6900": 3.68133, + "6905": 3.68771, + "6910": 3.72919, + "6915": 3.73359, + "6920": 3.68934, + "6925": 3.69022, + "6930": 3.68858, + "6935": 3.62056, + "6940": 3.68927, + "6945": 3.67777, + "6950": 3.68038, + "6955": 3.6771, + "6960": 3.68108, + "6965": 3.72225, + "6970": 3.64603, + "6975": 3.72781, + "6980": 3.68459, + "6985": 3.68985, + "6990": 3.7316, + "6995": 3.70495, + "7000": 3.63993, + "7005": 3.71744, + "7010": 3.69223, + "7015": 3.67561, + "7020": 3.72152, + "7025": 3.70969, + "7030": 3.70236, + "7035": 3.65723, + "7040": 3.61488, + "7045": 3.69518, + "7050": 3.71947, + "7055": 3.64991, + "7060": 3.69149, + "7065": 3.74261, + "7070": 3.67108, + "7075": 3.67419, + "7080": 3.71683, + "7085": 3.64191, + "7090": 3.66318, + "7095": 3.63818, + "7100": 3.68341, + "7105": 3.62024, + "7110": 3.68873, + "7115": 3.63797, + "7120": 3.68741, + "7125": 3.63499, + "7130": 3.65311, + "7135": 3.66196, + "7140": 3.66504, + "7145": 3.68183, + "7150": 3.62677, + "7155": 3.69052, + "7160": 3.62415, + "7165": 3.64241, + "7170": 3.68231, + "7175": 3.64603, + "7180": 3.67571, + "7185": 3.70721, + "7190": 
3.663, + "7195": 3.66862, + "7200": 3.67265, + "7205": 3.65833, + "7210": 3.68834, + "7215": 3.67282, + "7220": 3.69117, + "7225": 3.66107, + "7230": 3.68593, + "7235": 3.64823, + "7240": 3.64663, + "7245": 3.66574, + "7250": 3.60447, + "7255": 3.62598, + "7260": 3.68023, + "7265": 3.60288, + "7270": 3.63936, + "7275": 3.64805, + "7280": 3.62623, + "7285": 3.65053, + "7290": 3.6735, + "7295": 3.66357, + "7300": 3.62393, + "7305": 3.62784, + "7310": 3.66312, + "7315": 3.67632, + "7320": 3.65015, + "7325": 3.65453, + "7330": 3.62344, + "7335": 3.62574, + "7340": 3.64422, + "7345": 3.60533, + "7350": 3.65727, + "7355": 3.64352, + "7360": 3.61779, + "7365": 3.63578, + "7370": 3.6188, + "7375": 3.59366, + "7380": 3.64743, + "7385": 3.67218, + "7390": 3.65876, + "7395": 3.60688, + "7400": 3.65695, + "7405": 3.64945, + "7410": 3.66151, + "7415": 3.64439, + "7420": 3.63591, + "7425": 3.6844, + "7430": 3.63181, + "7435": 3.61154, + "7440": 3.62564, + "7445": 3.60843, + "7450": 3.57301, + "7455": 3.64772, + "7460": 3.63452, + "7465": 3.63169, + "7470": 3.63744, + "7475": 3.64264, + "7480": 3.61171, + "7485": 3.57567, + "7490": 3.57599, + "7495": 3.5863, + "7500": 3.61565, + "7505": 3.59614, + "7510": 3.55707, + "7515": 3.61683, + "7520": 3.60991, + "7525": 3.56658, + "7530": 3.61196, + "7535": 3.62507, + "7540": 3.61046, + "7545": 3.64639, + "7550": 3.65882, + "7555": 3.58595, + "7560": 3.60212, + "7565": 3.59782, + "7570": 3.60603, + "7575": 3.57351, + "7580": 3.62111, + "7585": 3.60137, + "7590": 3.6026, + "7595": 3.66318, + "7600": 3.6076, + "7605": 3.59626, + "7610": 3.58483, + "7615": 3.58478, + "7620": 3.56787, + "7625": 3.62193, + "7630": 3.60469, + "7635": 3.5928, + "7640": 3.59019, + "7645": 3.62279, + "7650": 3.6259, + "7655": 3.66371, + "7660": 3.5305, + "7665": 3.60545, + "7670": 3.59796, + "7675": 3.58201, + "7680": 3.57701, + "7685": 3.64556, + "7690": 3.59102, + "7695": 3.57063, + "7700": 3.63352, + "7705": 3.58816, + "7710": 3.62048, + "7715": 3.5764, + 
"7720": 3.65561, + "7725": 3.55706, + "7730": 3.57614, + "7735": 3.61006, + "7740": 3.58168, + "7745": 3.58454, + "7750": 3.57422, + "7755": 3.59202, + "7760": 3.56089, + "7765": 3.58551, + "7770": 3.60104, + "7775": 3.57103, + "7780": 3.55457, + "7785": 3.57713, + "7790": 3.57042, + "7795": 3.58792, + "7800": 3.57997, + "7805": 3.58361, + "7810": 3.60683, + "7815": 3.57773, + "7820": 3.57578, + "7825": 3.61835, + "7830": 3.59192, + "7835": 3.52632, + "7840": 3.6194, + "7845": 3.55538, + "7850": 3.51354, + "7855": 3.56599, + "7860": 3.54645, + "7865": 3.60369, + "7870": 3.54114, + "7875": 3.55695, + "7880": 3.572, + "7885": 3.56229, + "7890": 3.60585, + "7895": 3.59334, + "7900": 3.60641, + "7905": 3.56339, + "7910": 3.58203, + "7915": 3.58298, + "7920": 3.59012, + "7925": 3.5681, + "7930": 3.59927, + "7935": 3.56169, + "7940": 3.60948, + "7945": 3.62723, + "7950": 3.53708, + "7955": 3.54481, + "7960": 3.53124, + "7965": 3.51862, + "7970": 3.52486, + "7975": 3.55975, + "7980": 3.56722, + "7985": 3.54114, + "7990": 3.54399, + "7995": 3.5186, + "8000": 3.57756, + "8005": 3.54643, + "8010": 3.53705, + "8015": 3.53445, + "8020": 3.53111, + "8025": 3.51514, + "8030": 3.54148, + "8035": 3.53478, + "8040": 3.52163, + "8045": 3.57586, + "8050": 3.57789, + "8055": 3.54866, + "8060": 3.5712, + "8065": 3.54757, + "8070": 3.53654, + "8075": 3.52629, + "8080": 3.57467, + "8085": 3.52928, + "8090": 3.53424, + "8095": 3.56313, + "8100": 3.51543, + "8105": 3.54752, + "8110": 3.5453, + "8115": 3.51645, + "8120": 3.52703, + "8125": 3.56437, + "8130": 3.52567, + "8135": 3.53994, + "8140": 3.52104, + "8145": 3.50389, + "8150": 3.52394, + "8155": 3.51178, + "8160": 3.56129, + "8165": 3.54328, + "8170": 3.5116, + "8175": 3.5057, + "8180": 3.57245, + "8185": 3.54733, + "8190": 3.58207, + "8195": 3.55001, + "8200": 3.52156, + "8205": 3.52888, + "8210": 3.53558, + "8215": 3.55713, + "8220": 3.5201, + "8225": 3.51201, + "8230": 3.53756, + "8235": 3.55814, + "8240": 3.54052, + "8245": 
3.53652, + "8250": 3.5692, + "8255": 3.51844, + "8260": 3.52912, + "8265": 3.52072, + "8270": 3.52843, + "8275": 3.51526, + "8280": 3.50321, + "8285": 3.52669, + "8290": 3.5272, + "8295": 3.49645, + "8300": 3.51721, + "8305": 3.53958, + "8310": 3.5351, + "8315": 3.50396, + "8320": 3.53046, + "8325": 3.47885, + "8330": 3.44388, + "8335": 3.51457, + "8340": 3.54076, + "8345": 3.49873, + "8350": 3.51134, + "8355": 3.54342, + "8360": 3.51607, + "8365": 3.53716, + "8370": 3.53127, + "8375": 3.48696, + "8380": 3.4848, + "8385": 3.52879, + "8390": 3.49474, + "8395": 3.52721, + "8400": 3.49636, + "8405": 3.51685, + "8410": 3.57651, + "8415": 3.48228, + "8420": 3.45216, + "8425": 3.53401, + "8430": 3.53787, + "8435": 3.47534, + "8440": 3.55163, + "8445": 3.53658, + "8450": 3.50995, + "8455": 3.52875, + "8460": 3.53463, + "8465": 3.4708, + "8470": 3.4929, + "8475": 3.55004, + "8480": 3.47555, + "8485": 3.49487, + "8490": 3.48489, + "8495": 3.48023, + "8500": 3.52888, + "8505": 3.46749, + "8510": 3.54064, + "8515": 3.48982, + "8520": 3.49184, + "8525": 3.42254, + "8530": 3.50181, + "8535": 3.52351, + "8540": 3.47484, + "8545": 3.49944, + "8550": 3.46881, + "8555": 3.53517, + "8560": 3.5346, + "8565": 3.48792, + "8570": 3.48883, + "8575": 3.46414, + "8580": 3.50837, + "8585": 3.52994, + "8590": 3.51956, + "8595": 3.52409, + "8600": 3.50319, + "8605": 3.49079, + "8610": 3.49584, + "8615": 3.49483, + "8620": 3.46525, + "8625": 3.4875, + "8630": 3.49269, + "8635": 3.47742, + "8640": 3.46288, + "8645": 3.52844, + "8650": 3.45936, + "8655": 3.50294, + "8660": 3.51093, + "8665": 3.48996, + "8670": 3.50547, + "8675": 3.47414, + "8680": 3.4685, + "8685": 3.48029, + "8690": 3.51264, + "8695": 3.51367, + "8700": 3.48324, + "8705": 3.45351, + "8710": 3.50031, + "8715": 3.45042, + "8720": 3.52876, + "8725": 3.48819, + "8730": 3.47981, + "8735": 3.51018, + "8740": 3.46013, + "8745": 3.50108, + "8750": 3.50543, + "8755": 3.46564, + "8760": 3.48373, + "8765": 3.43955, + "8770": 3.50951, + 
"8775": 3.47313, + "8780": 3.45782, + "8785": 3.47628, + "8790": 3.4608, + "8795": 3.49675, + "8800": 3.46402, + "8805": 3.43267, + "8810": 3.45044, + "8815": 3.47281, + "8820": 3.43586, + "8825": 3.46906, + "8830": 3.44494, + "8835": 3.42402, + "8840": 3.4361, + "8845": 3.45772, + "8850": 3.48143, + "8855": 3.46505, + "8860": 3.53187, + "8865": 3.46882, + "8870": 3.44869, + "8875": 3.45286, + "8880": 3.45584, + "8885": 3.44986, + "8890": 3.47298, + "8895": 3.45131, + "8900": 3.47879, + "8905": 3.46796, + "8910": 3.45421, + "8915": 3.44293, + "8920": 3.43345, + "8925": 3.50917, + "8930": 3.49052, + "8935": 3.50073, + "8940": 3.47584, + "8945": 3.47848, + "8950": 3.45717, + "8955": 3.44615, + "8960": 3.43965, + "8965": 3.45818, + "8970": 3.47179, + "8975": 3.42177, + "8980": 3.42266, + "8985": 3.44671, + "8990": 3.50075, + "8995": 3.47255, + "9000": 3.41954, + "9005": 3.46563, + "9010": 3.51573, + "9015": 3.4185, + "9020": 3.43896, + "9025": 3.44768, + "9030": 3.4718, + "9035": 3.37943, + "9040": 3.45501, + "9045": 3.45466, + "9050": 3.49179, + "9055": 3.40312, + "9060": 3.49477, + "9065": 3.51349, + "9070": 3.44713, + "9075": 3.47746, + "9080": 3.47127, + "9085": 3.47459, + "9090": 3.46668, + "9095": 3.42167, + "9100": 3.4227, + "9105": 3.41261, + "9110": 3.45663, + "9115": 3.46481, + "9120": 3.51949, + "9125": 3.44245, + "9130": 3.43654, + "9135": 3.46008, + "9140": 3.47929, + "9145": 3.42408, + "9150": 3.44307, + "9155": 3.45089, + "9160": 3.44998, + "9165": 3.45651, + "9170": 3.47508, + "9175": 3.41133, + "9180": 3.45323, + "9185": 3.41086, + "9190": 3.46875, + "9195": 3.43315, + "9200": 3.44758, + "9205": 3.42373, + "9210": 3.45572, + "9215": 3.39585, + "9220": 3.42327, + "9225": 3.44665, + "9230": 3.37357, + "9235": 3.39456, + "9240": 3.42282, + "9245": 3.40683, + "9250": 3.40791, + "9255": 3.42077, + "9260": 3.39755, + "9265": 3.44216, + "9270": 3.40754, + "9275": 3.42864, + "9280": 3.44334, + "9285": 3.44087, + "9290": 3.45563, + "9295": 3.44456, + "9300": 
3.39522, + "9305": 3.42638, + "9310": 3.41593, + "9315": 3.38278, + "9320": 3.3797, + "9325": 3.42046, + "9330": 3.47853, + "9335": 3.38962, + "9340": 3.4706, + "9345": 3.46224, + "9350": 3.42735, + "9355": 3.39326, + "9360": 3.4165, + "9365": 3.41212, + "9370": 3.46155, + "9375": 3.42622, + "9380": 3.36413, + "9385": 3.43469, + "9390": 3.44403, + "9395": 3.45465, + "9400": 3.41582, + "9405": 3.40031, + "9410": 3.43744, + "9415": 3.42574, + "9420": 3.40295, + "9425": 3.42063, + "9430": 3.3935, + "9435": 3.41529, + "9440": 3.40125, + "9445": 3.39961, + "9450": 3.39469, + "9455": 3.4008, + "9460": 3.46489, + "9465": 3.46303, + "9470": 3.40478, + "9475": 3.45335, + "9480": 3.40789, + "9485": 3.3998, + "9490": 3.41154, + "9495": 3.44387, + "9500": 3.40535, + "9505": 3.37735, + "9510": 3.41645, + "9515": 3.41113, + "9520": 3.43045, + "9525": 3.40102, + "9530": 3.40027, + "9535": 3.42216 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 9535, + "step_interval": 5, + "values": { + "1": 241.22832, + "5": 11.6467, + "10": 11.59177, + "15": 11.54982, + "20": 11.50554, + "25": 11.48401, + "30": 11.47019, + "35": 11.4638, + "40": 11.44621, + "45": 11.45505, + "50": 11.48551, + "55": 11.47505, + "60": 11.46559, + "65": 11.69276, + "70": 11.51491, + "75": 11.58841, + "80": 11.59402, + "85": 11.55505, + "90": 11.57827, + "95": 11.6084, + "100": 11.72328, + "105": 11.84735, + "110": 11.81445, + "115": 12.01469, + "120": 12.27052, + "125": 12.40894, + "130": 12.32306, + "135": 12.6537, + "140": 12.87941, + "145": 12.87274, + "150": 13.17646, + "155": 13.42132, + "160": 13.29203, + "165": 13.33468, + "170": 13.38365, + "175": 13.29143, + "180": 13.37704, + "185": 13.17491, + "190": 13.2207, + "195": 13.0407, + "200": 13.03378, + "205": 12.93499, + "210": 12.93302, + "215": 12.83429, + "220": 12.77504, + "225": 12.71437, + "230": 12.67462, + "235": 12.7241, + "240": 12.78341, + "245": 12.61372, + "250": 12.60968, + "255": 12.49502, + "260": 12.38655, + "265": 12.35372, 
+ "270": 12.32939, + "275": 12.25213, + "280": 12.23412, + "285": 12.25047, + "290": 12.1386, + "295": 12.11066, + "300": 12.11487, + "305": 12.08746, + "310": 12.06842, + "315": 12.13334, + "320": 12.12044, + "325": 12.01351, + "330": 11.97276, + "335": 11.951, + "340": 11.97582, + "345": 11.94178, + "350": 11.90942, + "355": 11.9474, + "360": 11.94231, + "365": 11.91539, + "370": 11.89051, + "375": 11.87871, + "380": 11.8539, + "385": 11.81422, + "390": 11.82072, + "395": 11.85516, + "400": 11.8322, + "405": 11.81286, + "410": 11.81008, + "415": 11.76854, + "420": 11.7721, + "425": 11.7287, + "430": 11.80281, + "435": 11.76948, + "440": 11.78237, + "445": 11.81223, + "450": 11.76024, + "455": 11.83905, + "460": 11.86797, + "465": 11.88193, + "470": 11.94544, + "475": 12.03403, + "480": 11.8718, + "485": 11.96463, + "490": 11.9543, + "495": 11.99738, + "500": 12.06608, + "505": 12.04813, + "510": 12.09706, + "515": 12.14335, + "520": 12.36581, + "525": 12.19115, + "530": 12.1887, + "535": 12.25354, + "540": 12.27902, + "545": 12.32347, + "550": 12.44366, + "555": 12.25807, + "560": 12.22369, + "565": 12.28956, + "570": 12.31572, + "575": 12.28835, + "580": 12.33571, + "585": 12.26567, + "590": 12.30079, + "595": 12.29151, + "600": 12.30023, + "605": 12.45501, + "610": 12.27373, + "615": 12.217, + "620": 12.22334, + "625": 12.21274, + "630": 12.21904, + "635": 12.20277, + "640": 12.25538, + "645": 12.19988, + "650": 12.14026, + "655": 12.14302, + "660": 12.14678, + "665": 12.13972, + "670": 12.11485, + "675": 12.0282, + "680": 12.01901, + "685": 11.98462, + "690": 11.98742, + "695": 11.95917, + "700": 11.92521, + "705": 18.38779, + "710": 11.92438, + "715": 11.8274, + "720": 11.90138, + "725": 11.84998, + "730": 11.83009, + "735": 11.89248, + "740": 11.82364, + "745": 11.91839, + "750": 11.9577, + "755": 11.85056, + "760": 11.90523, + "765": 11.9116, + "770": 11.83717, + "775": 12.05864, + "780": 11.84895, + "785": 11.84375, + "790": 11.86493, + "795": 11.85763, + 
"800": 11.94365, + "805": 11.86899, + "810": 11.86748, + "815": 11.86393, + "820": 11.87992, + "825": 11.85259, + "830": 11.86886, + "835": 11.8517, + "840": 11.86254, + "845": 11.89508, + "850": 11.85613, + "855": 11.87434, + "860": 11.90703, + "865": 11.83224, + "870": 11.88246, + "875": 11.9305, + "880": 11.96022, + "885": 11.81651, + "890": 12.06642, + "895": 11.92653, + "900": 11.86469, + "905": 12.01767, + "910": 11.89635, + "915": 11.8254, + "920": 11.86106, + "925": 11.88434, + "930": 11.97059, + "935": 12.03718, + "940": 11.87698, + "945": 11.88008, + "950": 12.02071, + "955": 11.84843, + "960": 244.37245, + "965": 12.32084, + "970": 11.86341, + "975": 12.01988, + "980": 11.92166, + "985": 11.85411, + "990": 11.87753, + "995": 11.84786, + "1000": 11.89892, + "1005": 11.99759, + "1010": 11.91045, + "1015": 11.87038, + "1020": 11.85674, + "1025": 11.85567, + "1030": 11.86674, + "1035": 11.92499, + "1040": 11.85969, + "1045": 12.04929, + "1050": 11.82341, + "1055": 11.83111, + "1060": 11.87567, + "1065": 11.84584, + "1070": 11.93603, + "1075": 11.87121, + "1080": 11.85935, + "1085": 11.88667, + "1090": 11.86058, + "1095": 11.86482, + "1100": 11.82375, + "1105": 11.86482, + "1110": 11.89668, + "1115": 11.94941, + "1120": 11.84941, + "1125": 11.94466, + "1130": 11.90846, + "1135": 11.8602, + "1140": 11.86926, + "1145": 11.90365, + "1150": 11.88788, + "1155": 11.81781, + "1160": 11.88464, + "1165": 11.85344, + "1170": 11.8865, + "1175": 11.93361, + "1180": 11.89647, + "1185": 11.9031, + "1190": 11.89287, + "1195": 11.88683, + "1200": 11.85927, + "1205": 11.92471, + "1210": 11.85592, + "1215": 17.4276, + "1220": 11.87359, + "1225": 11.9296, + "1230": 11.95025, + "1235": 11.90738, + "1240": 11.86621, + "1245": 11.98001, + "1250": 12.003, + "1255": 11.91396, + "1260": 11.92279, + "1265": 11.85195, + "1270": 11.87463, + "1275": 11.90307, + "1280": 11.84637, + "1285": 11.95883, + "1290": 11.88039, + "1295": 11.8399, + "1300": 11.81976, + "1305": 11.89766, + "1310": 
11.91584, + "1315": 12.12571, + "1320": 12.05556, + "1325": 11.84679, + "1330": 11.94985, + "1335": 11.94039, + "1340": 12.00572, + "1345": 11.98268, + "1350": 12.15927, + "1355": 12.04312, + "1360": 11.98816, + "1365": 11.95737, + "1370": 11.92395, + "1375": 11.89595, + "1380": 11.88635, + "1385": 11.96617, + "1390": 11.87421, + "1395": 12.02833, + "1400": 11.87415, + "1405": 11.85875, + "1410": 11.85419, + "1415": 11.8978, + "1420": 11.86309, + "1425": 11.87505, + "1430": 12.10339, + "1435": 11.88151, + "1440": 12.15068, + "1445": 11.98493, + "1450": 11.95438, + "1455": 12.03808, + "1460": 11.85293, + "1465": 11.93176, + "1470": 11.92246, + "1475": 11.90448, + "1480": 11.98959, + "1485": 11.93685, + "1490": 11.92389, + "1495": 11.95047, + "1500": 11.94526, + "1505": 11.9086, + "1510": 11.95225, + "1515": 11.87405, + "1520": 11.87975, + "1525": 11.88264, + "1530": 12.04989, + "1535": 12.02942, + "1540": 11.93089, + "1545": 11.89376, + "1550": 11.88596, + "1555": 11.95001, + "1560": 11.90239, + "1565": 11.89699, + "1570": 11.91441, + "1575": 11.87813, + "1580": 11.86939, + "1585": 11.8566, + "1590": 11.8665, + "1595": 11.90861, + "1600": 11.90425, + "1605": 11.82248, + "1610": 11.86531, + "1615": 11.8796, + "1620": 11.87587, + "1625": 11.88944, + "1630": 11.88839, + "1635": 11.8307, + "1640": 11.87082, + "1645": 11.84687, + "1650": 11.87887, + "1655": 11.85709, + "1660": 11.85167, + "1665": 11.90284, + "1670": 11.85205, + "1675": 12.00742, + "1680": 11.90754, + "1685": 11.97458, + "1690": 11.97016, + "1695": 11.9189, + "1700": 11.89709, + "1705": 11.88042, + "1710": 11.87879, + "1715": 12.06779, + "1720": 11.98631, + "1725": 12.01044, + "1730": 11.9924, + "1735": 11.87648, + "1740": 11.87455, + "1745": 11.93461, + "1750": 11.90235, + "1755": 11.97053, + "1760": 11.89545, + "1765": 11.8564, + "1770": 11.92635, + "1775": 11.91815, + "1780": 11.91235, + "1785": 11.85546, + "1790": 11.93087, + "1795": 11.91138, + "1800": 11.95901, + "1805": 12.0529, + "1810": 11.98858, 
+ "1815": 12.13997, + "1820": 11.94798, + "1825": 11.97682, + "1830": 11.91244, + "1835": 11.94888, + "1840": 11.93666, + "1845": 11.87312, + "1850": 11.86327, + "1855": 11.94769, + "1860": 12.00187, + "1865": 12.06916, + "1870": 11.99528, + "1875": 11.89416, + "1880": 12.02292, + "1885": 12.04249, + "1890": 11.94094, + "1895": 11.93619, + "1900": 11.95301, + "1905": 11.85793, + "1910": 11.96264, + "1915": 11.92826, + "1920": 11.94216, + "1925": 12.01307, + "1930": 11.98891, + "1935": 11.95834, + "1940": 11.92143, + "1945": 11.98459, + "1950": 16.97099, + "1955": 11.89147, + "1960": 11.94643, + "1965": 11.92486, + "1970": 11.91542, + "1975": 13.09741, + "1980": 12.02148, + "1985": 11.92812, + "1990": 12.01102, + "1995": 11.94891, + "2000": 12.06741, + "2005": 11.94166, + "2010": 11.95871, + "2015": 12.00042, + "2020": 11.99101, + "2025": 11.95463, + "2030": 12.36755, + "2035": 11.96199, + "2040": 11.97863, + "2045": 12.01033, + "2050": 12.0643, + "2055": 11.96928, + "2060": 11.98383, + "2065": 11.92648, + "2070": 11.92379, + "2075": 11.97669, + "2080": 11.95508, + "2085": 11.94472, + "2090": 11.9663, + "2095": 11.93695, + "2100": 11.97178, + "2105": 11.98764, + "2110": 11.9516, + "2115": 11.9215, + "2120": 11.95207, + "2125": 11.95947, + "2130": 11.96722, + "2135": 11.97924, + "2140": 11.88777, + "2145": 11.95546, + "2150": 11.90266, + "2155": 11.97573, + "2160": 11.93275, + "2165": 11.98593, + "2170": 11.9842, + "2175": 12.00145, + "2180": 11.99219, + "2185": 11.96424, + "2190": 11.94313, + "2195": 11.93489, + "2200": 11.94356, + "2205": 12.00157, + "2210": 11.97153, + "2215": 11.9563, + "2220": 12.14117, + "2225": 11.97066, + "2230": 12.00037, + "2235": 11.95279, + "2240": 11.9544, + "2245": 11.97031, + "2250": 11.92229, + "2255": 11.98097, + "2260": 11.96529, + "2265": 11.98619, + "2270": 12.02117, + "2275": 11.94865, + "2280": 12.02569, + "2285": 11.98203, + "2290": 12.10479, + "2295": 11.95346, + "2300": 11.99961, + "2305": 11.96025, + "2310": 11.98746, + 
"2315": 11.95209, + "2320": 12.02644, + "2325": 11.95369, + "2330": 11.91985, + "2335": 11.93244, + "2340": 11.97061, + "2345": 11.90115, + "2350": 11.99136, + "2355": 12.0541, + "2360": 12.03728, + "2365": 11.95319, + "2370": 11.8917, + "2375": 11.94629, + "2380": 11.9087, + "2385": 11.91696, + "2390": 11.90123, + "2395": 11.87998, + "2400": 12.02954, + "2405": 11.97917, + "2410": 11.98456, + "2415": 11.9575, + "2420": 11.95917, + "2425": 11.95788, + "2430": 11.99944, + "2435": 12.00043, + "2440": 11.91339, + "2445": 11.97889, + "2450": 11.93997, + "2455": 11.91834, + "2460": 11.98321, + "2465": 11.94509, + "2470": 11.93387, + "2475": 11.9562, + "2480": 11.93148, + "2485": 11.94432, + "2490": 11.95477, + "2495": 11.94334, + "2500": 11.9284, + "2505": 11.93757, + "2510": 11.92289, + "2515": 11.97869, + "2520": 11.94858, + "2525": 11.96606, + "2530": 11.90894, + "2535": 11.95425, + "2540": 11.89136, + "2545": 11.94553, + "2550": 11.98026, + "2555": 11.93376, + "2560": 11.94866, + "2565": 11.92767, + "2570": 11.93583, + "2575": 11.97284, + "2580": 11.98911, + "2585": 11.95484, + "2590": 11.96399, + "2595": 11.96211, + "2600": 11.93906, + "2605": 11.9733, + "2610": 12.01872, + "2615": 11.99897, + "2620": 11.90926, + "2625": 11.93248, + "2630": 11.92842, + "2635": 11.94338, + "2640": 11.94678, + "2645": 11.95901, + "2650": 11.9296, + "2655": 12.02405, + "2660": 12.0166, + "2665": 12.01166, + "2670": 11.90595, + "2675": 11.98569, + "2680": 12.0118, + "2685": 11.92029, + "2690": 11.93111, + "2695": 12.00369, + "2700": 11.94818, + "2705": 11.99119, + "2710": 11.93978, + "2715": 11.9296, + "2720": 11.93044, + "2725": 11.94343, + "2730": 12.02248, + "2735": 11.95389, + "2740": 11.94611, + "2745": 11.92776, + "2750": 11.91647, + "2755": 11.9522, + "2760": 11.95012, + "2765": 11.96707, + "2770": 11.94892, + "2775": 11.9867, + "2780": 11.96897, + "2785": 11.97268, + "2790": 12.01936, + "2795": 11.97259, + "2800": 12.01028, + "2805": 11.94892, + "2810": 12.04828, + "2815": 
11.93469, + "2820": 11.94568, + "2825": 11.92529, + "2830": 11.97458, + "2835": 11.99475, + "2840": 11.94984, + "2845": 11.93356, + "2850": 12.05796, + "2855": 11.99065, + "2860": 11.96077, + "2865": 11.9377, + "2870": 11.97627, + "2875": 11.97986, + "2880": 11.97201, + "2885": 11.91879, + "2890": 11.93586, + "2895": 12.00661, + "2900": 11.94616, + "2905": 11.94376, + "2910": 11.94168, + "2915": 11.94867, + "2920": 11.99355, + "2925": 11.94779, + "2930": 11.97133, + "2935": 11.96256, + "2940": 11.97787, + "2945": 11.93759, + "2950": 11.91863, + "2955": 11.98973, + "2960": 12.00486, + "2965": 11.91623, + "2970": 11.94846, + "2975": 11.91534, + "2980": 11.97787, + "2985": 12.385, + "2990": 11.88498, + "2995": 11.92173, + "3000": 11.90561, + "3005": 11.86795, + "3010": 11.88075, + "3015": 11.87833, + "3020": 11.98777, + "3025": 11.90078, + "3030": 11.98251, + "3035": 11.92211, + "3040": 11.91067, + "3045": 12.04371, + "3050": 11.91886, + "3055": 11.952, + "3060": 11.90649, + "3065": 11.86917, + "3070": 11.86601, + "3075": 11.92435, + "3080": 11.98092, + "3085": 11.94809, + "3090": 12.20304, + "3095": 11.87329, + "3100": 11.92696, + "3105": 11.85799, + "3110": 11.84125, + "3115": 11.82558, + "3120": 11.87566, + "3125": 11.89426, + "3130": 11.85869, + "3135": 11.92893, + "3140": 11.97022, + "3145": 11.84939, + "3150": 11.9785, + "3155": 11.92499, + "3160": 11.8889, + "3165": 11.87938, + "3170": 11.95555, + "3175": 11.91883, + "3180": 11.85842, + "3185": 11.9325, + "3190": 11.86061, + "3195": 11.90479, + "3200": 11.85963, + "3205": 11.91214, + "3210": 11.9243, + "3215": 11.8472, + "3220": 11.86665, + "3225": 11.89836, + "3230": 11.86299, + "3235": 11.89396, + "3240": 11.87482, + "3245": 11.86774, + "3250": 11.86673, + "3255": 11.88133, + "3260": 11.9014, + "3265": 11.92289, + "3270": 11.98401, + "3275": 11.95198, + "3280": 11.87392, + "3285": 11.89268, + "3290": 11.88963, + "3295": 11.91043, + "3300": 11.89803, + "3305": 11.87011, + "3310": 11.84465, + "3315": 11.84015, 
+ "3320": 11.88334, + "3325": 11.93368, + "3330": 11.83472, + "3335": 11.86862, + "3340": 11.87575, + "3345": 11.94875, + "3350": 11.93528, + "3355": 11.81967, + "3360": 11.95954, + "3365": 11.88024, + "3370": 11.88333, + "3375": 11.85751, + "3380": 11.88742, + "3385": 11.9179, + "3390": 11.83242, + "3395": 11.96084, + "3400": 11.88213, + "3405": 11.86112, + "3410": 11.8407, + "3415": 11.92255, + "3420": 11.91997, + "3425": 11.88372, + "3430": 11.8672, + "3435": 11.85235, + "3440": 11.84935, + "3445": 11.93228, + "3450": 11.85166, + "3455": 11.9026, + "3460": 11.99596, + "3465": 11.88838, + "3470": 11.90065, + "3475": 11.92033, + "3480": 11.87265, + "3485": 11.89235, + "3490": 11.89267, + "3495": 11.97544, + "3500": 11.92819, + "3505": 11.82459, + "3510": 11.90756, + "3515": 11.92021, + "3520": 11.88124, + "3525": 11.86983, + "3530": 11.90548, + "3535": 11.94666, + "3540": 11.93322, + "3545": 11.90904, + "3550": 11.85224, + "3555": 11.886, + "3560": 11.93583, + "3565": 11.87294, + "3570": 11.86107, + "3575": 11.83618, + "3580": 11.94649, + "3585": 11.8886, + "3590": 12.01796, + "3595": 11.86065, + "3600": 11.96008, + "3605": 11.94154, + "3610": 11.91928, + "3615": 11.88551, + "3620": 11.8865, + "3625": 11.86807, + "3630": 11.98152, + "3635": 11.87685, + "3640": 11.89995, + "3645": 11.86485, + "3650": 11.94291, + "3655": 11.86472, + "3660": 11.84946, + "3665": 11.90789, + "3670": 11.86396, + "3675": 12.07226, + "3680": 11.8654, + "3685": 11.90154, + "3690": 11.87282, + "3695": 11.84993, + "3700": 11.92847, + "3705": 11.85848, + "3710": 11.86691, + "3715": 11.93176, + "3720": 11.86996, + "3725": 11.92665, + "3730": 11.90876, + "3735": 11.83597, + "3740": 11.8819, + "3745": 11.90119, + "3750": 11.90765, + "3755": 11.89791, + "3760": 11.91124, + "3765": 11.95606, + "3770": 11.93789, + "3775": 11.87152, + "3780": 11.89754, + "3785": 11.8704, + "3790": 11.88079, + "3795": 11.89363, + "3800": 11.88641, + "3805": 11.87724, + "3810": 11.86303, + "3815": 11.96793, + "3820": 
11.97071, + "3825": 11.90678, + "3830": 11.84478, + "3835": 11.86339, + "3840": 11.84359, + "3845": 11.85381, + "3850": 11.89843, + "3855": 11.83659, + "3860": 11.8253, + "3865": 11.82796, + "3870": 11.93815, + "3875": 11.87584, + "3880": 11.85716, + "3885": 11.85848, + "3890": 11.84472, + "3895": 11.85001, + "3900": 11.90416, + "3905": 11.87723, + "3910": 11.90409, + "3915": 11.88375, + "3920": 11.9526, + "3925": 11.8796, + "3930": 11.92607, + "3935": 12.02111, + "3940": 11.89989, + "3945": 11.96829, + "3950": 11.92362, + "3955": 11.91298, + "3960": 11.93391, + "3965": 11.9977, + "3970": 11.91134, + "3975": 11.87698, + "3980": 11.84039, + "3985": 11.8296, + "3990": 11.8824, + "3995": 12.03103, + "4000": 12.53061, + "4005": 11.99032, + "4010": 11.94569, + "4015": 12.02459, + "4020": 12.05098, + "4025": 11.9408, + "4030": 11.9872, + "4035": 11.91882, + "4040": 11.91053, + "4045": 11.94764, + "4050": 11.96252, + "4055": 11.92924, + "4060": 11.95584, + "4065": 11.96477, + "4070": 11.95333, + "4075": 11.95009, + "4080": 11.94196, + "4085": 11.96679, + "4090": 12.09863, + "4095": 12.09521, + "4100": 11.99854, + "4105": 12.05345, + "4110": 11.99127, + "4115": 12.05731, + "4120": 11.95072, + "4125": 12.09249, + "4130": 12.04972, + "4135": 11.892, + "4140": 11.93048, + "4145": 11.92862, + "4150": 12.00088, + "4155": 11.95542, + "4160": 12.01499, + "4165": 11.90691, + "4170": 11.99204, + "4175": 12.02661, + "4180": 12.08762, + "4185": 11.93626, + "4190": 11.96513, + "4195": 11.9247, + "4200": 11.89449, + "4205": 11.95353, + "4210": 11.90984, + "4215": 11.92857, + "4220": 11.99809, + "4225": 12.01358, + "4230": 12.00065, + "4235": 11.95146, + "4240": 12.12674, + "4245": 11.99718, + "4250": 11.98808, + "4255": 11.95388, + "4260": 11.91437, + "4265": 11.97358, + "4270": 11.99013, + "4275": 11.95746, + "4280": 11.9273, + "4285": 11.92873, + "4290": 11.94103, + "4295": 11.93054, + "4300": 11.92986, + "4305": 12.11627, + "4310": 11.95471, + "4315": 11.96985, + "4320": 12.03911, + 
"4325": 12.01041, + "4330": 11.93084, + "4335": 11.95171, + "4340": 12.03209, + "4345": 11.94503, + "4350": 11.95426, + "4355": 12.08714, + "4360": 12.18212, + "4365": 11.94575, + "4370": 11.96598, + "4375": 12.00939, + "4380": 12.08808, + "4385": 11.9772, + "4390": 12.02704, + "4395": 12.01062, + "4400": 11.94619, + "4405": 11.98609, + "4410": 11.98025, + "4415": 11.99156, + "4420": 11.96913, + "4425": 12.02991, + "4430": 11.98417, + "4435": 12.07654, + "4440": 12.09429, + "4445": 11.9962, + "4450": 11.91032, + "4455": 11.99724, + "4460": 11.94549, + "4465": 11.92313, + "4470": 11.98709, + "4475": 11.9946, + "4480": 12.041, + "4485": 11.98684, + "4490": 12.00793, + "4495": 11.96519, + "4500": 11.91768, + "4505": 11.93855, + "4510": 11.96344, + "4515": 11.93266, + "4520": 11.99772, + "4525": 12.00265, + "4530": 12.00144, + "4535": 11.93099, + "4540": 11.9976, + "4545": 12.04415, + "4550": 11.92104, + "4555": 11.97762, + "4560": 12.05513, + "4565": 12.08413, + "4570": 12.00561, + "4575": 12.03402, + "4580": 12.07435, + "4585": 11.91157, + "4590": 11.93266, + "4595": 12.00575, + "4600": 11.98764, + "4605": 12.07608, + "4610": 11.98608, + "4615": 12.23058, + "4620": 11.96992, + "4625": 11.98931, + "4630": 11.92725, + "4635": 11.94909, + "4640": 11.94336, + "4645": 11.95955, + "4650": 11.99978, + "4655": 11.95199, + "4660": 11.97643, + "4665": 12.03686, + "4670": 12.0499, + "4675": 11.98439, + "4680": 12.00394, + "4685": 11.97515, + "4690": 11.95102, + "4695": 12.07552, + "4700": 11.9222, + "4705": 11.97387, + "4710": 11.99203, + "4715": 11.93004, + "4720": 11.97237, + "4725": 12.00277, + "4730": 12.00835, + "4735": 11.97435, + "4740": 11.98233, + "4745": 11.92423, + "4750": 11.95154, + "4755": 12.02084, + "4760": 11.94378, + "4765": 11.95313, + "4770": 11.92338, + "4775": 11.92352, + "4780": 12.00277, + "4785": 11.94768, + "4790": 11.97296, + "4795": 11.98757, + "4800": 12.26361, + "4805": 11.90736, + "4810": 11.9844, + "4815": 12.04212, + "4820": 11.98762, + "4825": 
12.89959, + "4830": 11.9442, + "4835": 12.35106, + "4840": 11.93828, + "4845": 11.92418, + "4850": 11.96443, + "4855": 12.03431, + "4860": 12.04422, + "4865": 11.9646, + "4870": 11.91857, + "4875": 11.95672, + "4880": 11.9198, + "4885": 11.96783, + "4890": 11.94953, + "4895": 11.96692, + "4900": 12.04475, + "4905": 12.05877, + "4910": 12.15039, + "4915": 12.15039, + "4920": 11.95008, + "4925": 11.96843, + "4930": 11.958, + "4935": 11.98531, + "4940": 11.90874, + "4945": 11.95752, + "4950": 12.01284, + "4955": 11.97799, + "4960": 11.99989, + "4965": 11.9277, + "4970": 12.06095, + "4975": 11.95713, + "4980": 12.02719, + "4985": 11.96446, + "4990": 11.92043, + "4995": 11.99522, + "5000": 12.0792, + "5005": 11.95462, + "5010": 18.30939, + "5015": 12.57034, + "5020": 12.13652, + "5025": 11.95064, + "5030": 11.93538, + "5035": 12.01779, + "5040": 11.8639, + "5045": 11.89312, + "5050": 11.93054, + "5055": 11.89904, + "5060": 11.88635, + "5065": 11.89505, + "5070": 11.95957, + "5075": 11.96591, + "5080": 11.85594, + "5085": 11.87343, + "5090": 11.89162, + "5095": 11.9231, + "5100": 11.9213, + "5105": 11.9793, + "5110": 11.92942, + "5115": 11.87025, + "5120": 11.84167, + "5125": 11.92967, + "5130": 11.90523, + "5135": 11.8727, + "5140": 11.95822, + "5145": 11.97795, + "5150": 11.90614, + "5155": 11.88276, + "5160": 11.94188, + "5165": 11.91373, + "5170": 12.01192, + "5175": 11.85511, + "5180": 11.84375, + "5185": 11.88965, + "5190": 11.88542, + "5195": 11.85346, + "5200": 11.94188, + "5205": 11.92082, + "5210": 11.8821, + "5215": 11.92239, + "5220": 11.90608, + "5225": 11.8947, + "5230": 11.88619, + "5235": 11.8948, + "5240": 11.89599, + "5245": 11.88662, + "5250": 11.95415, + "5255": 11.96527, + "5260": 11.89009, + "5265": 11.87997, + "5270": 11.94016, + "5275": 11.89138, + "5280": 11.90447, + "5285": 11.86453, + "5290": 11.90845, + "5295": 11.89373, + "5300": 11.96084, + "5305": 12.00505, + "5310": 11.87874, + "5315": 11.94047, + "5320": 11.90115, + "5325": 11.8657, + 
"5330": 11.98456, + "5335": 11.89142, + "5340": 11.94056, + "5345": 11.88326, + "5350": 12.02941, + "5355": 11.94937, + "5360": 11.84158, + "5365": 11.85236, + "5370": 11.89414, + "5375": 11.92681, + "5380": 11.89983, + "5385": 11.93247, + "5390": 11.88545, + "5395": 11.85963, + "5400": 11.87187, + "5405": 11.92558, + "5410": 11.94364, + "5415": 11.9087, + "5420": 11.86332, + "5425": 11.92767, + "5430": 11.87425, + "5435": 11.91049, + "5440": 11.87699, + "5445": 11.93171, + "5450": 11.90161, + "5455": 11.921, + "5460": 11.88038, + "5465": 11.91315, + "5470": 11.89728, + "5475": 11.95689, + "5480": 11.98965, + "5485": 11.91576, + "5490": 11.89757, + "5495": 11.93064, + "5500": 11.88252, + "5505": 11.96073, + "5510": 11.86654, + "5515": 11.87886, + "5520": 11.90936, + "5525": 12.03373, + "5530": 11.90318, + "5535": 11.92154, + "5540": 11.90086, + "5545": 11.89022, + "5550": 11.90225, + "5555": 11.83513, + "5560": 11.91062, + "5565": 11.87125, + "5570": 11.87145, + "5575": 11.86357, + "5580": 11.91841, + "5585": 11.92436, + "5590": 11.9023, + "5595": 11.86709, + "5600": 11.91375, + "5605": 11.90872, + "5610": 11.8916, + "5615": 11.95578, + "5620": 11.89294, + "5625": 11.90784, + "5630": 11.92391, + "5635": 11.89956, + "5640": 11.89869, + "5645": 11.91776, + "5650": 11.9431, + "5655": 11.89517, + "5660": 11.88968, + "5665": 11.89529, + "5670": 11.91051, + "5675": 11.91888, + "5680": 11.90991, + "5685": 11.93985, + "5690": 11.90708, + "5695": 11.8876, + "5700": 11.95923, + "5705": 11.93355, + "5710": 11.87364, + "5715": 11.9268, + "5720": 11.98226, + "5725": 11.87678, + "5730": 11.83368, + "5735": 11.89468, + "5740": 11.90674, + "5745": 11.88476, + "5750": 11.86646, + "5755": 11.88929, + "5760": 11.85649, + "5765": 11.85565, + "5770": 11.93646, + "5775": 11.90704, + "5780": 12.04897, + "5785": 11.91885, + "5790": 11.90414, + "5795": 11.92795, + "5800": 11.9484, + "5805": 11.9947, + "5810": 11.88562, + "5815": 11.89893, + "5820": 11.86069, + "5825": 11.85602, + "5830": 
11.90577, + "5835": 11.90369, + "5840": 11.95291, + "5845": 11.93547, + "5850": 11.89776, + "5855": 11.89365, + "5860": 11.88809, + "5865": 11.89502, + "5870": 11.90093, + "5875": 11.89463, + "5880": 11.85877, + "5885": 11.91775, + "5890": 11.9362, + "5895": 11.90238, + "5900": 11.89416, + "5905": 11.9161, + "5910": 11.91617, + "5915": 11.89704, + "5920": 11.86193, + "5925": 11.94942, + "5930": 11.85147, + "5935": 11.87033, + "5940": 11.9311, + "5945": 11.96348, + "5950": 11.96932, + "5955": 11.90137, + "5960": 11.87563, + "5965": 11.86128, + "5970": 11.99512, + "5975": 11.92846, + "5980": 11.83738, + "5985": 11.88075, + "5990": 11.89265, + "5995": 11.92537, + "6000": 11.88009, + "6005": 11.9523, + "6010": 11.93509, + "6015": 11.89766, + "6020": 11.88045, + "6025": 11.87641, + "6030": 246.60413, + "6035": 12.33879, + "6040": 11.91607, + "6045": 11.95709, + "6050": 11.93381, + "6055": 11.91355, + "6060": 11.91286, + "6065": 11.97819, + "6070": 11.93373, + "6075": 11.85049, + "6080": 11.96747, + "6085": 11.93318, + "6090": 11.93239, + "6095": 11.8622, + "6100": 11.88525, + "6105": 11.97899, + "6110": 11.91577, + "6115": 11.92755, + "6120": 11.92296, + "6125": 11.99725, + "6130": 11.97753, + "6135": 11.92108, + "6140": 11.91607, + "6145": 11.9071, + "6150": 11.92499, + "6155": 11.91611, + "6160": 12.01604, + "6165": 11.89838, + "6170": 11.90254, + "6175": 11.96493, + "6180": 11.84452, + "6185": 11.91052, + "6190": 11.8712, + "6195": 11.90582, + "6200": 11.90605, + "6205": 11.98397, + "6210": 11.92035, + "6215": 11.96579, + "6220": 11.99275, + "6225": 11.88749, + "6230": 11.89369, + "6235": 11.95748, + "6240": 11.93057, + "6245": 11.94912, + "6250": 11.9372, + "6255": 11.90439, + "6260": 11.92527, + "6265": 11.95201, + "6270": 11.9095, + "6275": 11.97821, + "6280": 11.94458, + "6285": 11.90287, + "6290": 11.89278, + "6295": 11.96073, + "6300": 11.90554, + "6305": 11.88653, + "6310": 11.8962, + "6315": 11.93036, + "6320": 11.95396, + "6325": 11.94894, + "6330": 
12.04569, + "6335": 11.88055, + "6340": 11.91066, + "6345": 11.89024, + "6350": 11.89994, + "6355": 11.92221, + "6360": 11.92333, + "6365": 11.91761, + "6370": 11.97313, + "6375": 11.90689, + "6380": 12.08922, + "6385": 11.94942, + "6390": 11.91702, + "6395": 11.90139, + "6400": 11.89012, + "6405": 11.9541, + "6410": 12.00044, + "6415": 11.89967, + "6420": 11.86695, + "6425": 11.87294, + "6430": 11.89524, + "6435": 11.94881, + "6440": 11.91361, + "6445": 11.91243, + "6450": 11.90246, + "6455": 11.88301, + "6460": 11.94133, + "6465": 11.95353, + "6470": 11.93545, + "6475": 11.91767, + "6480": 11.904, + "6485": 11.97366, + "6490": 11.9268, + "6495": 11.92497, + "6500": 12.05293, + "6505": 11.83715, + "6510": 11.86732, + "6515": 11.90038, + "6520": 11.86776, + "6525": 11.86971, + "6530": 11.85789, + "6535": 11.88616, + "6540": 11.85825, + "6545": 11.82803, + "6550": 11.89596, + "6555": 11.89246, + "6560": 11.87827, + "6565": 11.87369, + "6570": 11.88103, + "6575": 11.86696, + "6580": 11.90165, + "6585": 11.85113, + "6590": 11.85101, + "6595": 11.80896, + "6600": 11.90596, + "6605": 11.87406, + "6610": 11.8658, + "6615": 11.86475, + "6620": 11.88848, + "6625": 11.85675, + "6630": 11.84722, + "6635": 11.83752, + "6640": 11.8855, + "6645": 11.91332, + "6650": 11.86288, + "6655": 11.89588, + "6660": 11.8071, + "6665": 11.84093, + "6670": 11.88653, + "6675": 11.88047, + "6680": 11.87018, + "6685": 11.8411, + "6690": 11.82244, + "6695": 11.86596, + "6700": 11.85423, + "6705": 11.86228, + "6710": 11.86517, + "6715": 11.87189, + "6720": 11.84138, + "6725": 11.88097, + "6730": 11.90906, + "6735": 11.91578, + "6740": 11.88058, + "6745": 11.88169, + "6750": 12.03575, + "6755": 11.84511, + "6760": 11.84038, + "6765": 11.83499, + "6770": 11.87927, + "6775": 11.81349, + "6780": 13.01048, + "6785": 11.81032, + "6790": 11.93614, + "6795": 11.97801, + "6800": 11.86, + "6805": 11.83039, + "6810": 11.8441, + "6815": 11.89187, + "6820": 11.87841, + "6825": 11.86012, + "6830": 11.83442, + 
"6835": 11.85081, + "6840": 11.83799, + "6845": 11.82691, + "6850": 11.89092, + "6855": 11.82022, + "6860": 11.8279, + "6865": 11.79814, + "6870": 11.83217, + "6875": 11.90136, + "6880": 11.85295, + "6885": 11.84058, + "6890": 11.84482, + "6895": 11.82768, + "6900": 11.88337, + "6905": 11.84656, + "6910": 11.90272, + "6915": 11.8005, + "6920": 11.93804, + "6925": 12.00166, + "6930": 11.88293, + "6935": 11.9479, + "6940": 11.85228, + "6945": 11.86242, + "6950": 11.83582, + "6955": 11.81523, + "6960": 11.75894, + "6965": 11.81699, + "6970": 11.85282, + "6975": 11.84727, + "6980": 11.84729, + "6985": 12.01189, + "6990": 11.86887, + "6995": 11.88713, + "7000": 11.85612, + "7005": 11.86648, + "7010": 11.8888, + "7015": 11.84573, + "7020": 11.77395, + "7025": 11.85096, + "7030": 11.86323, + "7035": 11.84315, + "7040": 11.82293, + "7045": 11.81241, + "7050": 11.85808, + "7055": 11.86593, + "7060": 11.87475, + "7065": 11.90707, + "7070": 11.9358, + "7075": 11.84297, + "7080": 11.80853, + "7085": 11.88178, + "7090": 11.87836, + "7095": 11.85532, + "7100": 11.89414, + "7105": 11.85379, + "7110": 11.89642, + "7115": 11.85858, + "7120": 11.90327, + "7125": 11.89711, + "7130": 11.89177, + "7135": 11.88659, + "7140": 11.85757, + "7145": 11.87756, + "7150": 11.88577, + "7155": 11.86153, + "7160": 11.92297, + "7165": 11.88396, + "7170": 11.85778, + "7175": 11.91483, + "7180": 11.86232, + "7185": 11.87476, + "7190": 11.8982, + "7195": 11.88516, + "7200": 11.88158, + "7205": 11.88444, + "7210": 11.89206, + "7215": 11.87279, + "7220": 11.90742, + "7225": 11.85079, + "7230": 11.8483, + "7235": 11.90312, + "7240": 11.87181, + "7245": 11.91535, + "7250": 11.87908, + "7255": 11.92293, + "7260": 11.84549, + "7265": 11.8901, + "7270": 11.84322, + "7275": 11.848, + "7280": 11.8967, + "7285": 11.89986, + "7290": 11.95382, + "7295": 11.90753, + "7300": 11.86218, + "7305": 11.85436, + "7310": 11.85753, + "7315": 11.9134, + "7320": 11.90034, + "7325": 11.83407, + "7330": 11.85974, + "7335": 
11.90032, + "7340": 11.88835, + "7345": 11.88443, + "7350": 11.85147, + "7355": 11.86003, + "7360": 11.88911, + "7365": 11.88721, + "7370": 11.94597, + "7375": 11.88507, + "7380": 11.8675, + "7385": 11.88615, + "7390": 11.85493, + "7395": 11.9078, + "7400": 11.89976, + "7405": 11.94755, + "7410": 11.86216, + "7415": 11.81832, + "7420": 11.89699, + "7425": 11.90201, + "7430": 11.88324, + "7435": 11.84242, + "7440": 11.89387, + "7445": 11.85554, + "7450": 11.927, + "7455": 11.89196, + "7460": 11.93241, + "7465": 11.89671, + "7470": 11.8633, + "7475": 11.85785, + "7480": 11.86619, + "7485": 11.90047, + "7490": 11.93453, + "7495": 11.89595, + "7500": 11.92255, + "7505": 11.86705, + "7510": 11.86492, + "7515": 11.83778, + "7520": 12.43308, + "7525": 11.94046, + "7530": 12.11911, + "7535": 11.95645, + "7540": 12.01144, + "7545": 11.94459, + "7550": 12.00989, + "7555": 11.95308, + "7560": 12.02894, + "7565": 12.00926, + "7570": 11.88032, + "7575": 11.94986, + "7580": 11.94673, + "7585": 11.92777, + "7590": 11.96311, + "7595": 11.90291, + "7600": 11.96776, + "7605": 11.91009, + "7610": 11.98945, + "7615": 11.943, + "7620": 11.97203, + "7625": 11.87696, + "7630": 11.92313, + "7635": 11.9056, + "7640": 11.89922, + "7645": 11.93063, + "7650": 11.89735, + "7655": 11.93078, + "7660": 11.95494, + "7665": 11.91011, + "7670": 11.97093, + "7675": 11.97514, + "7680": 11.93177, + "7685": 11.8992, + "7690": 11.94571, + "7695": 11.92277, + "7700": 11.94906, + "7705": 11.92727, + "7710": 11.93604, + "7715": 11.92305, + "7720": 11.93766, + "7725": 11.95622, + "7730": 11.90603, + "7735": 11.91132, + "7740": 11.97695, + "7745": 11.96601, + "7750": 11.88967, + "7755": 11.93644, + "7760": 11.96688, + "7765": 11.92672, + "7770": 23.39259, + "7775": 23.06567, + "7780": 11.93112, + "7785": 11.93477, + "7790": 11.94106, + "7795": 11.94556, + "7800": 12.0002, + "7805": 11.97342, + "7810": 11.95163, + "7815": 11.96208, + "7820": 11.96513, + "7825": 11.93368, + "7830": 11.91708, + "7835": 11.89017, 
+ "7840": 11.94549, + "7845": 11.96002, + "7850": 11.95829, + "7855": 11.92186, + "7860": 11.93832, + "7865": 11.889, + "7870": 11.96191, + "7875": 12.05703, + "7880": 11.97288, + "7885": 11.91666, + "7890": 11.93728, + "7895": 11.96047, + "7900": 11.9818, + "7905": 11.92242, + "7910": 11.97684, + "7915": 11.91154, + "7920": 11.96828, + "7925": 11.94506, + "7930": 11.93465, + "7935": 11.90216, + "7940": 11.91383, + "7945": 11.91481, + "7950": 11.96693, + "7955": 11.94446, + "7960": 11.92358, + "7965": 11.94155, + "7970": 11.95822, + "7975": 12.03469, + "7980": 11.94102, + "7985": 11.94681, + "7990": 11.92459, + "7995": 11.92763, + "8000": 11.96299, + "8005": 11.9788, + "8010": 11.96826, + "8015": 12.02982, + "8020": 11.94329, + "8025": 11.98105, + "8030": 12.01501, + "8035": 11.96502, + "8040": 11.97586, + "8045": 11.96948, + "8050": 11.92611, + "8055": 11.93414, + "8060": 11.93961, + "8065": 11.9262, + "8070": 11.9178, + "8075": 11.90325, + "8080": 11.93833, + "8085": 11.97936, + "8090": 11.99724, + "8095": 11.94796, + "8100": 11.9625, + "8105": 11.94798, + "8110": 11.92353, + "8115": 11.96357, + "8120": 11.92451, + "8125": 11.89352, + "8130": 11.97563, + "8135": 11.97236, + "8140": 11.9723, + "8145": 11.92641, + "8150": 11.89834, + "8155": 11.94876, + "8160": 11.95465, + "8165": 11.95874, + "8170": 11.93402, + "8175": 11.96745, + "8180": 11.91172, + "8185": 11.91331, + "8190": 11.95504, + "8195": 11.94346, + "8200": 11.95192, + "8205": 11.9973, + "8210": 11.95023, + "8215": 12.03521, + "8220": 11.96486, + "8225": 11.95464, + "8230": 11.96151, + "8235": 11.95994, + "8240": 11.97909, + "8245": 11.92928, + "8250": 11.92518, + "8255": 11.94881, + "8260": 11.907, + "8265": 11.93185, + "8270": 11.9211, + "8275": 11.86366, + "8280": 12.00914, + "8285": 11.97086, + "8290": 11.98208, + "8295": 11.92309, + "8300": 11.94129, + "8305": 11.99302, + "8310": 11.97601, + "8315": 11.88862, + "8320": 11.96454, + "8325": 11.89961, + "8330": 11.99534, + "8335": 11.91687, + "8340": 
11.96466, + "8345": 11.93152, + "8350": 11.94368, + "8355": 11.92235, + "8360": 11.99578, + "8365": 11.90045, + "8370": 11.91744, + "8375": 11.92667, + "8380": 11.90428, + "8385": 11.94828, + "8390": 11.93507, + "8395": 11.9473, + "8400": 11.94267, + "8405": 11.93414, + "8410": 11.90959, + "8415": 11.92941, + "8420": 11.91201, + "8425": 11.91625, + "8430": 11.9332, + "8435": 11.99456, + "8440": 11.8869, + "8445": 11.90729, + "8450": 11.93362, + "8455": 11.96619, + "8460": 12.01359, + "8465": 11.9429, + "8470": 11.99594, + "8475": 11.95465, + "8480": 11.92489, + "8485": 11.92415, + "8490": 11.97388, + "8495": 11.89913, + "8500": 11.95945, + "8505": 11.91567, + "8510": 11.91482, + "8515": 11.93548, + "8520": 11.95743, + "8525": 11.94743, + "8530": 12.42097, + "8535": 11.9272, + "8540": 12.09436, + "8545": 12.04967, + "8550": 11.9651, + "8555": 12.03857, + "8560": 11.97265, + "8565": 11.91082, + "8570": 11.95406, + "8575": 11.94802, + "8580": 11.9942, + "8585": 11.96288, + "8590": 11.95701, + "8595": 11.97786, + "8600": 11.89715, + "8605": 11.93644, + "8610": 11.98611, + "8615": 11.91557, + "8620": 11.92076, + "8625": 11.96113, + "8630": 11.99266, + "8635": 11.93916, + "8640": 12.02781, + "8645": 11.99006, + "8650": 11.91164, + "8655": 11.91924, + "8660": 11.95194, + "8665": 12.00021, + "8670": 11.90972, + "8675": 11.96086, + "8680": 11.95175, + "8685": 11.95495, + "8690": 12.00198, + "8695": 12.07659, + "8700": 11.96371, + "8705": 11.91845, + "8710": 11.97745, + "8715": 11.93805, + "8720": 11.9173, + "8725": 11.91035, + "8730": 12.01393, + "8735": 11.98447, + "8740": 11.97475, + "8745": 11.96291, + "8750": 11.9361, + "8755": 11.96838, + "8760": 11.93695, + "8765": 12.00162, + "8770": 11.92599, + "8775": 12.0012, + "8780": 12.03738, + "8785": 11.94909, + "8790": 11.90577, + "8795": 11.97012, + "8800": 11.93035, + "8805": 11.99893, + "8810": 11.94421, + "8815": 11.98191, + "8820": 11.99062, + "8825": 11.92267, + "8830": 11.95194, + "8835": 11.937, + "8840": 11.97075, + 
"8845": 11.95007, + "8850": 12.02522, + "8855": 11.94712, + "8860": 11.96728, + "8865": 11.89285, + "8870": 11.94189, + "8875": 11.92065, + "8880": 11.98822, + "8885": 11.98285, + "8890": 11.99582, + "8895": 11.96596, + "8900": 11.94354, + "8905": 11.95473, + "8910": 11.99259, + "8915": 11.96618, + "8920": 11.93587, + "8925": 11.99413, + "8930": 12.00638, + "8935": 11.93, + "8940": 11.95031, + "8945": 11.91928, + "8950": 11.9941, + "8955": 11.94031, + "8960": 11.96914, + "8965": 11.95062, + "8970": 11.95268, + "8975": 12.03161, + "8980": 11.97245, + "8985": 12.01027, + "8990": 11.9446, + "8995": 11.96843, + "9000": 11.9429, + "9005": 11.94091, + "9010": 11.93667, + "9015": 11.95344, + "9020": 11.93207, + "9025": 11.91998, + "9030": 11.92651, + "9035": 11.97131, + "9040": 11.92008, + "9045": 11.9777, + "9050": 11.93287, + "9055": 11.96682, + "9060": 11.982, + "9065": 11.9763, + "9070": 11.92703, + "9075": 11.95149, + "9080": 11.94863, + "9085": 11.92217, + "9090": 11.92326, + "9095": 11.9586, + "9100": 11.93403, + "9105": 11.97708, + "9110": 11.97248, + "9115": 11.91899, + "9120": 11.98175, + "9125": 12.0043, + "9130": 11.98361, + "9135": 11.95811, + "9140": 11.89116, + "9145": 11.92833, + "9150": 11.96999, + "9155": 11.95682, + "9160": 11.93898, + "9165": 11.98676, + "9170": 11.96776, + "9175": 11.91735, + "9180": 11.96488, + "9185": 11.93801, + "9190": 11.93829, + "9195": 11.96444, + "9200": 11.91924, + "9205": 11.99554, + "9210": 11.91977, + "9215": 11.99739, + "9220": 11.92053, + "9225": 11.93702, + "9230": 11.95815, + "9235": 12.05346, + "9240": 11.9596, + "9245": 11.97173, + "9250": 11.94092, + "9255": 11.94632, + "9260": 12.00354, + "9265": 11.96854, + "9270": 11.91621, + "9275": 11.94709, + "9280": 11.93375, + "9285": 11.92465, + "9290": 11.93047, + "9295": 11.93184, + "9300": 11.95538, + "9305": 11.96102, + "9310": 11.93874, + "9315": 11.94123, + "9320": 11.95854, + "9325": 11.98961, + "9330": 11.87394, + "9335": 11.97986, + "9340": 12.02583, + "9345": 
11.94202, + "9350": 12.00113, + "9355": 11.97405, + "9360": 11.96746, + "9365": 11.96018, + "9370": 11.9475, + "9375": 11.94327, + "9380": 11.92135, + "9385": 12.01574, + "9390": 11.95494, + "9395": 11.93529, + "9400": 11.96463, + "9405": 11.9807, + "9410": 11.92926, + "9415": 11.95919, + "9420": 11.94796, + "9425": 11.94261, + "9430": 11.94968, + "9435": 11.9655, + "9440": 11.94016, + "9445": 11.98541, + "9450": 11.94602, + "9455": 11.96365, + "9460": 11.9884, + "9465": 11.93962, + "9470": 11.93471, + "9475": 11.91073, + "9480": 11.92557, + "9485": 11.93537, + "9490": 11.97267, + "9495": 11.93521, + "9500": 11.92542, + "9505": 12.00627, + "9510": 11.9749, + "9515": 11.97511, + "9520": 11.88493, + "9525": 11.91739, + "9530": 11.92418, + "9535": 11.97024 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etc1cp1_gb_200_release_sm/model_config.yml b/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etc1cp1_gb_200_release_sm/model_config.yml new file mode 100644 index 00000000000..1ad8597d932 --- /dev/null +++ b/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etc1cp1_gb_200_release_sm/model_config.yml @@ -0,0 +1,169 @@ +# The proxy model is used for local code quality check. +# The proxy model should contain all the necessary components and settings but fewer parameters. 
+ENV_VARS: + TORCH_NCCL_AVOID_RECORD_STREAMS: 0 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 + PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True + NCCL_NVLS_ENABLE: 0 + NVTE_FUSED_ATTN: 1 + NVTE_NORM_FWD_USE_CUDNN: 1 + NVTE_NORM_BWD_USE_CUDNN: 1 + PYTHONWARNINGS: ignore + NCCL_DEBUG: VERSION + NON_DETERMINSTIC_RESULTS: 1 + NVSHMEM_IB_ENABLE_IBGDA: 0 + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN: 16 + USE_MNNVL: 1 +TEST_TYPE: "release" +MODEL_ARGS: + # Distributed args + --distributed-timeout-minutes: 60 + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 4 + --pipeline-model-parallel-layout: Et*2\\|\\(tt\\|\\)*5t\\|tmL # Et*2|(tt|)*5t|tmL + --expert-model-parallel-size: 16 + --context-parallel-size: 1 + --expert-tensor-parallel-size: 1 + --use-distributed-optimizer: true + --overlap-grad-reduce: true + --overlap-param-gather: true + + # Training args + --use-mcore-models: true + --sequence-parallel: true + --use-flash-attn: true + --disable-bias-linear: true + --micro-batch-size: 1 + --global-batch-size: 512 + --train-samples: 24414062 + --exit-duration-in-mins: 220 + --no-check-for-nan-in-loss-and-grad: true + --cross-entropy-loss-fusion: true + --cross-entropy-fusion-impl: te + --manual-gc: true + --manual-gc-interval: 10 + + # Transformer Engine args + --transformer-impl: transformer_engine + + # Data args + --seq-length: 4096 + --data-cache-path: ${DATA_CACHE_PATH} + --tokenizer-type: GPTSentencePieceTokenizer + --tokenizer-model: ${DATA_PATH}/utils/nemotron_2_256k.model + --data-path: $DATA_BLEND + --split: 99,1,0 + --no-mmap-bin-files: true + --no-create-attention-mask-in-dataloader: true + --num-workers: 6 + + # Add network size args + --num-layers: 14 # original 61 layers + --hidden-size: 7168 + --ffn-hidden-size: 18432 + --num-attention-heads: 128 + --kv-channels: 128 + --max-position-embeddings: 4096 + --position-embedding-type: rope + --rotary-base: 10000 + --make-vocab-size-divisible-by: 3232 + --normalization: 
RMSNorm + --norm-epsilon: 1e-6 + --swiglu: true + --untie-embeddings-and-output-weights: true + --multi-latent-attention: true + --mtp-num-layers: 1 + --mtp-loss-scaling-factor: 0.1 + + # Add regularization args + --attention-dropout: 0.0 + --hidden-dropout: 0.0 + --clip-grad: 1.0 + --weight-decay: 0.1 + --qk-layernorm: true + + # Add learning rate args + --lr-decay-samples: 24413696 + --lr-warmup-samples: 1536000 + --lr-warmup-init: 1e-7 + --lr: 1e-5 + --min-lr: 1e-6 + --lr-decay-style: cosine + --adam-beta1: 0.9 + --adam-beta2: 0.95 + + # Add MoE args + --num-experts: 64 # local 4 + 1 shared, EP16 + --moe-layer-freq: ([0]*3+[1]*11) + --moe-ffn-hidden-size: 2048 + --moe-shared-expert-intermediate-size: 2048 + --moe-router-load-balancing-type: seq_aux_loss + --moe-router-topk: 8 + --moe-token-dispatcher-type: flex + --moe-flex-dispatcher-backend: hybridep + --moe-router-pre-softmax: true + --moe-grouped-gemm: true + --moe-aux-loss-coeff: 1e-4 + --moe-router-group-topk: 4 + --moe-router-num-groups: 8 + --moe-router-topk-scaling-factor: 2.5 + --moe-router-score-function: sigmoid + --moe-router-enable-expert-bias: true + --moe-router-bias-update-rate: 1e-3 + --moe-router-dtype: fp32 + --moe-permute-fusion: true + + # Add MLA args + --q-lora-rank: 1536 + --kv-lora-rank: 512 + --qk-head-dim: 128 + --qk-pos-emb-head-dim: 64 + --v-head-dim: 128 + --rotary-scaling-factor: 40 + --mscale: 1.0 + --mscale-all-dim: 1.0 + + # Add validation args + --eval-iters: 32 + --eval-interval: 200 + + # Add checkpointing args + --auto-detect-ckpt-format: + true + # Add checkpointing args + --save: ${CHECKPOINT_SAVE_PATH} + --load: ${CHECKPOINT_LOAD_PATH} + --save-interval: 500 + --save-retain-interval: 10000 + --dist-ckpt-strictness: log_all + + # Add initialization args + --init-method-std: 0.02 + + # Add logging args + --log-timers-to-tensorboard: true + --log-memory-to-tensorboard: true + --log-num-zeros-in-grad: true + --log-params-norm: true + --log-validation-ppl-to-tensorboard: true 
+ --log-throughput: true + --log-interval: 1 + --logging-level: 40 + --tensorboard-dir: ${TENSORBOARD_PATH} + --wandb-project: megatron-core-release-runs + --wandb-entity: adlr + --wandb-exp-name: ${WANDB_EXPERIMENT} + --wandb-save-dir: ${WANDB_SAVE_PATH} + + # Add mixed precision args + --bf16: true + + # enable experimental + --enable-experimental: true + --exit-interval: 9536 +METRICS: + - "iteration-time" + - "lm loss" + - "mem-allocated-bytes" + - "mem-max-allocated-bytes" diff --git a/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etc1cp1_release_sm/model_config.yml b/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etc1cp1_release_sm/model_config.yml index 9c7d2496e2a..cc8f2b814c2 100644 --- a/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etc1cp1_release_sm/model_config.yml +++ b/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etc1cp1_release_sm/model_config.yml @@ -13,7 +13,7 @@ ENV_VARS: NON_DETERMINSTIC_RESULTS: 1 NVSHMEM_IB_ENABLE_IBGDA: 0 CUDA_DEVICE_MAX_CONNECTIONS: 1 -TEST_TYPE: 'release' +TEST_TYPE: "release" MODEL_ARGS: # Distributed args --distributed-timeout-minutes: 60 @@ -150,6 +150,7 @@ MODEL_ARGS: --logging-level: 40 --tensorboard-dir: ${TENSORBOARD_PATH} --wandb-project: megatron-core-release-runs + --wandb-entity: adlr --wandb-exp-name: ${WANDB_EXPERIMENT} --wandb-save-dir: ${WANDB_SAVE_PATH} @@ -160,7 +161,7 @@ MODEL_ARGS: --enable-experimental: true --exit-interval: 9536 METRICS: - - 'iteration-time' - - 'lm loss' - - 'mem-allocated-bytes' - - 'mem-max-allocated-bytes' + - "iteration-time" + - "lm loss" + - "mem-allocated-bytes" + - "mem-max-allocated-bytes" diff --git a/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_gb_200_release/model_config.yaml b/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_gb_200_release/model_config.yaml new file mode 100644 index 
00000000000..ced409e5b1e --- /dev/null +++ b/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_gb_200_release/model_config.yaml @@ -0,0 +1,168 @@ +# The proxy model is used for local code quality check. +# The proxy model should contain all the necessary components and settings but fewer parameters. +ENV_VARS: + TORCH_NCCL_AVOID_RECORD_STREAMS: 0 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 + PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True + NCCL_NVLS_ENABLE: 0 + NVTE_FUSED_ATTN: 1 + NVTE_NORM_FWD_USE_CUDNN: 1 + NVTE_NORM_BWD_USE_CUDNN: 1 + PYTHONWARNINGS: ignore + NCCL_DEBUG: VERSION + NON_DETERMINSTIC_RESULTS: 1 + NVSHMEM_IB_ENABLE_IBGDA: 0 + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN: 16 + USE_MNNVL: 1 +TEST_TYPE: "release" +MODEL_ARGS: + # Distributed args + --distributed-timeout-minutes: 60 + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 4 + --pipeline-model-parallel-layout: Et*2\\|\\(tt\\|\\)*5t\\|tmL # Et*2|(tt|)*5t|tmL + --expert-model-parallel-size: 16 + --context-parallel-size: 1 + --expert-tensor-parallel-size: 1 + --use-distributed-optimizer: true + --overlap-grad-reduce: true + --overlap-param-gather: true + + # Training args + --use-mcore-models: true + --sequence-parallel: true + --use-flash-attn: true + --disable-bias-linear: true + --micro-batch-size: 1 + --global-batch-size: 512 + --train-samples: 24414062 + --exit-duration-in-mins: 220 + --no-check-for-nan-in-loss-and-grad: true + --cross-entropy-loss-fusion: true + --cross-entropy-fusion-impl: te + --manual-gc: true + --manual-gc-interval: 10 + + # Transformer Engine args + --transformer-impl: transformer_engine + + # Data args + --seq-length: 4096 + --data-cache-path: ${DATA_CACHE_PATH} + --tokenizer-type: GPTSentencePieceTokenizer + --tokenizer-model: ${DATA_PATH}/utils/nemotron_2_256k.model + --data-path: $DATA_BLEND + --split: 99,1,0 + --no-mmap-bin-files: true + --no-create-attention-mask-in-dataloader: true + 
--num-workers: 6 + + # Add network size args + --num-layers: 14 # original 61 layers + --hidden-size: 7168 + --ffn-hidden-size: 18432 + --num-attention-heads: 128 + --kv-channels: 128 + --max-position-embeddings: 4096 + --position-embedding-type: rope + --rotary-base: 10000 + --make-vocab-size-divisible-by: 3232 + --normalization: RMSNorm + --norm-epsilon: 1e-6 + --swiglu: true + --untie-embeddings-and-output-weights: true + --multi-latent-attention: true + --mtp-num-layers: 1 + --mtp-loss-scaling-factor: 0.1 + + # Add regularization args + --attention-dropout: 0.0 + --hidden-dropout: 0.0 + --clip-grad: 1.0 + --weight-decay: 0.1 + --qk-layernorm: true + + # Add learning rate args + --lr-decay-samples: 24413696 + --lr-warmup-samples: 1536000 + --lr-warmup-init: 1e-7 + --lr: 1e-5 + --min-lr: 1e-6 + --lr-decay-style: cosine + --adam-beta1: 0.9 + --adam-beta2: 0.95 + + # Add MoE args + --num-experts: 64 # local 4 + 1 shared, EP16 + --moe-layer-freq: ([0]*3+[1]*11) + --moe-ffn-hidden-size: 2048 + --moe-shared-expert-intermediate-size: 2048 + --moe-router-load-balancing-type: seq_aux_loss + --moe-router-topk: 8 + --moe-token-dispatcher-type: flex + --moe-flex-dispatcher-backend: hybridep + --moe-router-pre-softmax: true + --moe-grouped-gemm: true + --moe-aux-loss-coeff: 1e-4 + --moe-router-group-topk: 4 + --moe-router-num-groups: 8 + --moe-router-topk-scaling-factor: 2.5 + --moe-router-score-function: sigmoid + --moe-router-enable-expert-bias: true + --moe-router-bias-update-rate: 1e-3 + --moe-router-dtype: fp32 + --moe-permute-fusion: true + + # Add MLA args + --q-lora-rank: 1536 + --kv-lora-rank: 512 + --qk-head-dim: 128 + --qk-pos-emb-head-dim: 64 + --v-head-dim: 128 + --rotary-scaling-factor: 40 + --mscale: 1.0 + --mscale-all-dim: 1.0 + + # Add validation args + --eval-iters: 32 + --eval-interval: 200 + + # Add checkpointing args + --auto-detect-ckpt-format: + true + # Add checkpointing args + --save: ${CHECKPOINT_SAVE_PATH} + --load: ${CHECKPOINT_LOAD_PATH} + 
--save-interval: 500 + --save-retain-interval: 10000 + --dist-ckpt-strictness: log_all + + # Add initialization args + --init-method-std: 0.02 + + # Add logging args + --log-timers-to-tensorboard: true + --log-memory-to-tensorboard: true + --log-num-zeros-in-grad: true + --log-params-norm: true + --log-validation-ppl-to-tensorboard: true + --log-throughput: true + --log-interval: 1 + --logging-level: 40 + --tensorboard-dir: ${TENSORBOARD_PATH} + --wandb-project: megatron-core-release-runs + --wandb-entity: adlr + --wandb-exp-name: ${WANDB_EXPERIMENT} + --wandb-save-dir: ${WANDB_SAVE_PATH} + + # Add mixed precision args + --bf16: true + + # enable experimental + --enable-experimental: true +METRICS: + - "iteration-time" + - "lm loss" + - "mem-allocated-bytes" + - "mem-max-allocated-bytes" diff --git a/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_release/model_config.yaml b/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_release/model_config.yaml index c16fedc7860..7bc14780fb3 100644 --- a/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_release/model_config.yaml +++ b/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_release/model_config.yaml @@ -12,11 +12,12 @@ ENV_VARS: NCCL_DEBUG: VERSION NON_DETERMINSTIC_RESULTS: 1 NVSHMEM_IB_ENABLE_IBGDA: 0 + CUDA_DEVICE_MAX_CONNECTIONS: 1 TEST_TYPE: "release" MODEL_ARGS: # Distributed args --distributed-timeout-minutes: 60 - --tensor-model-parallel-size: 1 + --tensor-model-parallel-size: 2 --pipeline-model-parallel-size: 4 --pipeline-model-parallel-layout: Et*2\\|\\(tt\\|\\)*5t\\|tmL # Et*2|(tt|)*5t|tmL --expert-model-parallel-size: 16 @@ -47,8 +48,8 @@ MODEL_ARGS: # Data args --seq-length: 4096 --data-cache-path: ${DATA_CACHE_PATH} - --tokenizer-type: HuggingFaceTokenizer - --tokenizer-model: ${TOKENIZER_PATH} + --tokenizer-type: GPTSentencePieceTokenizer + --tokenizer-model: 
${DATA_PATH}/utils/nemotron_2_256k.model --data-path: $DATA_BLEND --split: 99,1,0 --no-mmap-bin-files: true @@ -81,12 +82,11 @@ MODEL_ARGS: --qk-layernorm: true # Add learning rate args - --lr-decay-samples: 584765624 + --lr-decay-samples: 24413696 --lr-warmup-samples: 1536000 - # Learning rate scaled down from 7.3e-6 (DeepSeek-V3 technical report, GBS=15360) to 3.9e-6 (GBS=8192) - --lr-warmup-init: 3.9e-7 - --lr: 3.9e-6 - --min-lr: 3.9e-7 + --lr-warmup-init: 1e-7 + --lr: 1e-5 + --min-lr: 1e-6 --lr-decay-style: cosine --adam-beta1: 0.9 --adam-beta2: 0.95 @@ -127,8 +127,6 @@ MODEL_ARGS: --eval-interval: 200 # Add checkpointing args - --no-load-optim: true - --no-load-rng: true --auto-detect-ckpt-format: true # Add checkpointing args @@ -152,6 +150,7 @@ MODEL_ARGS: --logging-level: 40 --tensorboard-dir: ${TENSORBOARD_PATH} --wandb-project: megatron-core-release-runs + --wandb-entity: adlr --wandb-exp-name: ${WANDB_EXPERIMENT} --wandb-save-dir: ${WANDB_SAVE_PATH} diff --git a/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml b/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml index e2b8b212900..efe39998065 100644 --- a/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml +++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml @@ -92,6 +92,7 @@ MODEL_ARGS: --log-interval: 1 --tensorboard-dir: ${TENSORBOARD_PATH} --wandb-project: megatron-core-release-runs + --wandb-entity: adlr --wandb-exp-name: ${WANDB_EXPERIMENT} --wandb-save-dir: ${WANDB_SAVE_PATH} # Add mixed precision args diff --git a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/model_config.yaml b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/model_config.yaml index a02fbe99537..f4476c712f2 100644 --- 
a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/model_config.yaml +++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/model_config.yaml @@ -92,6 +92,7 @@ MODEL_ARGS: --log-interval: 1 --tensorboard-dir: ${TENSORBOARD_PATH} --wandb-project: megatron-core-release-runs + --wandb-entity: adlr --wandb-exp-name: ${WANDB_EXPERIMENT} --wandb-save-dir: ${WANDB_SAVE_PATH} # Add mixed precision args diff --git a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/model_config.yaml b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/model_config.yaml index b43a1227ea0..cfeb7709839 100644 --- a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/model_config.yaml +++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/model_config.yaml @@ -92,6 +92,7 @@ MODEL_ARGS: --log-interval: 1 --tensorboard-dir: ${TENSORBOARD_PATH} --wandb-project: megatron-core-release-runs + --wandb-entity: adlr --wandb-exp-name: ${WANDB_EXPERIMENT} --wandb-save-dir: ${WANDB_SAVE_PATH} # Add mixed precision args diff --git a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml index 1fdad2a5c70..29dcefadf0e 100644 --- a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml +++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml @@ -94,6 +94,7 @@ MODEL_ARGS: --log-interval: 1 --tensorboard-dir: ${TENSORBOARD_PATH} --wandb-project: megatron-core-release-runs + --wandb-entity: adlr --wandb-exp-name: ${WANDB_EXPERIMENT} --wandb-save-dir: ${WANDB_SAVE_PATH} # Add mixed precision args diff --git a/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml 
b/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml index 852fbf9819d..a7abdc1bdd4 100644 --- a/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml @@ -58,6 +58,7 @@ MODEL_ARGS: --log-validation-ppl-to-tensorboard: true --timing-log-level: 0 --wandb-project: megatron-core-release-runs + --wandb-entity: adlr --wandb-exp-name: ${WANDB_EXPERIMENT} --wandb-save-dir: ${WANDB_SAVE_PATH} METRICS: From 14b70c72fbdb7775ad37c68d8383ad6ab6b96ba0 Mon Sep 17 00:00:00 2001 From: wdykas <73254672+wdykas@users.noreply.github.com> Date: Fri, 30 Jan 2026 13:40:23 -0500 Subject: [PATCH 008/231] Nvshmem refit (#2696) Co-authored-by: root Co-authored-by: root Co-authored-by: root Co-authored-by: root Co-authored-by: William Dykas Co-authored-by: root Co-authored-by: root Co-authored-by: root Co-authored-by: root Co-authored-by: root Co-authored-by: root Co-authored-by: root Co-authored-by: root Co-authored-by: root Co-authored-by: root Co-authored-by: root Co-authored-by: root --- megatron/core/resharding/__init__.py | 9 +- .../core/resharding/copy_services/__init__.py | 3 +- .../copy_services/nvshmem_copy_service.py | 173 ++++++++ .../nvshmem_copy_service/__init__.py | 29 ++ .../nvshmem_copy_service/core/__init__.py | 9 + .../core/gpu_resource_manager.py | 192 +++++++++ .../core/kernel_launcher.py | 147 +++++++ .../core/pipeline_executor.py | 275 ++++++++++++ .../kernels/chunked_kernel.cu | 103 +++++ .../resharding/nvshmem_copy_service/logger.py | 209 +++++++++ .../nvshmem_copy_service/memory/__init__.py | 8 + .../memory/double_buffer_manager.py | 86 ++++ .../memory/tensor_pointer_utils.py | 45 ++ .../nvshmem_copy_service/nvshmem_types.py | 73 ++++ .../nvshmem_copy_service/planning/__init__.py | 10 + .../planning/communication_scheduler.py | 181 ++++++++ .../planning/gpu_execution_planner.py | 222 ++++++++++ .../planning/task_segmenter.py | 100 +++++ .../planning/workload_packer.py | 
89 ++++ .../nvshmem_copy_service/service.py | 408 ++++++++++++++++++ .../nvshmem_copy_service/validation.py | 145 +++++++ megatron/core/resharding/refit.py | 49 ++- megatron/training/arguments.py | 6 +- .../unit_tests/resharding/test_model_swap.py | 22 +- 24 files changed, 2577 insertions(+), 16 deletions(-) create mode 100644 megatron/core/resharding/copy_services/nvshmem_copy_service.py create mode 100644 megatron/core/resharding/nvshmem_copy_service/__init__.py create mode 100644 megatron/core/resharding/nvshmem_copy_service/core/__init__.py create mode 100644 megatron/core/resharding/nvshmem_copy_service/core/gpu_resource_manager.py create mode 100644 megatron/core/resharding/nvshmem_copy_service/core/kernel_launcher.py create mode 100644 megatron/core/resharding/nvshmem_copy_service/core/pipeline_executor.py create mode 100644 megatron/core/resharding/nvshmem_copy_service/kernels/chunked_kernel.cu create mode 100644 megatron/core/resharding/nvshmem_copy_service/logger.py create mode 100644 megatron/core/resharding/nvshmem_copy_service/memory/__init__.py create mode 100644 megatron/core/resharding/nvshmem_copy_service/memory/double_buffer_manager.py create mode 100644 megatron/core/resharding/nvshmem_copy_service/memory/tensor_pointer_utils.py create mode 100644 megatron/core/resharding/nvshmem_copy_service/nvshmem_types.py create mode 100644 megatron/core/resharding/nvshmem_copy_service/planning/__init__.py create mode 100644 megatron/core/resharding/nvshmem_copy_service/planning/communication_scheduler.py create mode 100644 megatron/core/resharding/nvshmem_copy_service/planning/gpu_execution_planner.py create mode 100644 megatron/core/resharding/nvshmem_copy_service/planning/task_segmenter.py create mode 100644 megatron/core/resharding/nvshmem_copy_service/planning/workload_packer.py create mode 100644 megatron/core/resharding/nvshmem_copy_service/service.py create mode 100644 megatron/core/resharding/nvshmem_copy_service/validation.py diff --git 
a/megatron/core/resharding/__init__.py b/megatron/core/resharding/__init__.py index d06484eef37..083c4518c0e 100644 --- a/megatron/core/resharding/__init__.py +++ b/megatron/core/resharding/__init__.py @@ -1,7 +1,12 @@ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. from .execution import execute_reshard_plan from .planner import build_centralized_reshard_plan -from .refit import reshard_model_weights, swap_model_weights +from .refit import ( + clear_service_cache, + get_or_create_service, + reshard_model_weights, + swap_model_weights, +) from .utils import ParameterMetadata, ReshardPlan, ShardingDescriptor, TransferOp __all__ = [ @@ -9,6 +14,8 @@ "execute_reshard_plan", "swap_model_weights", "reshard_model_weights", + "get_or_create_service", + "clear_service_cache", "ParameterMetadata", "ShardingDescriptor", "TransferOp", diff --git a/megatron/core/resharding/copy_services/__init__.py b/megatron/core/resharding/copy_services/__init__.py index 15986e4d28e..447588f7b3a 100644 --- a/megatron/core/resharding/copy_services/__init__.py +++ b/megatron/core/resharding/copy_services/__init__.py @@ -3,5 +3,6 @@ from .base import CopyService from .nccl_copy_service import NCCLCopyService +from .nvshmem_copy_service import NVSHMEMCopyService -__all__ = ["CopyService", "NCCLCopyService"] +__all__ = ["CopyService", "NCCLCopyService", "NVSHMEMCopyService"] diff --git a/megatron/core/resharding/copy_services/nvshmem_copy_service.py b/megatron/core/resharding/copy_services/nvshmem_copy_service.py new file mode 100644 index 00000000000..8d231de5339 --- /dev/null +++ b/megatron/core/resharding/copy_services/nvshmem_copy_service.py @@ -0,0 +1,173 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ +from __future__ import annotations + +import logging +from typing import Dict + +import torch +import torch.distributed as dist + +from ..nvshmem_copy_service import RemoteCopyService +from .base import CopyService + +logger = logging.getLogger(__name__) + + +class NVSHMEMCopyService(CopyService): + """CopyService implementation backed by NVSHMEM RemoteCopyService.""" + + def __init__(self): + if not dist.is_initialized(): + raise RuntimeError("torch.distributed must be initialized before NVSHMEMCopyService()") + + self.rank = dist.get_rank() + self._remote = RemoteCopyService() + # Lazily initialized on first use to avoid side effects at import time + self._initialized = False + + # NOTE: keep the original typed tensors here (not uint8 views) so local copies + # preserve shape/strides semantics and avoid byte-offset pitfalls. + self._local_send_ops: Dict[int, torch.Tensor] = {} + self._local_recv_ops: Dict[int, torch.Tensor] = {} + self._local_copy_stream = torch.cuda.Stream() + + logger.info("NVSHMEMCopyService constructed") + + def _ensure_initialized(self): + if not self._initialized: + self._remote.init(log_level="INFO") + self._initialized = True + logger.info( + "NVSHMEMCopyService initialized: PE %d / %d", self._remote.my_pe, self._remote.n_pes + ) + + def submit_send(self, src_tensor: torch.Tensor, dest_rank: int): + """ + Basic CopyService API is not rich enough to drive the NVSHMEM planner + (it lacks a globally shared task identifier), so this method is kept + only for interface compatibility and should not be used directly. + + The resharding path calls into NVSHMEMCopyService via the + submit_send_with_id/submit_recv_with_id helpers instead. + """ + raise RuntimeError( + "NVSHMEMCopyService.submit_send() is not supported; " + "use submit_send_with_id(...) from execute_reshard_plan." 
+ ) + + def submit_recv(self, dest_tensor: torch.Tensor, src_rank: int): + raise RuntimeError( + "NVSHMEMCopyService.submit_recv() is not supported; " + "use submit_recv_with_id(...) from execute_reshard_plan." + ) + + # + # New helper API used from execute_reshard_plan via monkey-patching: + # we avoid changing the existing execute_reshard_plan signature by adding + # a small adapter layer that batches up matched send/recv slices. + # + + def submit_send_with_id(self, task_id: int, src_tensor: torch.Tensor, dest_rank: int): + """Register a send with an explicit, globally shared task_id.""" + self._ensure_initialized() + + if not src_tensor.is_contiguous(): + src_tensor = src_tensor.contiguous() + + # Local transfers: keep them out of RemoteCopyService entirely. + if dest_rank == self.rank: + self._local_send_ops[task_id] = src_tensor + return + + num_bytes = src_tensor.numel() * src_tensor.element_size() + src_bytes = src_tensor.view(torch.uint8) + + logger.debug( + "NVSHMEMCopyService: register_send task_id=%d, %d bytes (%d → %d)", + task_id, + num_bytes, + self.rank, + dest_rank, + ) + + # Use public API on RemoteCopyService + self._remote.register_send( + task_id=task_id, src_tensor=src_bytes, src_pos=0, size=num_bytes, dest_pe=dest_rank + ) + + def submit_recv_with_id(self, task_id: int, dest_tensor: torch.Tensor, src_rank: int): + """Register a recv with an explicit, globally shared task_id.""" + self._ensure_initialized() + + if not dest_tensor.is_contiguous(): + dest_tensor = dest_tensor.contiguous() + + # Local transfers: keep them out of RemoteCopyService entirely. 
+ if src_rank == self.rank: + self._local_recv_ops[task_id] = dest_tensor + return + + num_bytes = dest_tensor.numel() * dest_tensor.element_size() + dst_bytes = dest_tensor.view(torch.uint8) + + logger.debug( + "NVSHMEMCopyService: register_recv task_id=%d, %d bytes (%d ← %d)", + task_id, + num_bytes, + self.rank, + src_rank, + ) + + self._remote.register_receive( + task_id=task_id, dest_tensor=dst_bytes, dest_pos=0, size=num_bytes, src_pe=src_rank + ) + + def run(self): + """ + Execute all registered transfer pairs via NVSHMEM. + + This converts the registered pairs into RemoteCopyService send/receive + requests, builds a schedule, runs the pipelined NVSHMEM transfer, and + then clears internal state. + """ + self._ensure_initialized() + + # 1) Run same-rank copies (match by task_id), like NCCL backend. + if self._local_send_ops or self._local_recv_ops: + missing_sends = set(self._local_recv_ops.keys()) - set(self._local_send_ops.keys()) + missing_recvs = set(self._local_send_ops.keys()) - set(self._local_recv_ops.keys()) + if missing_sends or missing_recvs: + raise RuntimeError( + "NVSHMEMCopyService: unmatched local ops on rank " + f"{self.rank}: missing_sends={sorted(list(missing_sends))[:10]} " + f"missing_recvs={sorted(list(missing_recvs))[:10]}" + ) + + with torch.no_grad(): + with torch.cuda.stream(self._local_copy_stream): + for task_id, dst in self._local_recv_ops.items(): + src = self._local_send_ops[task_id] + if src.numel() != dst.numel() or src.element_size() != dst.element_size(): + raise RuntimeError( + "NVSHMEMCopyService: local copy size mismatch on rank " + f"{self.rank} task_id={task_id}: " + f"src=({tuple(src.shape)}, {src.dtype}) " + f"dst=({tuple(dst.shape)}, {dst.dtype})" + ) + dst.copy_(src, non_blocking=True) + + torch.cuda.current_stream().wait_stream(self._local_copy_stream) + self._local_send_ops.clear() + self._local_recv_ops.clear() + + # 2) Execute remote schedule (if any remote sends/recvs were registered). 
+ if not self._remote.send_requests and not self._remote.receive_requests: + logger.info("NVSHMEMCopyService: no remote requests; local copies complete") + return + + logger.info("NVSHMEMCopyService: building NVSHMEM schedule and executing") + self._remote.schedule() + self._remote.run() + self._remote.clear_requests() + logger.info("NVSHMEMCopyService: NVSHMEM transfers complete") diff --git a/megatron/core/resharding/nvshmem_copy_service/__init__.py b/megatron/core/resharding/nvshmem_copy_service/__init__.py new file mode 100644 index 00000000000..2ab8cde81fe --- /dev/null +++ b/megatron/core/resharding/nvshmem_copy_service/__init__.py @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +""" +NVSHMEM-based remote copy service and supporting components. + +This package is an in-tree integration of the standalone +`nvshmem_copy_service/python` implementation so that Megatron +can use it without relying on an external library. +""" + +from . import nvshmem_types +from .core import GPUResourceManager, KernelLauncher, PipelineExecutor +from .memory import DoubleBufferManager, TensorPointerExtractor +from .planning import CommunicationScheduler, GPUExecutionPlanner, TaskSegmenter, WorkloadPacker +from .service import RemoteCopyService + +__all__ = [ + "RemoteCopyService", + "nvshmem_types", + "GPUResourceManager", + "KernelLauncher", + "PipelineExecutor", + "DoubleBufferManager", + "TensorPointerExtractor", + "CommunicationScheduler", + "GPUExecutionPlanner", + "TaskSegmenter", + "WorkloadPacker", +] diff --git a/megatron/core/resharding/nvshmem_copy_service/core/__init__.py b/megatron/core/resharding/nvshmem_copy_service/core/__init__.py new file mode 100644 index 00000000000..f466e925899 --- /dev/null +++ b/megatron/core/resharding/nvshmem_copy_service/core/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
# ---- megatron/core/resharding/nvshmem_copy_service/core/gpu_resource_manager.py ----
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

"""
GPU resource management for NVSHMEM operations.

Handles NVSHMEM initialization, CUDA device setup, stream management,
and event lifecycle.
"""

import logging
from typing import Dict, Optional

try:
    import nvshmem.core
    from cuda.core.experimental import Device

    HAVE_NVSHMEM = True
except ImportError:
    HAVE_NVSHMEM = False

import torch
import torch.distributed as dist

logger = logging.getLogger(__name__)


class GPUResourceManager:
    """Manages GPU resources including NVSHMEM, streams, and events."""

    # Names of the streams created by init(); each name ``n`` is stored as
    # ``self.<n>_stream`` with a PyTorch wrapper in ``self.torch_<n>_stream``.
    _STREAM_NAMES = ("pack", "unpack", "send", "copy")

    def __init__(self):
        self.device = None
        self.my_pe: int = -1
        self.n_pes: int = -1
        self.initialized: bool = False

        # CUDA streams (cuda.core.experimental)
        self.pack_stream = None
        self.unpack_stream = None
        self.send_stream = None
        self.copy_stream = None

        # PyTorch stream wrappers
        self.torch_pack_stream = None
        self.torch_unpack_stream = None
        self.torch_send_stream = None
        self.torch_copy_stream = None

        # Stream name to PyTorch stream mapping
        self._torch_streams: Dict[str, torch.cuda.ExternalStream] = {}

    def init(self) -> None:
        """
        Initialize NVSHMEM, CUDA device, and streams.

        Expects torch.distributed to be already initialized.  Calling it again
        after a successful initialization is a no-op.

        Raises:
            RuntimeError: if nvshmem could not be imported or torch.distributed
                is not initialized.
        """
        if self.initialized:
            return

        if not HAVE_NVSHMEM:
            raise RuntimeError(
                "nvshmem.core is not available. Please install nvshmem to use GPUResourceManager."
            )

        # torch.distributed must be initialized before calling this
        if not dist.is_initialized():
            raise RuntimeError(
                "torch.distributed must be initialized before GPUResourceManager.init()"
            )

        # Get current CUDA device (already set by caller based on LOCAL_RANK)
        local_rank = torch.cuda.current_device()

        # nvshmem4py requires a cuda.core Device at init time
        self.device = Device(local_rank)
        self.device.set_current()

        # Extract rank, nranks from the default process group
        num_ranks = dist.get_world_size()
        rank_id = dist.get_rank()

        # Create/Broadcast UniqueID using broadcast_object_list
        uniqueid = nvshmem.core.get_unique_id(empty=True)
        if rank_id == 0:
            uniqueid = nvshmem.core.get_unique_id()
            broadcast_objects = [uniqueid]
        else:
            broadcast_objects = [None]

        # Broadcast ID to all ranks using the default group
        dist.broadcast_object_list(broadcast_objects, src=0)

        # Barrier to ensure everyone has the ID before NVSHMEM init
        dist.barrier()

        # Initialize NVSHMEM with the broadcasted UID
        nvshmem.core.init(
            device=self.device,
            uid=broadcast_objects[0],
            rank=rank_id,
            nranks=num_ranks,
            initializer_method="uid",
        )

        logger.info("NVSHMEM initialized")

        self.my_pe = nvshmem.core.my_pe()
        self.n_pes = nvshmem.core.n_pes()

        # Create each CUDA stream, wrap its raw pointer for PyTorch, and
        # register the wrapper under the stream's name.
        self._torch_streams = {}
        for name in self._STREAM_NAMES:
            stream = self.device.create_stream()
            _, stream_ptr = stream.__cuda_stream__()
            torch_stream = torch.cuda.ExternalStream(stream_ptr)
            setattr(self, f"{name}_stream", stream)
            setattr(self, f"torch_{name}_stream", torch_stream)
            self._torch_streams[name] = torch_stream

        logger.info("Stream mapping built")

        # Initial barrier to ensure all PEs are ready
        nvshmem.core.barrier_all(stream=self.send_stream)

        # Only report success once the whole sequence, barrier included, has
        # been issued; a failure above leaves the manager retryable.
        self.initialized = True

    def get_stream(self, name: str):
        """
        Get CUDA stream by name.

        Args:
            name: Stream name ('pack', 'unpack', 'send', 'copy')

        Returns:
            CUDA stream object, or None for an unknown name or before init().
        """
        if name not in self._STREAM_NAMES:
            return None
        return getattr(self, f"{name}_stream")

    def get_torch_stream(self, name: str) -> Optional[torch.cuda.ExternalStream]:
        """
        Get PyTorch ExternalStream by name.

        Args:
            name: Stream name ('pack', 'unpack', 'send', 'copy')

        Returns:
            PyTorch ExternalStream, or None for an unknown name or before init().
        """
        return self._torch_streams.get(name)

    def create_events(self, num_events: int = 2):
        """
        Create double-buffered CUDA events for pack and unpack operations.

        Args:
            num_events: Number of events to create for each type
                (default: 2 for double buffering)

        Returns:
            tuple: (pack_events, unpack_events) lists of torch.cuda.Event
        """
        pack_events = [torch.cuda.Event(enable_timing=False) for _ in range(num_events)]
        unpack_events = [torch.cuda.Event(enable_timing=False) for _ in range(num_events)]
        return pack_events, unpack_events

    def finalize(self) -> None:
        """Cleanup resources (streams are automatically managed by CUDA)."""
        self.initialized = False
        self.my_pe = -1
        self.n_pes = -1
        # Streams are automatically cleaned up when objects are deleted


# ---- megatron/core/resharding/nvshmem_copy_service/core/kernel_launcher.py ----
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# CUDA kernel management and launching for pack/unpack operations.
# Handles kernel compilation, launching, and stream coordination.

import os
from typing import Any, Tuple

try:
    import cupy as cp

    HAVE_CUPY = True
except ImportError:
    HAVE_CUPY = False

import torch.cuda.nvtx as nvtx


class KernelLauncher:
    """Manages CUDA kernel loading and launching for data pack/unpack operations."""

    # Launch configuration shared by the pack and unpack kernels.
    THREADS_PER_BLOCK = 1024
    NUM_BLOCKS = 75

    def __init__(self):
        self.chunked_copy_kernel = None
        # Cached CuPy stream wrappers for efficient kernel launching
        self.cp_pack_stream = None
        self.cp_unpack_stream = None

    @staticmethod
    def _require_cupy() -> None:
        """Raise a clear error when cupy could not be imported."""
        if not HAVE_CUPY:
            raise RuntimeError("cupy is not available. Please install cupy to use KernelLauncher.")

    def load_kernels(self) -> None:
        """Load and compile CUDA kernels from source.

        Raises:
            RuntimeError: if cupy is not installed.
        """
        self._require_cupy()

        current_dir = os.path.dirname(os.path.abspath(__file__))
        kernel_path = os.path.join(current_dir, "..", "kernels", "chunked_kernel.cu")

        with open(kernel_path, "r", encoding="utf-8") as f:
            kernel_source = f.read()

        self.chunked_copy_kernel = cp.RawKernel(
            kernel_source, "chunked_batched_copy_kernel", options=("-std=c++11",)
        )

    def set_streams(self, pack_stream, unpack_stream) -> None:
        """
        Cache CuPy stream wrappers for kernel launching.

        This eliminates per-launch overhead of stream pointer extraction
        and CuPy ExternalStream creation.

        Args:
            pack_stream: CUDA stream for pack operations
            unpack_stream: CUDA stream for unpack operations

        Raises:
            RuntimeError: if cupy is not installed.
        """
        self._require_cupy()
        _, pack_stream_ptr = pack_stream.__cuda_stream__()
        _, unpack_stream_ptr = unpack_stream.__cuda_stream__()
        self.cp_pack_stream = cp.cuda.ExternalStream(pack_stream_ptr)
        self.cp_unpack_stream = cp.cuda.ExternalStream(unpack_stream_ptr)

    def _launch_copy(
        self,
        range_name: str,
        gpu_plan: Tuple[Any, Any, Any, int],
        cp_stream,
        torch_stream: torch.cuda.ExternalStream,
        event: torch.cuda.Event,
    ) -> None:
        """Launch the chunked copy kernel on ``cp_stream`` and record ``event``.

        Nothing is launched and the event is not recorded when ``gpu_plan``
        is empty.  The NVTX range is closed even if the launch raises.
        """
        nvtx.range_push(range_name)
        try:
            if not gpu_plan:
                return

            # Unpack cached CuPy arrays from gpu_plan
            cp_src, cp_dst, cp_sizes, num_chunks = gpu_plan

            # Launch kernel using cached CuPy stream
            assert self.chunked_copy_kernel is not None
            assert cp_stream is not None
            self.chunked_copy_kernel(
                (self.NUM_BLOCKS,),
                (self.THREADS_PER_BLOCK,),
                (cp_src, cp_dst, cp_sizes, num_chunks),
                stream=cp_stream,
            )
        finally:
            nvtx.range_pop()
        # Record event on PyTorch stream
        event.record(stream=torch_stream)

    def launch_pack(
        self,
        gpu_plan: Tuple[Any, Any, Any, int],
        pack_stream,
        torch_pack_stream: torch.cuda.ExternalStream,
        pack_event: torch.cuda.Event,
    ) -> None:
        """
        Launch pack kernel to copy data from user tensors to send buffer.

        Args:
            gpu_plan: Tuple of (cp_src_addrs, cp_dst_addrs, cp_sizes, num_chunks)
                as CuPy arrays
            pack_stream: CUDA stream (cuda.core.experimental.Stream) - unused,
                kept for compatibility
            torch_pack_stream: PyTorch external stream wrapper
            pack_event: CUDA event to record after kernel launch
        """
        self._launch_copy(
            "Launch Pack Kernel", gpu_plan, self.cp_pack_stream, torch_pack_stream, pack_event
        )

    def launch_unpack(
        self,
        gpu_plan: Tuple[Any, Any, Any, int],
        unpack_stream,
        torch_unpack_stream: torch.cuda.ExternalStream,
        unpack_event: torch.cuda.Event,
    ) -> None:
        """
        Launch unpack kernel to copy data from receive buffer to user tensors.

        Args:
            gpu_plan: Tuple of (cp_src_addrs, cp_dst_addrs, cp_sizes, num_chunks)
                as CuPy arrays
            unpack_stream: CUDA stream (cuda.core.experimental.Stream) - unused,
                kept for compatibility
            torch_unpack_stream: PyTorch external stream wrapper
            unpack_event: CUDA event to record after kernel launch
        """
        self._launch_copy(
            "Launch Unpack Kernel",
            gpu_plan,
            self.cp_unpack_stream,
            torch_unpack_stream,
            unpack_event,
        )
from typing import Dict, List, Optional

try:
    import nvshmem.core

    HAVE_NVSHMEM = True
except ImportError:
    HAVE_NVSHMEM = False

import torch

from ..logger import PELogger
from ..memory.double_buffer_manager import DoubleBufferManager
from ..nvshmem_types import ReceiveRequest, ScheduledBatch, SendRequest
from .kernel_launcher import KernelLauncher


class PipelineExecutor:
    """Executes pipelined NVSHMEM communication with pack/send/unpack overlap."""

    def __init__(
        self, kernel_launcher: KernelLauncher, buffer_manager: DoubleBufferManager, my_pe: int
    ):
        """
        Initialize pipeline executor.

        Args:
            kernel_launcher: KernelLauncher instance for pack/unpack kernels
            buffer_manager: DoubleBufferManager for send/recv buffers
            my_pe: This PE's rank
        """
        self.kernel_launcher = kernel_launcher
        self.buffer_manager = buffer_manager
        self.my_pe = my_pe

        # Streams (will be set by service)
        self.pack_stream = None
        self.unpack_stream = None
        self.send_stream = None
        self.copy_stream = None

        self.torch_pack_stream = None
        self.torch_unpack_stream = None
        self.torch_copy_stream = None

        # Events for double-buffered synchronization
        self.pack_events = []
        self.unpack_events = []

    def set_streams(
        self,
        pack_stream,
        unpack_stream,
        send_stream,
        copy_stream,
        torch_pack_stream,
        torch_unpack_stream,
        torch_copy_stream,
    ):
        """Set CUDA streams for execution."""
        self.pack_stream = pack_stream
        self.unpack_stream = unpack_stream
        self.send_stream = send_stream
        self.copy_stream = copy_stream

        self.torch_pack_stream = torch_pack_stream
        self.torch_unpack_stream = torch_unpack_stream
        self.torch_copy_stream = torch_copy_stream

    def set_events(self, pack_events: List, unpack_events: List):
        """Set double-buffered CUDA events."""
        self.pack_events = pack_events
        self.unpack_events = unpack_events

    def execute_pipeline(
        self, iter_schedules: List[Dict[str, Optional[ScheduledBatch]]], num_iterations: int
    ) -> None:
        """
        Execute pipelined communication.

        Pipeline stages:
        1. Pack NEXT iteration (async)
        2. Unpack PRIOR iteration (async)
        3. Send CURRENT iteration (sync)
        4. Barrier
        5. Wait for async pack/unpack to complete

        Args:
            iter_schedules: List of iteration schedules; each entry has a
                "send" and a "recv" batch (either may be None)
            num_iterations: Total number of iterations

        Raises:
            ValueError: if fewer schedules than ``num_iterations`` are given.
            RuntimeError: if there is work to do but nvshmem is not installed.
        """
        # Fail before any communication is issued rather than part-way
        # through, where the other PEs would already be inside the barrier.
        if num_iterations > len(iter_schedules):
            raise ValueError(
                f"num_iterations={num_iterations} exceeds the number of "
                f"iteration schedules ({len(iter_schedules)})"
            )
        if num_iterations > 0 and not HAVE_NVSHMEM:
            raise RuntimeError(
                "nvshmem.core is not available. Please install nvshmem to use PipelineExecutor."
            )

        PELogger.info(f"Executing pipeline: {num_iterations} iterations")

        # Priming: Pack iteration 0 and WAIT for completion
        if num_iterations > 0 and iter_schedules[0]["send"]:
            torch.cuda.nvtx.range_push("Priming")
            PELogger.debug("Priming: Packing iteration 0")
            self._launch_pack(0, iter_schedules[0]["send"])
            self.pack_events[0].synchronize()
            torch.cuda.nvtx.range_pop()

        for i in range(num_iterations):
            torch.cuda.nvtx.range_push(f"Iteration {i}")

            send_batch = iter_schedules[i]["send"]
            recv_batch = iter_schedules[i]["recv"]
            next_send = iter_schedules[i + 1]["send"] if i + 1 < num_iterations else None
            prior_recv = iter_schedules[i - 1]["recv"] if i > 0 else None

            # Double buffering: iteration i uses buffer/event slot i % 2.
            slot = i % 2

            # Log iteration start
            send_info = (
                f" → PE {send_batch.dest_pe} " f"({send_batch.total_size} bytes)"
                if send_batch is not None
                else ""
            )
            recv_info = (
                f" ← PE {recv_batch.src_pe} " f"({recv_batch.total_size} bytes)"
                if recv_batch is not None
                else ""
            )
            PELogger.debug(f"Iteration {i}/{num_iterations}: slot={slot}{send_info}{recv_info}")

            # Step 1: Pack NEXT iteration (async)
            if next_send is not None:
                torch.cuda.nvtx.range_push("Step 1: Pack Next")
                PELogger.debug(
                    f" Pack next (iter {i+1}): {len(next_send.tasks)} tasks "
                    f"→ PE {next_send.dest_pe}"
                )
                self._launch_pack(i + 1, next_send)
                torch.cuda.nvtx.range_pop()

            # Step 2: Unpack PRIOR iteration (async)
            if prior_recv is not None:
                torch.cuda.nvtx.range_push("Step 2: Unpack Prior")
                PELogger.debug(
                    f" Unpack prior (iter {i-1}): {prior_recv.total_size} bytes "
                    f"← PE {prior_recv.src_pe}"
                )
                self._launch_unpack(i - 1, prior_recv)
                torch.cuda.nvtx.range_pop()

            # Step 3: Send CURRENT iteration
            if send_batch is not None:
                torch.cuda.nvtx.range_push("Step 3: Send Current")
                transfer_size = send_batch.total_size
                PELogger.debug(f" Send current: {transfer_size} bytes → PE {send_batch.dest_pe}")

                nvshmem.core.put(
                    self.buffer_manager.recv_slots[slot][0:transfer_size],
                    self.buffer_manager.send_slots[slot][0:transfer_size],
                    send_batch.dest_pe,
                    stream=self.send_stream,
                )
                torch.cuda.nvtx.range_pop()

                # Ensure send completes
                self.send_stream.sync()
                nvshmem.core.quiet(stream=self.send_stream)

            # Step 4: Global barrier
            torch.cuda.nvtx.range_push("Step 4: Barrier")
            nvshmem.core.barrier_all(stream=self.send_stream)
            self.send_stream.sync()
            torch.cuda.nvtx.range_pop()

            # Step 5: Wait for async pack/unpack to complete
            torch.cuda.nvtx.range_push("Step 5: Wait Async")
            if prior_recv is not None:
                self.unpack_events[(i - 1) % 2].synchronize()
            if next_send is not None:
                self.pack_events[(i + 1) % 2].synchronize()
            torch.cuda.nvtx.range_pop()

            torch.cuda.nvtx.range_pop()

        # Final unpack for last iteration
        if num_iterations > 0 and iter_schedules[num_iterations - 1]["recv"]:
            torch.cuda.nvtx.range_push("Final Unpack")
            PELogger.debug(f"Final unpack: iteration {num_iterations-1}")
            last_recv = iter_schedules[num_iterations - 1]["recv"]
            assert last_recv is not None
            self._launch_unpack(num_iterations - 1, last_recv)
            self.unpack_events[(num_iterations - 1) % 2].synchronize()
            torch.cuda.nvtx.range_pop()

        PELogger.info(f"Pipeline complete: {num_iterations} iterations")

    def _launch_pack(self, iteration: int, batch: ScheduledBatch) -> None:
        """Launch pack kernel for given iteration."""
        if not batch.gpu_plan:
            return

        self.kernel_launcher.launch_pack(
            batch.gpu_plan,
            self.pack_stream,
            self.torch_pack_stream,
            self.pack_events[iteration % 2],
        )

    def _launch_unpack(self, iteration: int, batch: ScheduledBatch) -> None:
        """Launch unpack kernel for given iteration."""
        if not batch.gpu_plan:
            return

        self.kernel_launcher.launch_unpack(
            batch.gpu_plan,
            self.unpack_stream,
            self.torch_unpack_stream,
            self.unpack_events[iteration % 2],
        )

    def process_self_moves(
        self, send_requests: List[SendRequest], receive_requests: List[ReceiveRequest]
    ) -> None:
        """
        Handle same-PE transfers (where src_pe == dest_pe == my_pe).

        Uses PyTorch copy on the copy stream for efficiency.  A local receive
        with no local send of the same task_id is skipped.

        Args:
            send_requests: List of send requests
            receive_requests: List of receive requests
        """
        # Match send/recv requests where src_pe == dest_pe == my_pe
        local_sends = {r.task_id: r for r in send_requests if r.dest_pe == self.my_pe}
        local_recvs = [r for r in receive_requests if r.src_pe == self.my_pe]

        if local_recvs:
            PELogger.debug(f"Processing {len(local_recvs)} self-moves")

        num_processed = 0
        with torch.cuda.stream(self.torch_copy_stream):
            for recv_req in local_recvs:
                send_req = local_sends.get(recv_req.task_id)
                if send_req is None:
                    continue

                # Pre-formatted message, matching every other PELogger call
                # in this class.
                PELogger.debug(
                    f" Self-move: task_id={recv_req.task_id}, size={send_req.size} bytes"
                )

                # Create views of the tensors with offsets; the send size is
                # used for both ends of the copy.
                src_view = send_req.src_tensor[send_req.src_pos : send_req.src_pos + send_req.size]
                dest_view = recv_req.dest_tensor[
                    recv_req.dest_pos : recv_req.dest_pos + send_req.size
                ]

                # Async copy on the copy stream
                dest_view.copy_(src_view, non_blocking=True)
                num_processed += 1

        # Synchronize the PyTorch stream
        self.torch_copy_stream.synchronize()

        if num_processed > 0:
            PELogger.info(f"Self-moves complete: {num_processed} transfers")
b/megatron/core/resharding/nvshmem_copy_service/kernels/chunked_kernel.cu new file mode 100644 index 00000000000..e5b8fcc9a85 --- /dev/null +++ b/megatron/core/resharding/nvshmem_copy_service/kernels/chunked_kernel.cu @@ -0,0 +1,103 @@ + +#include + +// CUDA-compatible types (no C++ standard library headers for NVRTC) +typedef unsigned char uint8_t; +typedef unsigned long long uint64_t; +typedef uint64_t uintptr_t; + +// ============================================================================ +// Kernel Configuration Constants (from ChunkedKernel.h) +// ============================================================================ + +constexpr int CHUNK_SIZE = 128 * 1024; // 128KB per chunk +constexpr int NUM_BLOCKS = 75; // Fixed grid size +constexpr int THREADS_PER_BLOCK = 1024; // Fixed block size +constexpr int FLOAT4_SIZE = 16; // 16 bytes per float4 +constexpr int MAX_CHUNKS_PER_BLOCK = 512; // Max chunks per block for shared memory + +extern "C" { + +/** + * Chunked batched copy kernel implementation + * + * This kernel performs efficient batched memory copies using: + * 1. Contiguous block assignment for better load balancing + * 2. Shared memory prefetching of chunk metadata + * 3. Vectorized float4 (16-byte) copies for aligned data + * 4. 
Byte-by-byte fallback for unaligned or small data + */ +__global__ void chunked_batched_copy_kernel( + uint8_t** src_addrs, + uint8_t** dst_addrs, + size_t* sizes, + int total_chunks +) { + // Shared memory for metadata prefetching + __shared__ uint8_t* s_src_addrs[MAX_CHUNKS_PER_BLOCK]; + __shared__ uint8_t* s_dst_addrs[MAX_CHUNKS_PER_BLOCK]; + __shared__ size_t s_sizes[MAX_CHUNKS_PER_BLOCK]; + + // Contiguous block assignment: block i processes chunks [start_chunk, end_chunk) + int chunks_per_block = (total_chunks + gridDim.x - 1) / gridDim.x; // Ceiling division + int start_chunk = blockIdx.x * chunks_per_block; + int end_chunk = start_chunk + chunks_per_block; + if (end_chunk > total_chunks) { + end_chunk = total_chunks; + } + int num_chunks_this_block = end_chunk - start_chunk; + + // Phase 1: Cooperative loading of metadata to shared memory + // All 1024 threads cooperate to load metadata from global memory + for (int i = threadIdx.x; i < num_chunks_this_block; i += blockDim.x) { + int global_chunk_id = start_chunk + i; + s_src_addrs[i] = src_addrs[global_chunk_id]; + s_dst_addrs[i] = dst_addrs[global_chunk_id]; + s_sizes[i] = sizes[global_chunk_id]; + } + __syncthreads(); + + // Phase 2: Process each chunk using metadata from shared memory + for (int chunk_id = 0; chunk_id < num_chunks_this_block; chunk_id++) { + uint8_t* src = s_src_addrs[chunk_id]; + uint8_t* dst = s_dst_addrs[chunk_id]; + size_t size = s_sizes[chunk_id]; + + // Check if both src and dst are aligned to 16 bytes for float4 access + uintptr_t src_addr = (uintptr_t)src; + uintptr_t dst_addr = (uintptr_t)dst; + bool is_aligned = ((src_addr % FLOAT4_SIZE) == 0) && ((dst_addr % FLOAT4_SIZE) == 0); + + if (is_aligned && size >= FLOAT4_SIZE) { + // Fast path: vectorized float4 copies + size_t aligned_size = (size / FLOAT4_SIZE) * FLOAT4_SIZE; + + // All 1024 threads cooperate on float4 copies + #pragma unroll 4 + for (size_t offset = threadIdx.x * FLOAT4_SIZE; + offset < aligned_size; + offset += 
blockDim.x * FLOAT4_SIZE) { + // Vectorized 16-byte load and store + float4 data = *((float4*)(src + offset)); + *((float4*)(dst + offset)) = data; + } + + // Handle remaining bytes (< 16 bytes) with byte-by-byte copy + for (size_t offset = aligned_size + threadIdx.x; + offset < size; + offset += blockDim.x) { + dst[offset] = src[offset]; + } + } else { + // Fallback path: byte-by-byte copy for unaligned addresses + // Still use all threads for parallelism + for (size_t offset = threadIdx.x; offset < size; offset += blockDim.x) { + dst[offset] = src[offset]; + } + } + } +} + +} + + diff --git a/megatron/core/resharding/nvshmem_copy_service/logger.py b/megatron/core/resharding/nvshmem_copy_service/logger.py new file mode 100644 index 00000000000..a3c7c1699ad --- /dev/null +++ b/megatron/core/resharding/nvshmem_copy_service/logger.py @@ -0,0 +1,209 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +""" + +Per-PE Logger with colored console and file output. + + + +Similar to the C++ Logger implementation, provides: + +- Per-PE colored console output + +- Per-PE file logging + +- Support for TRACE, DEBUG, INFO, SUMMARY, WARN, ERROR levels + +""" + +import logging +import os +from datetime import datetime +from typing import Optional + + +class ColoredFormatter(logging.Formatter): + """Custom formatter that adds color codes for console output.""" + + def __init__(self, fmt: str, pe_id: int, use_color: bool = True): + super().__init__(fmt) + self.pe_id = pe_id + self.use_color = use_color + + # ANSI color codes matching C++ implementation + self.colors = { + 0: "\033[31m", # Red + 1: "\033[32m", # Green + 2: "\033[33m", # Yellow + 3: "\033[34m", # Blue + 4: "\033[35m", # Magenta + 5: "\033[36m", # Cyan + 6: "\033[91m", # Bright Red + 7: "\033[92m", # Bright Green + } + self.reset = "\033[0m" + + def formatTime(self, record, datefmt=None): + ct = self.converter(record.created) + if datefmt: + s = datetime.fromtimestamp(record.created).strftime(datefmt) 
+ # For file logs, replace %f with milliseconds + if "%f" in datefmt: + s = s.replace("%f", f"{int(record.msecs):03d}") + else: + s = datetime.fromtimestamp(record.created).strftime("%H:%M:%S") + s = f"{s}.{int(record.msecs):03d}" + return s + + def format(self, record): + # Save original message + original_msg = record.msg + + if self.use_color and self.pe_id >= 0: + color = self.colors.get(self.pe_id, "\033[37m") # White for others + record.msg = f"{color}{record.msg}{self.reset}" + + result = super().format(record) + + # Restore original message for other handlers + record.msg = original_msg + + return result + + +class PELogger: + """Per-PE logger with colored console and file output.""" + + _logger: Optional[logging.Logger] = None + _pe_id: int = -1 + _level: int = logging.INFO + + @classmethod + def init(cls, pe_id: int, level: str = "INFO", logs_dir: str = "logs"): + """ + Initialize logger for this PE. + + Args: + pe_id: Process element ID + level: Log level (TRACE, DEBUG, INFO, WARN, ERROR) + logs_dir: Directory for log files + """ + cls._pe_id = pe_id + + # Convert level string to logging level + level_map = { + "TRACE": logging.DEBUG - 5, # Custom level below DEBUG + "DEBUG": logging.DEBUG, + "INFO": logging.INFO, + "SUMMARY": logging.INFO, + "WARN": logging.WARNING, + "WARNING": logging.WARNING, + "ERROR": logging.ERROR, + "CRITICAL": logging.CRITICAL, + } + cls._level = level_map.get(level.upper(), logging.INFO) + + # Create logs directory if it doesn't exist + os.makedirs(logs_dir, exist_ok=True) + + # Create logger + logger_name = f"PE_{pe_id}" + cls._logger = logging.getLogger(logger_name) + cls._logger.setLevel(cls._level) + cls._logger.propagate = False + + # Remove existing handlers to avoid duplicates + cls._logger.handlers.clear() + + # 1. 
Console handler with color + console_handler = logging.StreamHandler() + console_handler.setLevel(cls._level) + console_format = "[PE %d] [%%(asctime)s] [%%(levelname)s] %%(message)s" % pe_id + console_formatter = ColoredFormatter(console_format, pe_id, use_color=True) + console_handler.setFormatter(console_formatter) + cls._logger.addHandler(console_handler) + + # 2. File handler without color + log_filename = os.path.join(logs_dir, f"pe_{pe_id}.log") + file_handler = logging.FileHandler(log_filename, mode="w") + file_handler.setLevel(cls._level) + file_format = "[PE %d] [%%(asctime)s] [%%(levelname)s] %%(message)s" % pe_id + file_formatter = ColoredFormatter(file_format, pe_id, use_color=False) + file_handler.setFormatter(file_formatter) + cls._logger.addHandler(file_handler) + + @classmethod + def set_level(cls, level: str): + """Set the logging level.""" + level_map = { + "TRACE": logging.DEBUG - 5, + "DEBUG": logging.DEBUG, + "INFO": logging.INFO, + "SUMMARY": logging.INFO, + "WARN": logging.WARNING, + "WARNING": logging.WARNING, + "ERROR": logging.ERROR, + "CRITICAL": logging.CRITICAL, + } + cls._level = level_map.get(level.upper(), logging.INFO) + if cls._logger: + cls._logger.setLevel(cls._level) + for handler in cls._logger.handlers: + handler.setLevel(cls._level) + + @classmethod + def trace(cls, msg: str): + """Log at TRACE level (most detailed).""" + if cls._logger: + cls._logger.log(logging.DEBUG - 5, msg) + + @classmethod + def debug(cls, msg: str): + """Log at DEBUG level.""" + if cls._logger: + cls._logger.debug(msg) + + @classmethod + def info(cls, msg: str): + """Log at INFO level.""" + if cls._logger: + cls._logger.info(msg) + + @classmethod + def summary(cls, msg: str): + """Log summary information (INFO level with [SUMMARY] prefix).""" + if cls._logger: + cls._logger.info(f"[SUMMARY] {msg}") + + @classmethod + def warn(cls, msg: str): + """Log at WARNING level.""" + if cls._logger: + cls._logger.warning(msg) + + @classmethod + def warning(cls, 
msg: str): + """Log at WARNING level (alias for warn).""" + cls.warn(msg) + + @classmethod + def error(cls, msg: str): + """Log at ERROR level.""" + if cls._logger: + cls._logger.error(msg) + + @classmethod + def critical(cls, msg: str): + """Log at CRITICAL level.""" + if cls._logger: + cls._logger.critical(msg) + + @classmethod + def shutdown(cls): + """Shutdown the logger and flush all handlers.""" + if cls._logger: + for handler in cls._logger.handlers: + handler.flush() + handler.close() + cls._logger.handlers.clear() + cls._logger = None diff --git a/megatron/core/resharding/nvshmem_copy_service/memory/__init__.py b/megatron/core/resharding/nvshmem_copy_service/memory/__init__.py new file mode 100644 index 00000000000..5cd8aac704b --- /dev/null +++ b/megatron/core/resharding/nvshmem_copy_service/memory/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Memory management utilities for NVSHMEM operations.""" + +from .double_buffer_manager import DoubleBufferManager +from .tensor_pointer_utils import TensorPointerExtractor + +__all__ = ["DoubleBufferManager", "TensorPointerExtractor"] diff --git a/megatron/core/resharding/nvshmem_copy_service/memory/double_buffer_manager.py b/megatron/core/resharding/nvshmem_copy_service/memory/double_buffer_manager.py new file mode 100644 index 00000000000..079b2c17610 --- /dev/null +++ b/megatron/core/resharding/nvshmem_copy_service/memory/double_buffer_manager.py @@ -0,0 +1,86 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +""" +Double buffer management for NVSHMEM symmetric memory. + +Manages send and receive buffers with double-buffering for pipelined communication. 
+""" + +try: + import nvshmem.core.interop.torch + + HAVE_NVSHMEM = True +except ImportError: + HAVE_NVSHMEM = False + +import torch + +from ..nvshmem_types import MAX_SEGMENT_SIZE + + +class DoubleBufferManager: + """Manages double-buffered NVSHMEM symmetric buffers for send/receive operations.""" + + def __init__(self, slot_size: int = MAX_SEGMENT_SIZE): + """ + Initialize buffer manager. + + Args: + slot_size: Size of each buffer slot in bytes (default: 256MB) + """ + self.slot_size = slot_size + self.send_slots = [None, None] + self.recv_slots = [None, None] + + def allocate(self) -> None: + """Allocate NVSHMEM symmetric buffers for double-buffering.""" + if not HAVE_NVSHMEM: + raise RuntimeError( + "nvshmem.core.interop.torch is not available. " + "Please install nvshmem to use DoubleBufferManager." + ) + + for i in range(2): + self.send_slots[i] = nvshmem.core.interop.torch.bytetensor( + (self.slot_size,), dtype=torch.uint8 + ) + self.recv_slots[i] = nvshmem.core.interop.torch.bytetensor( + (self.slot_size,), dtype=torch.uint8 + ) + # Zero out buffers + self.send_slots[i].zero_() + self.recv_slots[i].zero_() + + def get_send_slot(self, iteration: int): + """ + Get send buffer for given iteration. + + Args: + iteration: Iteration number + + Returns: + NVSHMEM tensor for sending + """ + return self.send_slots[iteration % 2] + + def get_recv_slot(self, iteration: int): + """ + Get receive buffer for given iteration. 
+ + Args: + iteration: Iteration number + + Returns: + NVSHMEM tensor for receiving + """ + return self.recv_slots[iteration % 2] + + def free(self) -> None: + """Free NVSHMEM symmetric buffers.""" + for i in range(2): + if self.send_slots[i] is not None: + nvshmem.core.interop.torch.free_tensor(self.send_slots[i]) + self.send_slots[i] = None + if self.recv_slots[i] is not None: + nvshmem.core.interop.torch.free_tensor(self.recv_slots[i]) + self.recv_slots[i] = None diff --git a/megatron/core/resharding/nvshmem_copy_service/memory/tensor_pointer_utils.py b/megatron/core/resharding/nvshmem_copy_service/memory/tensor_pointer_utils.py new file mode 100644 index 00000000000..ee250618ee7 --- /dev/null +++ b/megatron/core/resharding/nvshmem_copy_service/memory/tensor_pointer_utils.py @@ -0,0 +1,45 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +""" +Utilities for extracting data pointers from different tensor types. + +Supports PyTorch tensors, CuPy arrays, and raw integer pointers. +""" + +from typing import Any + +import torch + + +class TensorPointerExtractor: + """Extract memory pointers from various tensor types.""" + + @staticmethod + def get_pointer(tensor: Any) -> int: + """ + Extract the data pointer from a tensor. 
+ + Args: + tensor: Can be torch.Tensor, CuPy array, or raw int pointer + + Returns: + int: Memory address of the tensor data + + Examples: + + >>> import torch + + >>> t = torch.zeros(100, device='cuda') + + >>> ptr = TensorPointerExtractor.get_pointer(t) + + >>> isinstance(ptr, int) + + True + """ + if isinstance(tensor, torch.Tensor): + return tensor.data_ptr() + elif hasattr(tensor, "data"): # CuPy array + return tensor.data.ptr + else: # Assume raw integer pointer + return tensor diff --git a/megatron/core/resharding/nvshmem_copy_service/nvshmem_types.py b/megatron/core/resharding/nvshmem_copy_service/nvshmem_types.py new file mode 100644 index 00000000000..731cace0502 --- /dev/null +++ b/megatron/core/resharding/nvshmem_copy_service/nvshmem_types.py @@ -0,0 +1,73 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from dataclasses import dataclass, field +from typing import Any, List + +# Constants +MAX_SEGMENT_SIZE = 256 * 1024 * 1024 # 256MB +MAX_TASKS_PER_BATCH = 10000 + + +@dataclass +class SendRequest: + """Container for a send operation request.""" + + task_id: int + src_tensor: Any # cupy.ndarray or pointer + src_pos: int + size: int + dest_pe: int + + +@dataclass +class ReceiveRequest: + """Container for a receive operation request.""" + + task_id: int + dest_tensor: Any # cupy.ndarray or pointer + dest_pos: int + size: int + src_pe: int + + +@dataclass +class WorkloadGroup: + """Container for a group of send requests to a specific destination PE.""" + + dest_pe: int + tasks: List[SendRequest] = field(default_factory=list) + total_size: int = 0 + + +@dataclass +class ScheduledBatch: + """Metadata for a scheduled communication batch.""" + + src_pe: int + dest_pe: int + batch_index: int + iteration: int + # Metadata for GPU execution + gpu_plan: Any = None # Placeholder for GPU-resident plan + tasks: List[SendRequest] = field(default_factory=list) + total_size: int = 0 + tasks_summary: Any = None # WorkloadSummary + + +@dataclass 
+class WorkloadSummary: + """Summary of a workload group for communication with other PEs.""" + + total_size: int + task_ids: List[int] + task_sizes: List[int] + + +@dataclass +class TransferMetadata: + """GPU-resident metadata for communication tasks.""" + + ptrs: Any # cupy array of uint64 (pointers) + sizes: Any # cupy array of uint64 (sizes) + num_tasks: int + total_size: int diff --git a/megatron/core/resharding/nvshmem_copy_service/planning/__init__.py b/megatron/core/resharding/nvshmem_copy_service/planning/__init__.py new file mode 100644 index 00000000000..9df0b3ac318 --- /dev/null +++ b/megatron/core/resharding/nvshmem_copy_service/planning/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Planning components for task segmentation, workload packing, and scheduling.""" + +from .communication_scheduler import CommunicationScheduler +from .gpu_execution_planner import GPUExecutionPlanner +from .task_segmenter import TaskSegmenter +from .workload_packer import WorkloadPacker + +__all__ = ["CommunicationScheduler", "GPUExecutionPlanner", "TaskSegmenter", "WorkloadPacker"] diff --git a/megatron/core/resharding/nvshmem_copy_service/planning/communication_scheduler.py b/megatron/core/resharding/nvshmem_copy_service/planning/communication_scheduler.py new file mode 100644 index 00000000000..0f299a84e40 --- /dev/null +++ b/megatron/core/resharding/nvshmem_copy_service/planning/communication_scheduler.py @@ -0,0 +1,181 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from typing import Dict, List, Tuple + +from ..logger import PELogger +from ..nvshmem_types import ScheduledBatch, WorkloadGroup, WorkloadSummary + + +class CommunicationScheduler: + """ + Builds a conflict-free, iteration-based schedule for communication. + Ensures that in any given iteration, a PE is not overloaded. 
+ """ + + def __init__(self): + self.num_iterations = 0 + + def build_schedule( + self, workloads: Dict[int, List[WorkloadGroup]], my_pe: int, n_pes: int + ) -> Tuple[Dict[int, List[ScheduledBatch]], Dict[Tuple[int, int, int], WorkloadSummary]]: + """ + Main scheduling method. + 1. Exchanges workload info with other PEs. + 2. Assigns batches to iterations. + 3. Returns: + - local schedule (iteration -> list of batches) + - global workload summaries (key: (src, dest, batch_idx) -> summary) + """ + total_local_batches = sum(len(groups) for groups in workloads.values()) + PELogger.info(f"Building schedule: {total_local_batches} local batches, {n_pes} PEs") + + # Step 1: Collect all batches across all PE pairs + PELogger.debug("Collecting batches from all PEs...") + all_batches = self._collect_all_batches(workloads, my_pe, n_pes) + PELogger.debug(f"Collected {len(all_batches)} total batches globally") + + # Step 2: Assign batches to iterations using conflict-free algorithm + PELogger.debug("Assigning batches to iterations...") + self._assign_iterations(all_batches) + PELogger.info(f"Schedule built: {self.num_iterations} iterations") + + # Step 3: Exchange detailed workload summaries (Task IDs/Sizes) + # This is needed for receivers to know what tasks are in each batch + PELogger.debug("Exchanging workload summaries...") + global_summaries = self._exchange_workload_summaries(workloads, my_pe, n_pes) + PELogger.debug(f"Exchanged {len(global_summaries)} workload summaries") + + # Step 4: Build schedule map for this PE + my_batches = [b for b in all_batches if b.src_pe == my_pe or b.dest_pe == my_pe] + my_batches.sort(key=lambda x: x.iteration) + + final_schedule: Dict[int, List[ScheduledBatch]] = {} + for b in my_batches: + final_schedule.setdefault(b.iteration, []).append(b) + + return final_schedule, global_summaries + + def _collect_all_batches( + self, workloads: Dict[int, List[WorkloadGroup]], my_pe: int, n_pes: int + ) -> List[ScheduledBatch]: + """ + Exchanges 
batch counts and details with all PEs to build a global view. + Uses torch.distributed for reliable communication. + """ + import torch.distributed as dist + + # Build local batch list + local_batches: List[Tuple[int, int, int]] = [] + for dest_pe, groups in workloads.items(): + if dest_pe == my_pe: + continue + for i, _ in enumerate(groups): + local_batches.append((my_pe, dest_pe, i)) # (src, dest, batch_idx) + + PELogger.debug(f" Local batch count: {len(local_batches)}") + PELogger.debug(f" Local batches: {local_batches}") + + # Gather all batches from all PEs using torch.distributed + all_batches_list: List[List[Tuple[int, int, int]] | None] = [None] * n_pes + dist.all_gather_object(all_batches_list, local_batches) + + # Flatten into global batch list + global_batches: List[ScheduledBatch] = [] + for pe_batches in all_batches_list: + if pe_batches is None: + continue + for src, dest, idx in pe_batches: + global_batches.append( + ScheduledBatch(src_pe=src, dest_pe=dest, batch_index=idx, iteration=-1) + ) + + PELogger.debug(f" Global batches collected: {len(global_batches)} total") + + # Group by source for readability + batches_by_src: Dict[int, List[Tuple[int, int]]] = {} + for b in global_batches: + batches_by_src.setdefault(b.src_pe, []).append((b.dest_pe, b.batch_index)) + for src_pe in sorted(batches_by_src.keys()): + PELogger.debug(f" PE {src_pe} sends to: {batches_by_src[src_pe]}") + + return global_batches + + def _assign_iterations(self, batches: List[ScheduledBatch]): + self.num_iterations = 0 + batches.sort(key=lambda x: (x.src_pe, x.dest_pe, x.batch_index)) + + for batch in batches: + iteration = 0 + assigned = False + while not assigned: + if not self._has_conflict(batch, iteration, batches): + batch.iteration = iteration + self.num_iterations = max(self.num_iterations, iteration + 1) + assigned = True + PELogger.debug( + f" Assigned batch ({batch.src_pe} → {batch.dest_pe}, " + f"idx={batch.batch_index}) to iteration {iteration}" + ) + else: + 
iteration += 1 + + def _has_conflict( + self, batch: ScheduledBatch, iteration: int, all_batches: List[ScheduledBatch] + ) -> bool: + for other in all_batches: + if other.iteration == iteration and other is not batch: + if other.src_pe == batch.src_pe or other.dest_pe == batch.dest_pe: + return True + return False + + def _exchange_workload_summaries( + self, workloads: Dict[int, List[WorkloadGroup]], my_pe: int, n_pes: int + ) -> Dict[Tuple[int, int, int], WorkloadSummary]: + """ + Exchange detailed workload content using torch.distributed. + Simple and reliable - no NVSHMEM symmetric memory issues. + """ + import torch.distributed as dist + + # Build local summaries as a simple dict: + # (src, dest, batch_idx) -> {total_size, task_ids, task_sizes} + local_summaries: Dict[Tuple[int, int, int], Dict[str, object]] = {} + batch_count = 0 + total_tasks = 0 + + for dest_pe, groups in workloads.items(): + if dest_pe == my_pe: + continue + for batch_idx, group in enumerate(groups): + key = (my_pe, dest_pe, batch_idx) + local_summaries[key] = { + "total_size": group.total_size, + "task_ids": [t.task_id for t in group.tasks], + "task_sizes": [t.size for t in group.tasks], + } + batch_count += 1 + total_tasks += len(group.tasks) + + PELogger.debug(f" Local summaries: {batch_count} batches, {total_tasks} tasks") + + # Gather all summaries from all PEs using torch.distributed + all_summaries_list: List[Dict[Tuple[int, int, int], Dict[str, object]] | None] = [ + None + ] * n_pes + dist.all_gather_object(all_summaries_list, local_summaries) + + # Merge into global map + global_map: Dict[Tuple[int, int, int], WorkloadSummary] = {} + for pe_summaries in all_summaries_list: + if pe_summaries is None: + continue + for key, data in pe_summaries.items(): + summary = WorkloadSummary( + total_size=int(data["total_size"]), + task_ids=list(data["task_ids"]), + task_sizes=list(data["task_sizes"]), + ) + global_map[key] = summary + + PELogger.debug(f" Exchanged {len(global_map)} workload 
summaries") + return global_map diff --git a/megatron/core/resharding/nvshmem_copy_service/planning/gpu_execution_planner.py b/megatron/core/resharding/nvshmem_copy_service/planning/gpu_execution_planner.py new file mode 100644 index 00000000000..68c4d11d7e5 --- /dev/null +++ b/megatron/core/resharding/nvshmem_copy_service/planning/gpu_execution_planner.py @@ -0,0 +1,222 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +""" +GPU execution planning for pack/unpack operations. + +Converts high-level task descriptions into GPU-ready metadata +(pointer arrays, sizes, chunking) for kernel execution. +""" + +from typing import Dict, List, Optional, Tuple + +try: + import cupy as cp + + HAVE_CUPY = True +except ImportError: + HAVE_CUPY = False + +import torch + +from ..logger import PELogger +from ..memory.tensor_pointer_utils import TensorPointerExtractor +from ..nvshmem_types import ReceiveRequest, ScheduledBatch + + +class GPUExecutionPlanner: + """Plans GPU kernel execution by building pointer arrays and metadata.""" + + def __init__(self): + self.tensor_utils = TensorPointerExtractor() + self.CHUNK_SIZE = 128 * 1024 # 128KB chunks + + def create_gpu_plans( + self, + iter_schedules: List[Dict[str, Optional[ScheduledBatch]]], + send_slots: List, + recv_slots: List, + receive_requests: List[ReceiveRequest], + ) -> None: + """ + Build GPU execution plans for all iterations. + + Modifies iter_schedules in-place by adding gpu_plan to each batch. + + Args: + iter_schedules: List of iteration schedules (dicts with 'send' and 'recv') + send_slots: List of send buffer slots + recv_slots: List of receive buffer slots + receive_requests: List of all receive requests for matching + """ + if not HAVE_CUPY: + raise RuntimeError( + "cupy is not available. Please install cupy to use GPUExecutionPlanner." 
+ ) + + PELogger.debug(f"Creating GPU plans for {len(iter_schedules)} iterations") + for i, sched in enumerate(iter_schedules): + send_batch = sched["send"] + if send_batch: + # Build Pack Metadata + ptrs: List[int] = [] + positions: List[int] = [] + sizes: List[int] = [] + + for t in send_batch.tasks: + # Extract pointer from tensor + ptr = self.tensor_utils.get_pointer(t.src_tensor) + ptrs.append(ptr) + positions.append(t.src_pos) + sizes.append(t.size) + + # Plan kernel args for packing + send_batch.gpu_plan = self._plan_kernel_args( + ptrs, positions, sizes, is_pack=True, buffer_base=send_slots[i % 2].data_ptr() + ) + task_ids = [t.task_id for t in send_batch.tasks] + PELogger.debug( + f" Iter {i} send plan: {len(send_batch.tasks)} tasks → " + f"PE {send_batch.dest_pe}, {send_batch.total_size} bytes" + ) + displayed_ids = task_ids[:10] if len(task_ids) <= 10 else task_ids[:10] + ["..."] + PELogger.debug(f" Send task IDs: {displayed_ids}") + + recv_batch = sched["recv"] + if recv_batch: + # Build Unpack Metadata + summary = recv_batch.tasks_summary + + # Skip if no summary available (shouldn't happen in normal operation) + if summary is None: + PELogger.error( + f"Iter {i}: recv batch from PE {recv_batch.src_pe} has no " + "tasks_summary - UNPACK WILL BE SKIPPED!" 
+ ) + recv_batch.gpu_plan = None + continue + + PELogger.debug( + f" Iter {i} recv from PE {recv_batch.src_pe}: " + f"{len(summary.task_ids)} tasks in summary" + ) + + ptrs = [] + positions = [] + sizes = [] + + # Create fast lookup map for receive requests + relevant_reqs: Dict[int, ReceiveRequest] = { + r.task_id: r for r in receive_requests if r.src_pe == recv_batch.src_pe + } + + # Match summary tasks with receive requests + matched_task_ids: List[int] = [] + unmatched_task_ids: List[int] = [] + for t_id, t_size in zip(summary.task_ids, summary.task_sizes): + if t_id in relevant_reqs: + req = relevant_reqs[t_id] + ptr = self.tensor_utils.get_pointer(req.dest_tensor) + ptrs.append(ptr) + positions.append(req.dest_pos) + sizes.append(t_size) # Use sender's size + matched_task_ids.append(t_id) + else: + unmatched_task_ids.append(t_id) + PELogger.error( + f"Iter {i}: Unexpected task {t_id} from PE " + f"{recv_batch.src_pe} - no matching recv request!" + ) + + if unmatched_task_ids: + PELogger.error( + f" Iter {i}: {len(unmatched_task_ids)} unmatched tasks " + f"from PE {recv_batch.src_pe}: {unmatched_task_ids[:10]}" + ) + + # Plan kernel args for unpacking + recv_batch.gpu_plan = self._plan_kernel_args( + ptrs, positions, sizes, is_pack=False, buffer_base=recv_slots[i % 2].data_ptr() + ) + + if recv_batch.gpu_plan is None: + PELogger.error( + f" Iter {i} recv plan: FAILED - no gpu_plan created for " + f"{len(sizes)} tasks from PE {recv_batch.src_pe}" + ) + else: + PELogger.debug( + f" Iter {i} recv plan: {len(sizes)} tasks ← " + f"PE {recv_batch.src_pe}, {recv_batch.total_size} bytes" + ) + displayed_recv_ids = ( + matched_task_ids[:10] + if len(matched_task_ids) <= 10 + else matched_task_ids[:10] + ["..."] + ) + PELogger.debug(f" Recv task IDs: {displayed_recv_ids}") + + def _plan_kernel_args( + self, + ptrs: List[int], + positions: List[int], + sizes: List[int], + is_pack: bool, + buffer_base: int, + ) -> Optional[Tuple[object, object, object, int]]: + """ + 
Generate GPU-ready pointer arrays for kernel execution. + + Applies 128KB chunking to break large transfers into smaller pieces. + + Args: + ptrs: List of tensor data pointers + positions: List of positions within tensors + sizes: List of transfer sizes + is_pack: True for pack (user->buffer), False for unpack (buffer->user) + buffer_base: Base pointer of the buffer + + Returns: + Tuple of (cp_src_addrs, cp_dst_addrs, cp_sizes, num_chunks) as + CuPy arrays, or None if no work. + """ + h_src_addrs: List[int] = [] + h_dst_addrs: List[int] = [] + h_sizes: List[int] = [] + + packed_offset = 0 + + for ptr, pos, size in zip(ptrs, positions, sizes): + num_chunks = (size + self.CHUNK_SIZE - 1) // self.CHUNK_SIZE + + for c in range(num_chunks): + chunk_offset = c * self.CHUNK_SIZE + chunk_size = min(self.CHUNK_SIZE, size - chunk_offset) + + if is_pack: + # Pack: user tensor -> buffer + h_src_addrs.append(ptr + pos + chunk_offset) + h_dst_addrs.append(buffer_base + packed_offset + chunk_offset) + else: + # Unpack: buffer -> user tensor + h_src_addrs.append(buffer_base + packed_offset + chunk_offset) + h_dst_addrs.append(ptr + pos + chunk_offset) + + h_sizes.append(chunk_size) + + packed_offset += size + + total_chunks = len(h_sizes) + if total_chunks == 0: + return None + + # Move to GPU using PyTorch, then convert to CuPy for kernel launching + d_src_addrs = torch.tensor(h_src_addrs, dtype=torch.int64, device="cuda") + d_dst_addrs = torch.tensor(h_dst_addrs, dtype=torch.int64, device="cuda") + d_sizes = torch.tensor(h_sizes, dtype=torch.int64, device="cuda") + + # Convert to CuPy arrays (zero-copy) for kernel launching + cp_src_addrs = cp.asarray(d_src_addrs) + cp_dst_addrs = cp.asarray(d_dst_addrs) + cp_sizes = cp.asarray(d_sizes) + + return (cp_src_addrs, cp_dst_addrs, cp_sizes, total_chunks) diff --git a/megatron/core/resharding/nvshmem_copy_service/planning/task_segmenter.py b/megatron/core/resharding/nvshmem_copy_service/planning/task_segmenter.py new file mode 100644 
index 00000000000..fdeaea33ae5 --- /dev/null +++ b/megatron/core/resharding/nvshmem_copy_service/planning/task_segmenter.py @@ -0,0 +1,100 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import logging +from typing import List + +from ..nvshmem_types import MAX_SEGMENT_SIZE, ReceiveRequest, SendRequest + +logger = logging.getLogger(__name__) + +# Constants for ID encoding (from C++ implementation) +REQUEST_ID_BASE = 1000000000 +SEGMENT_ID_MULTIPLIER = 1000 +MAX_REQUESTS = 1000000 +MAX_SEGMENTS_PER_REQUEST = 1000 + + +class TaskSegmenter: + """ + Splits large tasks (>256MB) into smaller segments to fit + into the fixed-size communication slots. + """ + + def _encode_segment_id(self, task_id: int, segment_index: int) -> int: + return REQUEST_ID_BASE + (task_id * SEGMENT_ID_MULTIPLIER) + segment_index + + def _calculate_num_segments(self, size: int) -> int: + return (size + MAX_SEGMENT_SIZE - 1) // MAX_SEGMENT_SIZE + + def _validate_segmentation(self, task_id: int, size: int) -> bool: + num_segments = self._calculate_num_segments(size) + if num_segments > MAX_SEGMENTS_PER_REQUEST: + logger.error( + f"Error: Task {task_id} requires {num_segments} segments, " + f"exceeds max {MAX_SEGMENTS_PER_REQUEST}" + ) + return False + if task_id >= MAX_REQUESTS: + logger.error(f"Error: Task ID {task_id} exceeds max {MAX_REQUESTS}") + return False + return True + + def segment_send_request(self, req: SendRequest) -> List[SendRequest]: + """ + Splits a single send request into multiple segments + if larger than MAX_SEGMENT_SIZE. 
+ """ + if req.size <= MAX_SEGMENT_SIZE: + return [req] + + if not self._validate_segmentation(req.task_id, req.size): + raise ValueError(f"Task {req.task_id} validation failed") + + num_segments = self._calculate_num_segments(req.size) + output_requests: List[SendRequest] = [] + + for i in range(num_segments): + segment_offset = i * MAX_SEGMENT_SIZE + segment_size = min(MAX_SEGMENT_SIZE, req.size - segment_offset) + segment_task_id = self._encode_segment_id(req.task_id, i) + + new_req = SendRequest( + task_id=segment_task_id, + src_tensor=req.src_tensor, + src_pos=req.src_pos + segment_offset, + size=segment_size, + dest_pe=req.dest_pe, + ) + output_requests.append(new_req) + + return output_requests + + def segment_receive_request(self, req: ReceiveRequest) -> List[ReceiveRequest]: + """ + Splits a single receive request into multiple segments + if larger than MAX_SEGMENT_SIZE. + """ + if req.size <= MAX_SEGMENT_SIZE: + return [req] + + if not self._validate_segmentation(req.task_id, req.size): + raise ValueError(f"Task {req.task_id} validation failed") + + num_segments = self._calculate_num_segments(req.size) + output_requests: List[ReceiveRequest] = [] + + for i in range(num_segments): + segment_offset = i * MAX_SEGMENT_SIZE + segment_size = min(MAX_SEGMENT_SIZE, req.size - segment_offset) + segment_task_id = self._encode_segment_id(req.task_id, i) + + new_req = ReceiveRequest( + task_id=segment_task_id, + dest_tensor=req.dest_tensor, + dest_pos=req.dest_pos + segment_offset, + size=segment_size, + src_pe=req.src_pe, + ) + output_requests.append(new_req) + + return output_requests diff --git a/megatron/core/resharding/nvshmem_copy_service/planning/workload_packer.py b/megatron/core/resharding/nvshmem_copy_service/planning/workload_packer.py new file mode 100644 index 00000000000..1f2374bc187 --- /dev/null +++ b/megatron/core/resharding/nvshmem_copy_service/planning/workload_packer.py @@ -0,0 +1,89 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
All rights reserved. + +from typing import Dict, List + +from ..logger import PELogger +from ..nvshmem_types import MAX_SEGMENT_SIZE, MAX_TASKS_PER_BATCH, SendRequest, WorkloadGroup + + +class WorkloadPacker: + """ + Packs individual SendRequests into WorkloadGroups (batches) + destined for the same PE, respecting size limits. + """ + + def pack_workloads( + self, send_requests: List[SendRequest], n_pes: int + ) -> Dict[int, List[WorkloadGroup]]: + """ + Groups requests by destination PE and packs them into batches. + Returns a map: dest_pe -> list of batches + """ + PELogger.debug(f"Packing {len(send_requests)} send requests for {n_pes} PEs") + workloads: Dict[int, List[WorkloadGroup]] = {} + + # Group requests by destination PE + tasks_by_dest: Dict[int, List[SendRequest]] = {} + for req in send_requests: + tasks_by_dest.setdefault(req.dest_pe, []).append(req) + + # Pack tasks for each destination + for dest_pe in range(n_pes): + if dest_pe not in tasks_by_dest: + workloads[dest_pe] = [] + PELogger.debug(f" Dest PE {dest_pe}: 0 tasks → 0 batches") + continue + + tasks = tasks_by_dest[dest_pe] + workloads[dest_pe] = self._pack_single_destination(tasks, dest_pe) + + if workloads[dest_pe]: + total_size = sum(b.total_size for b in workloads[dest_pe]) + PELogger.debug( + f" Dest PE {dest_pe}: {len(tasks)} tasks → " + f"{len(workloads[dest_pe])} batches, {total_size} bytes total" + ) + else: + PELogger.debug( + f" Dest PE {dest_pe}: {len(tasks)} tasks → 0 batches (empty after packing)" + ) + + return workloads + + def _pack_single_destination( + self, tasks: List[SendRequest], dest_pe: int + ) -> List[WorkloadGroup]: + if not tasks: + return [] + + # Sort tasks by size (descending) for better bin packing efficiency + tasks.sort(key=lambda x: x.size, reverse=True) + + batches: List[WorkloadGroup] = [] + current_batch = WorkloadGroup(dest_pe=dest_pe, tasks=[], total_size=0) + + for task in tasks: + # Check if adding this task would exceed batch constraints + 
would_exceed_size = current_batch.total_size + task.size > MAX_SEGMENT_SIZE + would_exceed_task_cap = len(current_batch.tasks) >= MAX_TASKS_PER_BATCH + + if (would_exceed_size or would_exceed_task_cap) and current_batch.tasks: + # Finalize current batch + batches.append(current_batch) + task_first_10_string = ", ".join([str(t.task_id) for t in current_batch.tasks[:10]]) + PELogger.debug( + f" Packed batch to PE {dest_pe} idx {len(batches) - 1}: " + f"{task_first_10_string}... (total {len(current_batch.tasks)} tasks)" + ) + # Start new batch + current_batch = WorkloadGroup(dest_pe=dest_pe, tasks=[], total_size=0) + + # Add task to current batch + current_batch.tasks.append(task) + current_batch.total_size += task.size + + # Add final batch if not empty + if current_batch.tasks: + batches.append(current_batch) + + return batches diff --git a/megatron/core/resharding/nvshmem_copy_service/service.py b/megatron/core/resharding/nvshmem_copy_service/service.py new file mode 100644 index 00000000000..631e63ae41b --- /dev/null +++ b/megatron/core/resharding/nvshmem_copy_service/service.py @@ -0,0 +1,408 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +""" +Remote Copy Service - Main orchestrator for NVSHMEM-based GPU-to-GPU transfers. + +This service coordinates task segmentation, workload packing, scheduling, + +GPU resource management, and pipelined execution. 
+""" + +from typing import Dict, List, Optional, Tuple + +try: + import nvshmem.core + + HAVE_NVSHMEM = True +except ImportError: + HAVE_NVSHMEM = False + +import torch.cuda.nvtx as nvtx + +from .core import GPUResourceManager, KernelLauncher, PipelineExecutor +from .logger import PELogger +from .memory import DoubleBufferManager +from .nvshmem_types import ReceiveRequest, ScheduledBatch, SendRequest, WorkloadSummary +from .planning import CommunicationScheduler, GPUExecutionPlanner, TaskSegmenter, WorkloadPacker + + +class RemoteCopyService: + """ + Main service for managing remote GPU-to-GPU data transfers. + + Provides high-level API for registering transfers, scheduling, + and executing pipelined communication with NVSHMEM. + """ + + def __init__(self): + # Core components + self.gpu_resources = GPUResourceManager() + self.buffer_manager = DoubleBufferManager() + self.kernel_launcher = KernelLauncher() + self.pipeline_executor = None # Created after init + + # Planning components + self.task_segmenter = TaskSegmenter() + self.workload_packer = WorkloadPacker() + self.comm_scheduler = CommunicationScheduler() + self.gpu_planner = GPUExecutionPlanner() + + # State + self.send_requests: List[SendRequest] = [] + self.receive_requests: List[ReceiveRequest] = [] + self.iter_schedules: Optional[List[Dict]] = None + self.num_iterations: int = 0 + + # Events for double-buffering + self.pack_events = [] + self.unpack_events = [] + + @property + def my_pe(self) -> int: + """Get this PE's rank.""" + return self.gpu_resources.my_pe + + @property + def n_pes(self) -> int: + """Get total number of PEs.""" + return self.gpu_resources.n_pes + + @property + def device(self): + """Get CUDA device.""" + return self.gpu_resources.device + + @property + def initialized(self) -> bool: + """Check if service is initialized.""" + return self.gpu_resources.initialized + + def init(self, log_level: str = "INFO") -> None: + """ + Initialize the service. 
+ + Sets up NVSHMEM, CUDA device, streams, buffers, and kernels. + Expects to be launched with torchrun. + + Args: + log_level: Logging level (TRACE, DEBUG, INFO, WARN, ERROR) + """ + if not HAVE_NVSHMEM: + raise RuntimeError( + "nvshmem.core is not available. Please install nvshmem to use NVSHMEMCopyService." + ) + + # Initialize GPU resources (NVSHMEM, device, streams) + self.gpu_resources.init() + + # Initialize logger after PE ID is known + PELogger.init(self.my_pe, level=log_level) + PELogger.info(f"Initializing RemoteCopyService on PE {self.my_pe}/{self.n_pes}") + + # Allocate double-buffered send/recv slots + self.buffer_manager.allocate() + PELogger.debug("Allocated double-buffered send/recv slots") + + # Load CUDA kernels + self.kernel_launcher.load_kernels() + PELogger.debug("Loaded CUDA kernels") + + # Cache CuPy stream wrappers for efficient kernel launching + self.kernel_launcher.set_streams( + self.gpu_resources.pack_stream, self.gpu_resources.unpack_stream + ) + PELogger.debug("Cached CuPy stream wrappers") + + # Create pipeline executor with dependencies + self.pipeline_executor = PipelineExecutor( + self.kernel_launcher, self.buffer_manager, self.my_pe + ) + + # Set streams on pipeline executor + self.pipeline_executor.set_streams( + self.gpu_resources.pack_stream, + self.gpu_resources.unpack_stream, + self.gpu_resources.send_stream, + self.gpu_resources.copy_stream, + self.gpu_resources.torch_pack_stream, + self.gpu_resources.torch_unpack_stream, + self.gpu_resources.torch_copy_stream, + ) + PELogger.info("Initialization complete") + + def register_send( + self, task_id: int, src_tensor, src_pos: int, size: int, dest_pe: int + ) -> None: + """ + Register a send operation. 
+ + Args: + task_id: Unique task identifier + src_tensor: Source tensor (PyTorch/CuPy tensor or pointer) + src_pos: Starting position in source tensor + size: Number of bytes to send + dest_pe: Destination PE rank + """ + if dest_pe >= self.n_pes or dest_pe < 0: + PELogger.error(f"Error: Invalid destination PE {dest_pe}") + return + + req = SendRequest(task_id, src_tensor, src_pos, size, dest_pe) + self.send_requests.append(req) + + def register_receive( + self, task_id: int, dest_tensor, dest_pos: int, size: int, src_pe: int + ) -> None: + """ + Register a receive operation. + + Args: + task_id: Unique task identifier + dest_tensor: Destination tensor (PyTorch/CuPy tensor or pointer) + dest_pos: Starting position in destination tensor + size: Number of bytes to receive + src_pe: Source PE rank + """ + if src_pe >= self.n_pes or src_pe < 0: + PELogger.error(f"Error: Invalid source PE {src_pe}") + return + + req = ReceiveRequest(task_id, dest_tensor, dest_pos, size, src_pe) + self.receive_requests.append(req) + + def schedule(self) -> None: + """ + Build execution schedule. + + Can be called once and followed by multiple run() calls for + repeated execution with the same communication pattern. + + Steps: + 1. Segment large tasks into manageable chunks + 2. Pack tasks into batches + 3. Schedule batches to iterations (conflict-free) + 4. Build GPU execution plans (pointer arrays, chunking) + 5. 
Create synchronization events + """ + if not self.initialized: + raise RuntimeError("RemoteCopyService not initialized") + + PELogger.info( + f"Starting schedule: {len(self.send_requests)} send requests, " + f"{len(self.receive_requests)} receive requests" + ) + + # Step 1: Segment tasks (break large tasks into chunks) + PELogger.debug("Step 1: Segmenting tasks...") + orig_send_count = len(self.send_requests) + orig_recv_count = len(self.receive_requests) + self._segment_tasks() + PELogger.info( + f"Segmented: {orig_send_count} sends → {len(self.send_requests)} segments, " + f"{orig_recv_count} recvs → {len(self.receive_requests)} segments" + ) + + # Step 2: Pack tasks into workload groups + PELogger.debug("Step 2: Packing workloads...") + workloads = self.workload_packer.pack_workloads(self.send_requests, self.n_pes) + total_batches = sum(len(batches) for batches in workloads.values()) + active_pes = sum(1 for batches in workloads.values() if batches) + PELogger.info(f"Packed: {total_batches} batches across {active_pes} destination PEs") + + # Step 3: Schedule workloads to iterations + PELogger.debug("Step 3: Building communication schedule...") + schedule, global_summaries = self.comm_scheduler.build_schedule( + workloads, self.my_pe, self.n_pes + ) + + self.num_iterations = self.comm_scheduler.num_iterations + PELogger.info(f"Scheduled: {total_batches} batches → {self.num_iterations} iterations") + + # Step 4: Prepare iteration schedules + PELogger.debug("Step 4: Preparing iteration schedules...") + self.iter_schedules = self._prepare_iter_schedules( + schedule, workloads, global_summaries, self.num_iterations + ) + + # Step 5: Build GPU execution plans + PELogger.debug("Step 5: Building GPU execution plans...") + self.gpu_planner.create_gpu_plans( + self.iter_schedules, + self.buffer_manager.send_slots, + self.buffer_manager.recv_slots, + self.receive_requests, + ) + + # Step 6: Create double-buffered events + PELogger.debug("Step 6: Creating synchronization 
events...") + self.pack_events, self.unpack_events = self.gpu_resources.create_events(num_events=2) + self.pipeline_executor.set_events(self.pack_events, self.unpack_events) + + PELogger.info(f"Schedule complete: {self.num_iterations} iterations ready") + + def run(self) -> None: + """ + Execute the scheduled communication. + + Can be called multiple times after a single schedule() call + to repeat the same communication pattern. + """ + # import torch + # torch.save(self.send_requests, f"send_requests_{torch.distributed.get_rank()}.pt") + # torch.save(self.receive_requests, f"receive_requests_{torch.distributed.get_rank()}.pt") + + if not self.initialized: + raise RuntimeError("RemoteCopyService not initialized") + if self.iter_schedules is None: + raise RuntimeError("Must call schedule() before run()") + + PELogger.info(f"Starting execution: {self.num_iterations} iterations") + + # Start timing + nvtx.range_push("RemoteCopyService.run_total") + + # Global barrier before execution + PELogger.debug("Barrier: Synchronizing all PEs before execution") + nvshmem.core.barrier_all(stream=self.gpu_resources.send_stream) + self.gpu_resources.send_stream.sync() + + # Execute pipelined communication + nvtx.range_push("execute_pipeline") + self.pipeline_executor.execute_pipeline(self.iter_schedules, self.num_iterations) + nvtx.range_pop() # execute_pipeline + + # Global barrier after execution + PELogger.debug("Barrier: Synchronizing all PEs after pipeline") + nvshmem.core.barrier_all(stream=self.gpu_resources.send_stream) + + # Process same-PE transfers + self.pipeline_executor.process_self_moves(self.send_requests, self.receive_requests) + + # End timing range + nvtx.range_pop() # RemoteCopyService.run_total + + def clear_requests(self) -> None: + """ + Clear registered requests and schedule. + + Call this before registering a new set of transfers. 
+ """ + self.send_requests = [] + self.receive_requests = [] + self.iter_schedules = None + self.num_iterations = 0 + self.pack_events = [] + self.unpack_events = [] + + def finalize(self) -> None: + """Cleanup resources.""" + PELogger.info("Finalizing RemoteCopyService") + + # Barrier to ensure all PEs are ready to finalize + try: + PELogger.debug("Barrier: Synchronizing all PEs before finalize") + nvshmem.core.barrier_all(stream=self.gpu_resources.send_stream) + self.gpu_resources.send_stream.sync() + except Exception as e: + PELogger.error(f"Error in final barrier: {e}") + + # Free buffers + self.buffer_manager.free() + + # Finalize GPU resources (this will call nvshmem.core.finalize internally) + self.gpu_resources.finalize() + + PELogger.info("RemoteCopyService finalized") + PELogger.shutdown() + + def _segment_tasks(self) -> None: + """Segment tasks into manageable chunks.""" + new_sends: List[SendRequest] = [] + for req in self.send_requests: + segments = self.task_segmenter.segment_send_request(req) + new_sends.extend(segments) + if len(segments) > 1: + PELogger.debug( + f" Segmented send task {req.task_id}: " + f"{req.size} bytes → {len(segments)} segments" + ) + self.send_requests = new_sends + + new_recvs: List[ReceiveRequest] = [] + for req in self.receive_requests: + segments = self.task_segmenter.segment_receive_request(req) + new_recvs.extend(segments) + if len(segments) > 1: + PELogger.debug( + f" Segmented recv task {req.task_id}: " + f"{req.size} bytes → {len(segments)} segments" + ) + self.receive_requests = new_recvs + + def _prepare_iter_schedules( + self, + schedule_batches: Dict[int, List[ScheduledBatch]], + workloads: Dict[int, List], + global_summaries: Dict[Tuple[int, int, int], WorkloadSummary], + num_iterations: int, + ) -> List[Dict]: + """ + Organize schedule into iteration-based structure. 
+ + Returns: + List of dicts with 'send' and 'recv' keys for each iteration + """ + iter_schedules: List[Dict[str, Optional[ScheduledBatch]]] = [] + + for i in range(num_iterations): + sched: Dict[str, Optional[ScheduledBatch]] = {"send": None, "recv": None} + + if i in schedule_batches: + batches = schedule_batches[i] + + for b in batches: + # Skip same-PE transfers (handled separately by process_self_moves) + if b.src_pe == b.dest_pe: + PELogger.debug( + f" Iter {i}: Skipping same-PE batch " f"({b.src_pe} → {b.dest_pe})" + ) + continue + + if b.src_pe == self.my_pe: + # This PE sends in this iteration + b.tasks = workloads[b.dest_pe][b.batch_index].tasks + b.total_size = workloads[b.dest_pe][b.batch_index].total_size + sched["send"] = b + PELogger.debug( + f" Iter {i}: Send to PE {b.dest_pe}, batch " + f"{b.batch_index}, {len(b.tasks)} tasks, " + f"{b.total_size} bytes" + ) + + elif b.dest_pe == self.my_pe: + # This PE receives in this iteration + key = (b.src_pe, b.dest_pe, b.batch_index) + if key in global_summaries: + summary = global_summaries[key] + b.tasks_summary = summary + b.total_size = summary.total_size + else: + PELogger.error( + f" Iter {i}: Missing workload summary for " + f"recv from PE {b.src_pe}, batch {b.batch_index}" + ) + PELogger.error( + " Available keys in global_summaries: " + f"{list(global_summaries.keys())}" + ) + b.tasks_summary = None + b.total_size = 0 + sched["recv"] = b + PELogger.debug( + f" Iter {i}: Recv from PE {b.src_pe}, batch " + f"{b.batch_index}, {b.total_size} bytes" + ) + + iter_schedules.append(sched) + + return iter_schedules diff --git a/megatron/core/resharding/nvshmem_copy_service/validation.py b/megatron/core/resharding/nvshmem_copy_service/validation.py new file mode 100644 index 00000000000..fafb1321024 --- /dev/null +++ b/megatron/core/resharding/nvshmem_copy_service/validation.py @@ -0,0 +1,145 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ +""" +Validation utilities for GPU-to-GPU communication. + +Provides deterministic data generation and validation for verifying + +correctness of communication operations.""" + +from dataclasses import dataclass +from typing import List + +import torch + +from .logger import PELogger + + +@dataclass +class ValidationResult: + """Result of validating a single task.""" + + task_id: int + size: int + passed: bool + src_pe: int = -1 + mismatches: int = 0 + first_mismatch_idx: int = -1 + first_mismatch_expected: int = 0 + first_mismatch_actual: int = 0 + # Scheduling info - which batch/iteration this task was supposed to be handled + batch_index: int = -1 + iteration: int = -1 + + +@dataclass +class ValidationSummary: + """Summary of validation across all tasks.""" + + total_tasks: int + passed_tasks: int + failed_tasks: int + total_bytes: int + results: List[ValidationResult] + + @property + def all_passed(self) -> bool: + """Check if all validated tasks passed.""" + return self.failed_tasks == 0 + + +def generate_deterministic_data(task_id: int, size: int, device: str = "cuda") -> torch.Tensor: + """ + Generate deterministic data pattern for a task. + + Pattern: Each byte = (task_id * 31 + position) % 256 + This creates a unique pattern per task that varies along the data. + + Args: + task_id: Unique task identifier + size: Number of bytes to generate + device: Device to create tensor on ('cuda' or 'cpu') + + Returns: + torch.Tensor of uint8 with deterministic pattern + """ + positions = torch.arange(size, dtype=torch.int64, device=device) + pattern = ((task_id * 31 + positions) % 256).to(torch.uint8) + return pattern + + +def validate_received_data( + task_id: int, tensor: torch.Tensor, size: int, src_pe: int = -1 +) -> ValidationResult: + """ + Validate received data against expected deterministic pattern. 
+ + Args: + task_id: Task identifier to regenerate expected data + tensor: Received tensor to validate + size: Number of bytes to validate + + Returns: + ValidationResult with pass/fail status and details + """ + # Get the data slice to validate + recv_data = tensor[:size] + + # Generate expected pattern on same device + expected = generate_deterministic_data(task_id, size, device=recv_data.device.type) + + # Compare + mismatches_mask = recv_data != expected + num_mismatches = mismatches_mask.sum().item() + + result = ValidationResult( + task_id=task_id, + size=size, + passed=(num_mismatches == 0), + src_pe=src_pe, + mismatches=num_mismatches, + ) + + if num_mismatches > 0: + # Find first mismatch for debugging + first_idx = mismatches_mask.nonzero(as_tuple=True)[0][0].item() + result.first_mismatch_idx = first_idx + result.first_mismatch_expected = expected[first_idx].item() + result.first_mismatch_actual = recv_data[first_idx].item() + + return result + + +def log_validation_summary(summary: ValidationSummary) -> None: + """Log validation summary.""" + if summary.all_passed: + PELogger.info( + "Validation PASSED: %d/%d tasks, %d bytes validated", + summary.passed_tasks, + summary.total_tasks, + summary.total_bytes, + ) + else: + PELogger.error( + "Validation FAILED: %d/%d tasks passed, %d failed", + summary.passed_tasks, + summary.total_tasks, + summary.failed_tasks, + ) + + # Group failures by source PE + failures_by_src = {} + for r in summary.results: + if not r.passed: + failures_by_src.setdefault(r.src_pe, []).append(r) + + PELogger.error(" Failures by source PE:") + for src_pe in sorted(failures_by_src.keys()): + failed_tasks = failures_by_src[src_pe] + task_ids = [r.task_id for r in failed_tasks] + PELogger.error( + " PE %d: %d failed tasks: %s", + src_pe, + len(failed_tasks), + task_ids[:15] if len(task_ids) <= 15 else task_ids[:15] + ["..."], + ) diff --git a/megatron/core/resharding/refit.py b/megatron/core/resharding/refit.py index 
491a42b9116..5461b8d3900 100644 --- a/megatron/core/resharding/refit.py +++ b/megatron/core/resharding/refit.py @@ -17,9 +17,45 @@ from .copy_services.base import CopyService from .copy_services.gloo_copy_service import GlooCopyService from .copy_services.nccl_copy_service import NCCLCopyService +from .copy_services.nvshmem_copy_service import NVSHMEMCopyService # Supported refit backend names -RefitBackendName = Literal["nccl", "gloo"] +RefitBackendName = Literal["nccl", "gloo", "nvshmem"] + +# Module-level cache for refit services to avoid repeated allocations +_service_cache: dict[str, CopyService] = {} + + +def get_or_create_service(backend: RefitBackendName) -> CopyService: + """Get or create a cached CopyService instance for the given backend. + + This avoids expensive repeated allocations (especially for NVSHMEM buffers) + when swap_model_weights is called multiple times with the same backend. + """ + if backend in _service_cache: + return _service_cache[backend] + + if backend == "nccl": + service = NCCLCopyService() + elif backend == "gloo": + service = GlooCopyService() + elif backend == "nvshmem": + service = NVSHMEMCopyService() + else: + raise ValueError(f"Unknown backend '{backend}'") + + _service_cache[backend] = service + return service + + +def clear_service_cache(): + """Clear the cached refit services. + + Call this if you need to invalidate the cache, for example when + reinitializing distributed state. + """ + global _service_cache + _service_cache.clear() def swap_model_weights( @@ -37,15 +73,8 @@ def swap_model_weights( service = refit_method reshard_model_weights(src_model, target_model, service=service) elif isinstance(refit_method, str): - if refit_method == "nccl": - service = NCCLCopyService() - reshard_model_weights(src_model, target_model, service=service) - elif refit_method == "gloo": - # Debug / fallback backend: run refit over CPU/Gloo instead of NCCL. 
- service = GlooCopyService() - reshard_model_weights(src_model, target_model, service=service) - else: - raise ValueError(f"Unknown refit_method '{refit_method}'") + service = get_or_create_service(refit_method) + reshard_model_weights(src_model, target_model, service=service) else: raise TypeError("refit_method must be a str backend name or a CopyService instance") diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 05b2d702aa0..7177ebd00bd 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -1977,12 +1977,12 @@ def _add_rl_args(parser): 'Requires --rl-inference-model-unified-memory-level=1.' ), ) - group.add_argument('--refit-method', type=str, default='gloo', - choices=['nccl', 'gloo'], + group.add_argument('--refit-method', type=str, default='nvshmem', + choices=['nccl', 'gloo', 'nvshmem'], help=('Method to refit the model weights between training and inference models during RL. ' 'nccl: use NCCLCopyService to refit using NCCL; ' 'gloo: use GlooCopyService over CPU; ' - )) + 'nvshmem: use NVSHMEMCopyService to refit using the NVSHMEM.')) group.add_argument('--rl-verify-model-weights-swap', action=argparse.BooleanOptionalAction, default=False, help='If set, verify that the model weights were correctly transferred by comparing forward pass outputs on' 'the first swap of model weights.') diff --git a/tests/unit_tests/resharding/test_model_swap.py b/tests/unit_tests/resharding/test_model_swap.py index f5db5cb6185..73296a175ed 100644 --- a/tests/unit_tests/resharding/test_model_swap.py +++ b/tests/unit_tests/resharding/test_model_swap.py @@ -24,6 +24,13 @@ from megatron.core.transformer.transformer_config import TransformerConfig from tests.unit_tests.test_utilities import Utils +try: + import nvshmem.core + + has_nvshmem = True +except Exception: + has_nvshmem = False + def _build_pg_collection( tp_size: int, pp_size: int = None, ep_size: int = 1 @@ -116,7 +123,20 @@ def _set_pg_collection(module, 
tp_group, dp_group): return module -@pytest.mark.parametrize("refit_backend", ["nccl", "gloo"]) +@pytest.mark.parametrize( + "refit_backend", + [ + pytest.param( + "nvshmem", + marks=pytest.mark.skipif( + not has_nvshmem, + reason="nvshmem.core is not available (NVSHMEM Python bindings not installed)", + ), + ), + "nccl", + "gloo", + ], +) @pytest.mark.parametrize( "src_tp,src_pp,src_ep,dst_tp,dst_pp,dst_ep,num_experts", [ From fdc04f6d3ef22fd83172a7a143068f0a1a0dee8d Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Fri, 30 Jan 2026 19:42:16 +0000 Subject: [PATCH 009/231] Update copy-pr-bot.yaml [skip ci] --- .github/copy-pr-bot.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml index f43437d19c0..e305bb25ce4 100644 --- a/.github/copy-pr-bot.yaml +++ b/.github/copy-pr-bot.yaml @@ -1,4 +1,4 @@ enabled: true auto_sync_draft: false auto_sync_ready: true -trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "Phlip79", "QiZhangNV", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "ahmadki", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cspades", "cuichenx", "deepakn94", "dimapihtar", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "frsun-nvda", "gautham-kollu", "gdengk", "guyueh1", "hxbai", "jalbericiola", "janEbert", "jaredcasper", "jenchen13", "jiemingz", "jingqiny-99", "jkamalu", "jon-barker", "jstjohn", "kanz-nv", "kevalmorabia97", "ko3n1g", "kunlunl", "kvareddy", "kwyss-nvidia", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mehraakash", "mkhona-nvidia", "parthmannan", "prajwal1210", "pthombre", "rogerwaleffe", "sanandaraj5597", "sancha", "santhnm2", "sbak5", "shanmugamr1992", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "sudhakarsingh27", "tdene", "theothermike", "thomasdhc", "trintamaki", "tylerpoon", "wdykas", 
"xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yuzhongw-nvidia", "zhongbozhu"] +trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "Phlip79", "QiZhangNV", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "ahmadki", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cspades", "cuichenx", "deepakn94", "dimapihtar", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "frsun-nvda", "gautham-kollu", "gdengk", "guyueh1", "hxbai", "jalbericiola", "janEbert", "jaredcasper", "jenchen13", "jiemingz", "jingqiny-99", "jkamalu", "jon-barker", "jstjohn", "kanz-nv", "kevalmorabia97", "ko3n1g", "kunlunl", "kvareddy", "kwyss-nvidia", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mehraakash", "mkhona-nvidia", "parthmannan", "prajwal1210", "pthombre", "rogerwaleffe", "sanandaraj5597", "sancha", "santhnm2", "sbak5", "shanmugamr1992", "shengf-nv", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "sudhakarsingh27", "tdene", "theothermike", "thomasdhc", "trintamaki", "tylerpoon", "wdykas", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yuzhongw-nvidia", "zhongbozhu"] From 9ad5906556be7e6711855bd37b666005387f95d1 Mon Sep 17 00:00:00 2001 From: Jiayi Yan <66017932+1195343015@users.noreply.github.com> Date: Sat, 31 Jan 2026 03:00:35 +0800 Subject: [PATCH 010/231] [Community][Main] fix(moe): Fix theoretical memory calculation of layernorm. 
(#2434) Co-authored-by: Yuzhong Wang --- megatron/training/theoretical_memory_usage.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/megatron/training/theoretical_memory_usage.py b/megatron/training/theoretical_memory_usage.py index 8737015dfa4..7d4043b62d7 100644 --- a/megatron/training/theoretical_memory_usage.py +++ b/megatron/training/theoretical_memory_usage.py @@ -56,20 +56,23 @@ def compute_weight_and_optimizer_memory(args, verbose=False): mtp_num_moe_layers = 0 mtp_num_dense_layers = 0 + # RMSNorm does not have bias, but LayerNorm has. + norm_size = 1 if args.normalization == "RMSNorm" else 2 + if args.multi_latent_attention: assert not args.group_query_attention if args.q_lora_rank is None: q_term = args.hidden_size * args.num_attention_heads * (args.qk_head_dim + args.qk_pos_emb_head_dim) else: ## q lora + rope + q norm - q_term = args.q_lora_rank * (args.hidden_size + args.num_attention_heads * (args.qk_head_dim + args.qk_pos_emb_head_dim) + 1) + q_term = args.q_lora_rank * (args.hidden_size + args.num_attention_heads * (args.qk_head_dim + args.qk_pos_emb_head_dim) + norm_size) self_attn_term = ( q_term ## kv lora + rope + kv norm + args.kv_lora_rank - * (args.hidden_size + args.num_attention_heads * (args.qk_head_dim + args.v_head_dim) + 1) + * (args.hidden_size + args.num_attention_heads * (args.qk_head_dim + args.v_head_dim) + norm_size) + args.hidden_size * args.qk_pos_emb_head_dim ## o proj @@ -96,7 +99,7 @@ def compute_weight_and_optimizer_memory(args, verbose=False): # Dense MoE MLP. (args.ffn_hidden_size * gated_linear_multiplier) # Transformer layernorms. - + (2) + + norm_size ) + self_attn_term ) @@ -109,12 +112,12 @@ def compute_weight_and_optimizer_memory(args, verbose=False): # Shared MoE MLP. + (shared_expert_ffn_hidden_size * gated_linear_multiplier) # Transformer layernorms. 
- + (2) + + norm_size ) + self_attn_term ) embedding_size = args.hidden_size * args.padded_vocab_size - final_layernorm = 2 * args.hidden_size + final_layernorm = norm_size * args.hidden_size if args.untie_embeddings_and_output_weights: num_parameters_in_embedding_layers = 2 * embedding_size else: From 5415e1d3b9276342402173b1320002dbe983c3c7 Mon Sep 17 00:00:00 2001 From: wdykas <73254672+wdykas@users.noreply.github.com> Date: Fri, 30 Jan 2026 15:17:28 -0500 Subject: [PATCH 011/231] fix: Set --refit-method default to gloo (#3172) --- megatron/training/arguments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 7177ebd00bd..46f3c28b1da 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -1977,7 +1977,7 @@ def _add_rl_args(parser): 'Requires --rl-inference-model-unified-memory-level=1.' ), ) - group.add_argument('--refit-method', type=str, default='nvshmem', + group.add_argument('--refit-method', type=str, default='gloo', choices=['nccl', 'gloo', 'nvshmem'], help=('Method to refit the model weights between training and inference models during RL. 
' 'nccl: use NCCLCopyService to refit using NCCL; ' From a976754b408c9172b43671932bb4ef5a4019efbd Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Sat, 31 Jan 2026 04:52:00 +0800 Subject: [PATCH 012/231] [fix] Bug fix for offloading in evaluate() (#3043) Signed-off-by: Hongbin Liu --- .../core/pipeline_parallel/fine_grained_activation_offload.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/megatron/core/pipeline_parallel/fine_grained_activation_offload.py b/megatron/core/pipeline_parallel/fine_grained_activation_offload.py index 9fbc657d574..08e46a039e2 100644 --- a/megatron/core/pipeline_parallel/fine_grained_activation_offload.py +++ b/megatron/core/pipeline_parallel/fine_grained_activation_offload.py @@ -654,6 +654,9 @@ def pop_forward_chunk(self, name=None): while not self._is_warmup and ( self._cur_forward_chunk is None or self._cur_forward_chunk.finish_all_groups(name) ): + if self._cached_chunks_index_forward >= len(self._cached_chunks_forward): + self._cur_forward_chunk = None + break self._cur_forward_chunk = self._cached_chunks_forward[self._cached_chunks_index_forward] self._cached_chunks_index_forward += 1 debug_rank(f"new cur_forward_chunk {self._cur_forward_chunk}") From 991c38ffec442bdd70399613d39cd4d124b2f6f9 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Sat, 31 Jan 2026 00:13:52 +0000 Subject: [PATCH 013/231] Update copy-pr-bot.yaml [skip ci] --- .github/copy-pr-bot.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml index e305bb25ce4..72a5b915ecc 100644 --- a/.github/copy-pr-bot.yaml +++ b/.github/copy-pr-bot.yaml @@ -1,4 +1,4 @@ enabled: true auto_sync_draft: false auto_sync_ready: true -trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "Phlip79", "QiZhangNV", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "ahmadki", "aklife97", "ananthsub", "asolergi-nv", 
"buptzyb", "chtruong814", "cspades", "cuichenx", "deepakn94", "dimapihtar", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "frsun-nvda", "gautham-kollu", "gdengk", "guyueh1", "hxbai", "jalbericiola", "janEbert", "jaredcasper", "jenchen13", "jiemingz", "jingqiny-99", "jkamalu", "jon-barker", "jstjohn", "kanz-nv", "kevalmorabia97", "ko3n1g", "kunlunl", "kvareddy", "kwyss-nvidia", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mehraakash", "mkhona-nvidia", "parthmannan", "prajwal1210", "pthombre", "rogerwaleffe", "sanandaraj5597", "sancha", "santhnm2", "sbak5", "shanmugamr1992", "shengf-nv", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "sudhakarsingh27", "tdene", "theothermike", "thomasdhc", "trintamaki", "tylerpoon", "wdykas", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yuzhongw-nvidia", "zhongbozhu"] +trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "Phlip79", "QiZhangNV", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "ahmadki", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cspades", "cuichenx", "deepakn94", "dimapihtar", "dingqingy-nv", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "frsun-nvda", "gautham-kollu", "gdengk", "guyueh1", "hxbai", "jalbericiola", "janEbert", "jaredcasper", "jenchen13", "jiemingz", "jingqiny-99", "jkamalu", "jon-barker", "jstjohn", "kanz-nv", "kevalmorabia97", "ko3n1g", "kunlunl", "kvareddy", "kwyss-nvidia", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mehraakash", "mkhona-nvidia", "parthmannan", "prajwal1210", "pthombre", "rogerwaleffe", "sanandaraj5597", "sancha", "santhnm2", "sbak5", "shanmugamr1992", "shengf-nv", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "sudhakarsingh27", "tdene", "theothermike", "thomasdhc", "trintamaki", 
"tylerpoon", "wdykas", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yuzhongw-nvidia", "zhongbozhu"] From 5d0a7fd15b5e41ae7d7a1a5cd398385dfb2c632b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 31 Jan 2026 05:45:17 +0100 Subject: [PATCH 014/231] cp: `Fix: nccl-ub in ddp path (3181)` into `main` (#3182) Signed-off-by: Youngeun Kwon Co-authored-by: Youngeun Kwon --- megatron/core/distributed/param_and_grad_buffer.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/megatron/core/distributed/param_and_grad_buffer.py b/megatron/core/distributed/param_and_grad_buffer.py index b9480533d7a..b192f182d9b 100644 --- a/megatron/core/distributed/param_and_grad_buffer.py +++ b/megatron/core/distributed/param_and_grad_buffer.py @@ -762,6 +762,12 @@ def _does_param_require_new_bucket(param): group=self.data_parallel_group, symmetric=not self.ddp_config.disable_symmetric_registration, ) + # Since nccl communicator group is created lazily, we need to perform a warmup call to + # initialize NCCL comm buffers for this dp_group before doing buffer registration. + torch.distributed.barrier() + tmp_warmup_tensor = torch.zeros([1], device="cuda") + torch.distributed.all_reduce(tmp_warmup_tensor, group=self.data_parallel_group) + torch.distributed.barrier() else: # If nccl_ub is False, mem_alloc_context is nullcontext. 
mem_alloc_context = nullcontext From ffbc43fa352ec29ccb02436e0249de3bb979e2f3 Mon Sep 17 00:00:00 2001 From: Keshav Santhanam Date: Sat, 31 Jan 2026 00:57:06 -0800 Subject: [PATCH 015/231] Miscellaneous inference cleanup (#2955) Signed-off-by: Keshav Santhanam --- .../inference/gpt/gpt_dynamic_inference.py | 339 ++++------------- .../gpt_dynamic_inference_with_coordinator.py | 117 ++---- .../inference/gpt/gpt_static_inference.py | 71 +--- examples/inference/gpt/utils.py | 232 +++-------- examples/rl/README.md | 2 +- .../rl/model_configs/llama3p1_8b_instruct.sh | 3 +- examples/rl/model_configs/nemotron5_56b.sh | 2 +- examples/rl/model_configs/nemotron5_8b.sh | 2 +- .../rl/model_configs/nemotron5p5_12b_H.sh | 2 +- examples/rl/model_configs/nemotron6_3b_moe.sh | 2 +- .../rl/model_configs/qwen3_30b_a3b_moe.sh | 2 +- examples/rl/model_configs/qwen3_32b.sh | 2 +- examples/rl/model_configs/qwen3_4b.sh | 2 +- examples/rl/model_configs/qwen3_8b.sh | 2 +- examples/rl/model_configs/qwen_2p5_32b.sh | 2 +- examples/rl/model_configs/qwen_2p5_3b.sh | 2 +- .../rl/model_configs/qwen_2p5_distill_7b.sh | 2 +- examples/rl/model_configs/qwen_2p5_math_7b.sh | 2 +- megatron/core/inference/config.py | 186 +++++++++ .../attention_context/mamba_metadata.py | 26 +- .../core/inference/contexts/base_context.py | 8 +- .../inference/contexts/dynamic_context.py | 360 ++++++------------ .../core/inference/contexts/static_context.py | 14 +- .../core/inference/engines/dynamic_engine.py | 101 ++--- .../core/inference/engines/static_engine.py | 72 ++-- .../abstract_model_inference_wrapper.py | 209 ++-------- .../gpt/gpt_inference_wrapper.py | 11 +- .../inference_wrapper_config.py | 66 ---- .../t5/t5_inference_wrapper.py | 7 +- .../simple_text_generation_controller.py | 5 - .../text_generation_controller.py | 90 +++-- megatron/core/models/gpt/gpt_model.py | 4 +- megatron/core/models/mamba/mamba_model.py | 4 +- megatron/core/ssm/mamba_layer.py | 5 +- .../core/transformer/transformer_config.py | 9 + 
megatron/core/utils.py | 70 ++-- megatron/inference/__init__.py | 1 + megatron/inference/utils.py | 320 ++++++++++++++++ megatron/rl/inference/megatron.py | 167 +------- megatron/training/arguments.py | 18 +- .../model_config.yaml | 1 + .../contexts/test_dynamic_context.py | 122 +++--- .../inference/engines/test_dynamic_engine.py | 71 +--- .../inference/engines/test_static_engine.py | 20 +- .../gpt/test_gpt_inference_wrapper.py | 61 +-- .../t5/test_t5_inference_wrapper.py | 19 +- .../test_model_inference_wrapper_config.py | 21 - .../inference/test_inference_config.py | 17 + .../inference/test_wandb_logging.py | 73 ++-- ...oder_decoder_text_generation_controller.py | 19 +- ....py => test_text_generation_controller.py} | 43 +-- .../test_vlm_text_generation_controller.py | 19 +- tests/unit_tests/models/test_gpt_model.py | 21 +- .../models/test_gpt_model_batch_invariant.py | 80 ++-- tests/unit_tests/models/test_mamba_model.py | 29 +- .../unit_tests/models/test_mamba_moe_model.py | 2 + tools/run_dynamic_text_generation_server.py | 42 +- tools/run_inference_performance_test.py | 180 ++------- tools/run_text_generation_server.py | 27 +- train_rl.py | 3 + 60 files changed, 1346 insertions(+), 2065 deletions(-) create mode 100644 megatron/core/inference/config.py delete mode 100644 megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py delete mode 100644 megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py create mode 100644 megatron/inference/__init__.py create mode 100644 megatron/inference/utils.py delete mode 100644 tests/unit_tests/inference/model_inference_wrappers/test_model_inference_wrapper_config.py create mode 100644 tests/unit_tests/inference/test_inference_config.py rename tests/unit_tests/inference/text_generation_controllers/{test_simple_text_generation_controller.py => test_text_generation_controller.py} (96%) diff --git a/examples/inference/gpt/gpt_dynamic_inference.py 
b/examples/inference/gpt/gpt_dynamic_inference.py index 88b744b3ac0..7fcac70c11a 100644 --- a/examples/inference/gpt/gpt_dynamic_inference.py +++ b/examples/inference/gpt/gpt_dynamic_inference.py @@ -1,40 +1,31 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# pylint: disable=bad-builtin + import hashlib import io import json -import math import os -import pickle import sys import warnings -import torch -from argparse import ArgumentParser from collections import defaultdict -from functools import partial +from typing import Dict, List, Optional + +import torch from tqdm import tqdm -from typing import Dict, List, Tuple, Optional sys.path.append( os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)) ) -import megatron from examples.inference.gpt.utils import ( Request, - add_common_inference_args, build_dynamic_engine_setup_prefix, build_requests, get_curr_time, get_global_peak_memory_stats_bytes, ) -from megatron.core.inference.contexts.dynamic_context import ( - ContextOverflowError, - DynamicInferenceContext, -) -from megatron.core.inference.contexts.attention_context.mamba_metadata import ( - MambaInferenceStateConfig, -) +from megatron.core.inference.contexts.dynamic_context import DynamicInferenceContext from megatron.core.inference.engines import DynamicInferenceEngine, EngineSuspendedError from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, @@ -44,194 +35,26 @@ TextGenerationController, ) from megatron.core.tokenizers.text.utils.build_tokenizer import build_tokenizer -from megatron.core.transformer.module import MegatronModule -from megatron.core.utils import get_mamba_inference_state_config_from_model +from megatron.inference.utils import ( + add_inference_args, + get_inference_config_from_model_and_args, + get_model_for_inference, +) sys.path.append( os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, 
os.path.pardir)) ) -from megatron.training import get_args, get_model as _get_model, get_tokenizer, initialize_megatron -from megatron.training.checkpointing import load_checkpoint -from model_provider import model_provider -from gpt_builders import gpt_builder -from mamba_builders import mamba_builder +import logging +import megatron from megatron.core.utils import configure_nvtx_profiling -import logging +from megatron.training import get_args, get_tokenizer, initialize_megatron torch.serialization.add_safe_globals([io.BytesIO]) torch.serialization.add_safe_globals([megatron.core.rerun_state_machine.RerunState]) torch.serialization.add_safe_globals([megatron.core.rerun_state_machine.RerunDiagnostic]) -def add_dynamic_inference_args(parser: ArgumentParser) -> ArgumentParser: - """Dynamic inference arguments.""" - - add_common_inference_args(parser) - - group = parser.add_argument_group(title='Dynamic inference') - group.add_argument( - "--inference-ckpt-non-strict", - action="store_true", - help="Load checkpoint with `strict=False`.", - ) - group.add_argument( - "--termination-id", type=int, default=None, - help="Termination ID that overrides `tokenizer.eod`.", - ) - group.add_argument( - "--suspend-resume-interval", type=int, default=None, - help="Suspend and resume the dynamic engine every " - "`suspend_resume_interval` steps. This is used to tet the suspend/resume " - "system.", - ) - group.add_argument( - "--inference-repeat-n", type=int, default=1, - help="Repeat inference iterations N times for benchmarking." - ) - group.add_argument( - "--throughput-check-only", - action='store_true', - default=False, - help="If true, only run throughput check without verifying outputs." 
- ) - - return parser - - -def get_model() -> MegatronModule: - """Initialize model and load checkpoint.""" - - args = get_args() - - if args.model_provider == "gpt": - model_builder = gpt_builder - elif args.model_provider == "mamba": - model_builder = mamba_builder - else: - raise ValueError(f"Invalid model provider {args.model_provider}") - - # Build model. - model = _get_model( - partial(model_provider, model_builder), - wrap_with_ddp=False - ) - - # Load checkpoint. - assert args.load is not None - args.exit_on_missing_checkpoint = True - load_checkpoint( - ddp_model=model, - optimizer=None, - opt_param_scheduler=None, - strict=not args.inference_ckpt_non_strict, - ) - - # No virtual PP. - assert len(model) == 1, "Above condition should have caught this" - model = model[0] - - # Eval mode. - model.eval() - - return model - - -def get_inference_context( - requests: List[Request], - sampling_params: Optional[SamplingParams] = None, - calculate_max_sequence_length_from_requests: bool = True, - mamba_inference_state_config: Optional[MambaInferenceStateConfig] = None, -): - """The inference context manages the KV cache and other inference state.""" - - args = get_args() - - # Max sequence length. - if calculate_max_sequence_length_from_requests: - max_gen_length = sampling_params.num_tokens_to_generate - max_context_length = max(len(r.prompt_tokens) for r in requests) - max_sequence_length = max_context_length + max_gen_length - else: - max_sequence_length = args.inference_max_seq_length - - metrics_writer = None - if args.inference_logging_step_interval > 0 and args.inference_wandb_logging: - metrics_writer = get_wandb_writer() - - # Inference context. 
- context = DynamicInferenceContext( - params_dtype=args.params_dtype, - num_layers=args.num_layers // args.pipeline_model_parallel_size, - kv_channels=args.kv_channels, - num_attention_heads=( - args.num_query_groups if args.group_query_attention else args.num_attention_heads - ), - max_sequence_length=max_sequence_length, - num_cuda_graphs=( - args.inference_dynamic_batching_num_cuda_graphs - if args.cuda_graph_impl == "local" - else None - ), - block_size_tokens=args.inference_dynamic_batching_block_size, - buffer_size_gb=args.inference_dynamic_batching_buffer_size_gb, - paused_buffer_size_gb=args.inference_dynamic_batching_paused_buffer_size_gb, - max_requests=args.inference_dynamic_batching_max_requests, - max_tokens=args.inference_dynamic_batching_max_tokens, - tensor_model_parallel_size=args.tensor_model_parallel_size, - pipeline_model_parallel_size=args.pipeline_model_parallel_size, - materialize_only_last_token_logits=not args.return_log_probs, - mamba_inference_state_config=mamba_inference_state_config, - cache_mla_latent=args.multi_latent_attention and args.cache_mla_latents, - kv_lora_rank=args.kv_lora_rank if args.multi_latent_attention else None, - qk_pos_emb_head_dim=args.qk_pos_emb_head_dim, - use_cuda_graphs_for_non_decode_steps=not args.decode_only_cuda_graphs, - use_flashinfer_fused_rope=args.use_flashinfer_fused_rope, - unified_memory_level=args.inference_dynamic_batching_unified_memory_level, - cuda_graph_max_tokens=args.inference_dynamic_batching_cuda_graph_max_tokens, - cuda_graph_mixed_prefill_count=args.inference_dynamic_batching_cuda_graph_mixed_prefill_count, - metrics_writer=metrics_writer, - offload_kv_cache=args.rl_offload_kv_cache_during_training - ) - - return context - - -def get_inference_controller( - model: MegatronModule, context: DynamicInferenceContext -) -> TextGenerationController: - """Buid text generation controller, which manages the model inference context. - - Args: - model (MegatronModule): Megatron GPT model. 
- context (DynamicInferenceContext): Context for managing KV cache blocks. - - Return: - (TextGenerationController) Inference text generation controller. - """ - - args = get_args() - if args.legacy_tokenizer: - tokenizer = get_tokenizer() - else: - tokenizer = build_tokenizer(args) - - # Wrap model in inference wrapper. - model = GPTInferenceWrapper(model, args, context) - - # Note: the following is taken from AbstractModelInferenceWrapper.prep_model_for_inference(). - from megatron.core import parallel_state - - model.model_is_pipeline_parallel = not ( - parallel_state.is_pipeline_first_stage() and parallel_state.is_pipeline_last_stage() - ) - - # Text generation controller. - controller = TextGenerationController(model, tokenizer) - - return controller - - def run_inference( requests: List[Request], engine: DynamicInferenceEngine, @@ -284,11 +107,7 @@ def _add_request(): """ nonlocal num_requests_added _request = requests[num_requests_added] - engine.add_request( - num_requests_added, - _request.prompt_text, - _request.sampling_params, - ) + engine.add_request(num_requests_added, _request.prompt_text, _request.sampling_params) _request.time_start = get_curr_time() _request.state = "started" num_requests_added += 1 @@ -305,10 +124,9 @@ def _add_request(): _add_request() else: # Add deterministic number of requests (generally used for debugging). - for i in range(min( - args.incoming_requests_per_step, - num_requests_total - num_requests_added, - )): + for i in range( + min(args.incoming_requests_per_step, num_requests_total - num_requests_added) + ): _add_request() add_times.append(get_curr_time() - add_start) @@ -318,11 +136,12 @@ def _add_request(): result = engine.step_modern() except EngineSuspendedError as e: result = e - pass # ignore error in order to call 'engine.resume()' below. + pass # ignore error in order to call 'engine.resume()' below. 
attempted_step_count += 1 - # After step, we lost track of last iteration's is_decode_only, so we need to get it from the engine - is_decode_only = engine.is_decode_only + # After step, we lost track of last iteration's is_decode_only, + # so we need to get it from the engine + is_decode_only = engine.is_decode_only # Test suspending and resuming engine. if args.suspend_resume_interval is not None: @@ -335,9 +154,9 @@ def _add_request(): # Resume, 0+ attempted steps later. if ( attempted_step_count > 0 - and - (attempted_step_count - args.suspend_resume_interval // 2) - % args.suspend_resume_interval == 0 + and (attempted_step_count - args.suspend_resume_interval // 2) + % args.suspend_resume_interval + == 0 ): print("**** step %d/%d ... resume." % (engine.step_count, attempted_step_count)) engine.resume() @@ -349,7 +168,9 @@ def _add_request(): # Record cuda_graph_request_count. cuda_graph_request_count = result["cuda_graph_request_count"] if args.cuda_graph_impl == "local" and cuda_graph_request_count is not None: - cuda_graph_request_count_map[cuda_graph_request_count] = cuda_graph_request_count_map.get(cuda_graph_request_count, 0) + 1 + cuda_graph_request_count_map[cuda_graph_request_count] = ( + cuda_graph_request_count_map.get(cuda_graph_request_count, 0) + 1 + ) # Update requests. active_request_ids = result["active_request_ids"] @@ -408,29 +229,29 @@ def _add_request(): engine.resume() return { - "step_times" : step_times, - "add_times" : add_times, - "output_times" : output_times, - "total_output_tokens" : total_output_tokens, - "cuda_graph_request_count_map" : cuda_graph_request_count_map, + "step_times": step_times, + "add_times": add_times, + "output_times": output_times, + "total_output_tokens": total_output_tokens, + "cuda_graph_request_count_map": cuda_graph_request_count_map, } @torch.inference_mode() def main(): - + """Run dynamic inference.""" # Initialize Megatron. 
initialize_megatron( - extra_args_provider=add_dynamic_inference_args, + extra_args_provider=add_inference_args, args_defaults={'no_load_rng': True, 'no_load_optim': True}, ) # Start Nsight profiler. if os.environ.get("NSIGHT_PREFIX"): torch.cuda.cudart().cudaProfilerStart() - - level_str = os.getenv("LOG_LEVEL", "INFO").upper() - level = getattr(logging, level_str, logging.INFO) + + level_str = os.getenv("LOG_LEVEL", "INFO").upper() + level = getattr(logging, level_str, logging.INFO) logging.basicConfig(level=level, force=True) configure_nvtx_profiling(True) @@ -456,42 +277,36 @@ def main(): termination_id=args.termination_id if args.termination_id is not None else tokenizer.eod, top_n_logprobs=args.top_n_logprobs, stop_words=args.stop_words, - ) - - model = get_model() + ) - mamba_inference_state_config = get_mamba_inference_state_config_from_model(model) + model = get_model_for_inference() # Requests, context, controller. requests = build_requests(args, tokenizer, sampling_params) - context = get_inference_context( - requests, - sampling_params, - mamba_inference_state_config=mamba_inference_state_config, - ) - controller = get_inference_controller(model, context) + inference_config = get_inference_config_from_model_and_args(model, args) + + # Calculate max_sequence_length from requests + max_gen_length = sampling_params.num_tokens_to_generate + max_context_length = max(len(r.prompt_tokens) for r in requests) + inference_config.max_sequence_length = max_context_length + max_gen_length + context = DynamicInferenceContext(model.config, inference_config) + wrapped_model = GPTInferenceWrapper(model, context) + controller = TextGenerationController(wrapped_model, tokenizer) # Validate all context_length's <= max_tokens. 
- if args.disable_chunked_prefill: + if not args.enable_chunked_prefill: invalid_prompt_length_map = {} for request_idx, request in enumerate(requests): if len(request.prompt_tokens) > context.max_tokens: invalid_prompt_length_map[request_idx] = len(request.prompt_tokens) - assert not invalid_prompt_length_map, ( - "request idxs with prompts longer than context.max_tokens: " - ", ".join(f"{k}({v})" for k, v in invalid_prompt_length_map.items()) + assert ( + not invalid_prompt_length_map + ), "request idxs with prompts longer than context.max_tokens: " ", ".join( + f"{k}({v})" for k, v in invalid_prompt_length_map.items() ) # Inference engine. - engine = DynamicInferenceEngine( - controller, - context, - enable_cuda_graph=args.cuda_graph_impl == "local", - random_seed=args.seed, - track_paused_request_events=args.inference_dynamic_batching_track_paused_request_events, - enable_chunked_prefill=not args.disable_chunked_prefill, - inference_logging_step_interval=args.inference_logging_step_interval, - ) + engine = DynamicInferenceEngine(controller, context) setup_prefix = build_dynamic_engine_setup_prefix(args, model, context, requests) print("~~~") @@ -522,14 +337,13 @@ def main(): # Validate all requests finished. for request in requests: - assert request.state == "finished", ( - f"request.state == '{request.state}' != 'finished'." - ) + assert request.state == "finished", f"request.state == '{request.state}' != 'finished'." peak_mem_stats = get_global_peak_memory_stats_bytes() # Print unique prompts + outputs. 
if torch.distributed.get_rank() == 0: + def escape_str(s): return s.replace("\n", "\\n") @@ -547,7 +361,10 @@ def escape_str(s): # ---- Prompt summary line ---- prompt_len = len(requests[request_idxs[0]].prompt_tokens) escaped_prompt_text = escape_str(prompt_text) - print(f"\n{unique_idx+1}/{len(unique_prompt_map)} [n {len(request_idxs)}, l {prompt_len}] {escaped_prompt_text}") + print( + f"\n{unique_idx+1}/{len(unique_prompt_map)}" + f"[n {len(request_idxs)}, l {prompt_len}] {escaped_prompt_text}" + ) # ---- Group all outputs for this prompt ---- output_map = defaultdict(list) @@ -567,16 +384,17 @@ def escape_str(s): # Use hash of prompt + generated text in case engine was # suspended and resumed, which misaligns boundary between # prompt and generated tokens. - o_hash = hashlib.sha256( - (prompt_text + output_text).encode() - ).hexdigest()[:6] + o_hash = hashlib.sha256((prompt_text + output_text).encode()).hexdigest()[:6] o_len = len(requests[output_request_idxs[0]].output_tokens) escaped_output_text = escape_str(output_text) else: o_hash = "--" o_len = 0 escaped_output_text = "--" - print(f" >>>> [n {len(output_request_idxs)}, {o_len} tokens, hash {o_hash}{', ' if evicted else ''}] {escaped_output_text}") + print( + f" >>>> [n {len(output_request_idxs)}, {o_len} tokens, hash {o_hash}" + f"{', ' if evicted else ''}] {escaped_output_text}" + ) text_hashes.append(o_hash) # Write results to JSON. Primarily used for functional testing. 
@@ -592,14 +410,16 @@ def escape_str(s): "generated_text": req.output_text, "generated_tokens": req.output_tokens, "latency": req.time_end - req.time_start, - "cuda_graph_request_count_map" : result["cuda_graph_request_count_map"], - "step_count" : engine.step_count, - "top_n_logprobs" : getattr(req, 'generated_top_n_logprobs', None), - "prompt_top_n_logprobs" : getattr(req, 'prompt_top_n_logprobs', None), + "cuda_graph_request_count_map": result["cuda_graph_request_count_map"], + "step_count": engine.step_count, + "top_n_logprobs": getattr(req, 'generated_top_n_logprobs', None), + "prompt_top_n_logprobs": getattr(req, 'prompt_top_n_logprobs', None), } if req.sampling_params.return_log_probs: result_dict["prompt_logprobs"] = getattr(req, 'prompt_log_probs', None) - result_dict["generated_logprobs"] = getattr(req, 'generated_log_probs', None) + result_dict["generated_logprobs"] = getattr( + req, 'generated_log_probs', None + ) result_dict["logprobs"] = getattr(req, 'logprobs', None) json_results[req.request_id] = result_dict @@ -631,7 +451,7 @@ def escape_str(s): d_count = len(d_times) p_mean = p_total / p_count - d_mean = d_total / d_count if d_count != 0 else 0. + d_mean = d_total / d_count if d_count != 0 else 0.0 # Commented out for now as the step/add/output times are not calculated correctly. # print( @@ -643,18 +463,13 @@ def escape_str(s): # f"mean [ p {p_mean:.3f}s, d {d_mean:.3f}s ], " # f"count [ p {p_count}, d {d_count} ]." 
# ) - capture_str = ( - f"{engine.capture_stats['time']:.2f} sec" - if engine.capture_stats else - "--" - ) + capture_str = f"{engine.capture_stats['time']:.2f} sec" if engine.capture_stats else "--" print( - f"{setup_prefix} … " - f"throughput: {throughput:.3f} tok/s … ", + f"{setup_prefix} … " f"throughput: {throughput:.3f} tok/s … ", f"total time: {total_time:.3f}s … " f"mem {peak_alloc_gb:.1f}/{peak_resvd_gb:.1f} GB … " f"steps: {engine.step_count:d} … " - f"capture {capture_str}" + f"capture {capture_str}", ) print("~~~") diff --git a/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py b/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py index cbb7a1aa745..ab84ee5bf5c 100644 --- a/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py +++ b/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py @@ -2,43 +2,33 @@ import asyncio import json +import logging import os import time -import torch -import torch.distributed as dist +import warnings from collections import defaultdict -from tqdm import tqdm from typing import List -import warnings -import logging -from examples.inference.gpt.gpt_dynamic_inference import ( - add_dynamic_inference_args, - get_inference_context, - get_inference_controller, - get_model, -) -from examples.inference.gpt.utils import ( - Request, - build_dynamic_engine_setup_prefix, - build_requests, - add_common_inference_args -) +import torch +import torch.distributed as dist -from megatron.core import parallel_state +from examples.inference.gpt.utils import Request, build_dynamic_engine_setup_prefix, build_requests from megatron.core.inference.engines import DynamicInferenceEngine from megatron.core.inference.inference_client import InferenceClient from megatron.core.inference.inference_request import DynamicInferenceRequestRecord from megatron.core.inference.sampling_params import SamplingParams -from megatron.core.utils import get_mamba_inference_state_config_from_model - +from 
megatron.inference.utils import ( + add_inference_args, + get_dynamic_inference_engine, + get_model_for_inference, +) from megatron.training import get_args, get_tokenizer, initialize_megatron -from megatron.training.arguments import parse_args # pylint: disable=line-too-long logging.basicConfig(level=logging.INFO, force=True) + async def main( engine: DynamicInferenceEngine, requests: List[Request], @@ -51,12 +41,11 @@ async def main( "Sampling parameters are specified per request.", DeprecationWarning, ) - + # once you call engine.start_listening_to_data_parallel_coordinator, # the engine will start accepting requests from the data parallel coordinator. # and processing them in an asyncio coroutine. # leaving inference_coordinator_port as None will find a free port automatically. - dp_addr = await engine.start_listening_to_data_parallel_coordinator( inference_coordinator_port=port, launch_inference_coordinator=True, @@ -69,14 +58,11 @@ async def main( # Since the client doesn't directly call engine.async_step here, we test # the suspend-resume system ~4 times. suspend_resume_interval = max(1, len(requests) // 4) - suspend_idxs = set(range( - suspend_resume_interval, - len(requests) + 1, - suspend_resume_interval, - )) + suspend_idxs = set( + range(suspend_resume_interval, len(requests) + 1, suspend_resume_interval) + ) resume_idxs = set( - min(len(requests), i + suspend_resume_interval // 2) - for i in suspend_idxs + min(len(requests), i + suspend_resume_interval // 2) for i in suspend_idxs ) else: suspend_idxs = set() @@ -98,7 +84,10 @@ async def main( current_time = time.time_ns() / 10**9 if args.incoming_requests_per_step is None: # Only add requests that have arrived at the current time. 
- while num_requests_added < num_requests_total and requests[num_requests_added].time_arrival <= current_time: + while ( + num_requests_added < num_requests_total + and requests[num_requests_added].time_arrival <= current_time + ): request = requests[num_requests_added] # These add-request calls will queue up the request on a zmq socket and return # instantaneously. They will return an asyncio future which can be awaited for @@ -114,10 +103,9 @@ async def main( else: # Add deterministic number of requests (generally used for debugging). - for i in range(min( - args.incoming_requests_per_step, - num_requests_total - num_requests_added - )): + for i in range( + min(args.incoming_requests_per_step, num_requests_total - num_requests_added) + ): # Change sampling parameters to force different generation lengths. request = requests[num_requests_added] n = request.sampling_params.num_tokens_to_generate @@ -135,7 +123,7 @@ async def main( break # Relinquish control since there are no more requests to add at the moment. This allows the engine to run. await asyncio.sleep(0) - + # While we wait for the requests to complete, the engine runs in the background. results: List[DynamicInferenceRequestRecord] = await asyncio.gather(*futures) @@ -170,16 +158,19 @@ async def main( req = record.merge() unique_prompt_map[req.prompt].append(req) for idx, (prompt_text, reqs) in enumerate(unique_prompt_map.items()): - print(f"%d/%d. prompt '%s' ... [%d] output '%s'." % ( - idx, - len(unique_prompt_map), - prompt_text.replace("\n", "\\n"), - len(reqs), - reqs[0].generated_text.replace("\n", "\\n"), - )) + print( + f"%d/%d. prompt '%s' ... [%d] output '%s'." + % ( + idx, + len(unique_prompt_map), + prompt_text.replace("\n", "\\n"), + len(reqs), + reqs[0].generated_text.replace("\n", "\\n"), + ) + ) # kill the engines and suspend the client - # Right now, we can only call stop when all requests are done. + # Right now, we can only call stop when all requests are done. 
# Todo: Make this explicit in the Client class.... await client.stop_engines() client.stop() @@ -190,11 +181,11 @@ async def main( if __name__ == "__main__": - # enable inference mode in the very beginning as some fp-8 optimizations + # enable inference mode in the very beginning as some fp8 optimizations # check for it. with torch.inference_mode(): initialize_megatron( - extra_args_provider=add_dynamic_inference_args, + extra_args_provider=add_inference_args, args_defaults={'no_load_rng': True, 'no_load_optim': True}, ) @@ -213,34 +204,16 @@ async def main( ), ) - # Requests, context, conroller. - model = get_model() - mamba_inference_state_config = get_mamba_inference_state_config_from_model(model) + model = get_model_for_inference() + requests = ( build_requests(args, tokenizer, sampling_params) if dist.get_rank() == 0 else None ) - context = get_inference_context( - None, - None, - calculate_max_sequence_length_from_requests=False, - mamba_inference_state_config=mamba_inference_state_config, - ) - - controller = get_inference_controller(model, context) - - # Inference engine. - engine = DynamicInferenceEngine( - controller, - context, - enable_cuda_graph=args.cuda_graph_impl == "local", - random_seed=args.seed, - enable_chunked_prefill=not args.disable_chunked_prefill, - inference_logging_step_interval=args.inference_logging_step_interval, - ) + engine = get_dynamic_inference_engine(model=model) if dist.get_rank() == 0: - setup_prefix = build_dynamic_engine_setup_prefix(args, model, context, requests) + setup_prefix = build_dynamic_engine_setup_prefix(args, model, engine.context, requests) print("~~~") print(setup_prefix) print("~~~") @@ -249,13 +222,7 @@ async def main( if os.environ.get("NSIGHT_PREFIX"): torch.cuda.cudart().cudaProfilerStart() - asyncio.run( - main( - engine, - requests, - args.inference_coordinator_port, - ) - ) + asyncio.run(main(engine, requests, args.inference_coordinator_port)) # Stop Nsight profiler. 
if os.environ.get("NSIGHT_PREFIX"): diff --git a/examples/inference/gpt/gpt_static_inference.py b/examples/inference/gpt/gpt_static_inference.py index 03a60927ab2..298ebfebd86 100644 --- a/examples/inference/gpt/gpt_static_inference.py +++ b/examples/inference/gpt/gpt_static_inference.py @@ -1,21 +1,11 @@ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. import os -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) -from model_provider import model_provider -from gpt_builders import gpt_builder -from mamba_builders import mamba_builder -import torch import sys import time -import warnings -from functools import partial from argparse import Namespace import torch -import tqdm from megatron.core.inference.contexts import StaticInferenceContext from megatron.core.inference.engines import StaticInferenceEngine @@ -23,17 +13,12 @@ from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, ) -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.text_generation_controllers.text_generation_controller import ( TextGenerationController, ) from megatron.core.tokenizers.text.utils.build_tokenizer import build_tokenizer from megatron.core.transformer.module import MegatronModule -from pretrain_gpt import model_provider as gpt_model_provider -from pretrain_mamba import model_provider as mamba_model_provider sys.path.append( os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)) @@ -41,18 +26,18 @@ import asyncio import json -from typing import Any, AsyncIterator, List +from typing import List -from examples.inference.gpt.utils import add_common_inference_args, build_requests -from megatron.core import mpu -from megatron.training import get_args, get_model, 
get_tokenizer, print_rank_0 -from megatron.training.checkpointing import load_checkpoint +from examples.inference.gpt.utils import build_requests +from megatron.inference.utils import add_inference_args, get_model_for_inference +from megatron.training import get_args, get_tokenizer, print_rank_0 from megatron.training.initialize import initialize_megatron + def add_static_inference_args(parser): """Static inference arguments.""" - add_common_inference_args(parser) + add_inference_args(parser) group = parser.add_argument_group(title='Static inference') group.add_argument( @@ -83,30 +68,16 @@ def get_inference_engine(args: Namespace, model: MegatronModule) -> StaticInfere tokenizer = get_tokenizer() else: tokenizer = build_tokenizer(args) - inference_wrapper_config = InferenceWrapperConfig( - hidden_size=args.hidden_size, - inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold, - fp32_residual_connection=args.fp32_residual_connection, - params_dtype=args.params_dtype, - padded_vocab_size=args.padded_vocab_size, - inference_max_requests=args.inference_max_batch_size, - inference_max_seq_length=args.inference_max_seq_length, - nccl_all_reduce_for_prefill=args.nccl_all_reduce_for_prefill, - fp8=args.fp8, - moe_pad_experts_for_cuda_graph_inference = args.moe_pad_experts_for_cuda_graph_inference - ) - - inference_context = StaticInferenceContext.from_config(inference_wrapper_config) - - inference_wrapped_model = GPTInferenceWrapper( - model, inference_wrapper_config, inference_context + inference_context = StaticInferenceContext( + args.inference_max_requests, args.inference_max_seq_length ) + inference_wrapped_model = GPTInferenceWrapper(model, inference_context) text_generation_controller = TextGenerationController( inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer ) engine_kwargs = { - "text_generation_controller" : text_generation_controller, - "legacy" : args.use_legacy_static_engine, + "text_generation_controller": 
text_generation_controller, + "legacy": args.use_legacy_static_engine, } if not args.use_legacy_static_engine: engine_kwargs["buffer_size_gb"] = args.inference_dynamic_batching_buffer_size_gb @@ -165,22 +136,7 @@ def main(): args = get_args() - if args.max_batch_size is not None: - warnings.warn( - f"`--max-batch-size` has been deprecated in favor of `--inference-max-requests`." - ) - args.inference_max_batch_size = max(args.max_batch_size, args.inference_max_batch_size) - - # Set up model and load checkpoint - if args.model_provider == "gpt": - model_builder = gpt_builder - elif args.model_provider == "mamba": - model_builder = mamba_builder - else: - raise ValueError(f"Invalid model provider {args.model_provider}") - model = get_model(partial(model_provider, model_builder), wrap_with_ddp=False) - load_checkpoint(model, None, None, strict=False) - model = model[0] + model = get_model_for_inference() inference_engine = get_inference_engine(args, model) @@ -276,7 +232,7 @@ def main(): ) ), len(requests), - args.inference_max_batch_size, + args.inference_max_requests, stats["allocated_bytes.all.peak"] / (1024**3), stats["reserved_bytes.all.peak"] / (1024**3), latency, @@ -293,6 +249,5 @@ def main(): torch.distributed.destroy_process_group() - if __name__ == "__main__": main() diff --git a/examples/inference/gpt/utils.py b/examples/inference/gpt/utils.py index a04b856c0a6..b7a3977605c 100644 --- a/examples/inference/gpt/utils.py +++ b/examples/inference/gpt/utils.py @@ -1,158 +1,23 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
import copy -import json import itertools +import json import random import time -import torch from argparse import ArgumentParser, Namespace -from tqdm import tqdm +from functools import partial from typing import Any, List, Optional -from megatron.core.inference.inference_request import DynamicInferenceRequest +import torch +from tqdm import tqdm + from megatron.core.inference.contexts import DynamicInferenceContext from megatron.core.inference.contexts.dynamic_context import get_mem_size_str -from megatron.core.transformer.module import MegatronModule - +from megatron.core.inference.inference_request import DynamicInferenceRequest from megatron.core.inference.sampling_params import SamplingParams - - -def add_common_inference_args(parser: ArgumentParser) -> ArgumentParser: - """Common inference arguments.""" - - group = parser.add_argument_group(title='Common inference') - - group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.') - group.add_argument("--top_k", type=int, default=1, help='Top k sampling.') - group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.') - group.add_argument( - "--return-log-probs", - action='store_true', - default=False, - help='Return the log probabilities of the final output tokens', - ) - group.add_argument( - "--prompts", - metavar='N', - type=str, - nargs='+', - help='Input prompts with each prompt within quotes and seperated by space', - ) - group.add_argument( - "--num-tokens-to-prompt", - type=int, - nargs="+", - default=[64, 1024], - help='Number of tokens to use for simulated prompts. 
This should be a ' - 'space-separated pair of integers, and the generated prompt lengths will ' - 'be uniformly sampled within this range.', - ) - group.add_argument( - "--num-tokens-to-generate", - type=int, - default=30, - help='Number of tokens to generate for each prompt', - ) - group.add_argument( - "--num-tokens-from-file", - action='store_true', - default=False, - help='Use per-prompt num_tokens_to_generate from prompt file', - ) - group.add_argument( - "--top-n-logprobs", - type=int, - default=0, - help='Return the top n logprobs for the generated tokens and their corresponding token as a dictionary', - ) - group.add_argument( - "--incoming-requests-per-step", - type=int, default=None, - help="Add a deterministic number of requests per step. This arg is " - "prioritized over `--incoming-requests-per-sec` below (which is non-" - "deterministic). Note that the number of requests added per step is " - "additionally limited by the inference context's `max_requests`, " - "`max_tokens`, and KV buffer size.", - ) - group.add_argument( - "--incoming-requests-per-sec", - type=float, - default=100.0, - help="Simulated number of requests per second. Set to -1 to add all requests together.", - ) - group.add_argument( - "--incoming-requests-duration", - type=float, - default=10.0, - help="Total amount of time to simulate that requests are " - "arriving. Multiply this value with " - "`--incoming-requests-per-sec` to get the approximate " - "total number of requests. Set to -1 to add all requests together.", - ) - group.add_argument( - "--model-provider", - choices=["mamba", "gpt"], - default="gpt", - help="Model provider", - ) - group.add_argument( - "--skip-prompt-log-probs", - action='store_true', - default=False, - help='Skip prompt log probs.', - ) - group.add_argument( - "--stop-words", - metavar='WORD', - type=str, - nargs='+', - default=None, - help='Stop words to terminate generation. Each word should be quoted and ' - 'separated by space. 
Example: --stop-words "\\n\\n" "END" "###"', - ) - group.add_argument( - "--output-path", - type=str, - default=None, - help="Path to save generations as JSON", - ) - group.add_argument( - "--output-every-n-results", - type=int, - default=1, - help="To minimize the output file size of larger runs, only write the " - "results of every `n` requests.", - ) - group.add_argument( - "--prompt-file", - help='Jsonl file containing input prompts, where each item (i.e., line) ' - 'contains the field \'text\' where the value is the prompt. All other ' - 'fields within each item are ignored, and may be customized for each ' - 'application.', - ) - group.add_argument( - "--prompt-file-num-truncate", - type=int, - help='Number of samples to use from the loaded prompt file (see ' - '`--prompt-file` above). The first `--prompt-file-num-truncate` samples ' - 'will be used, in order.', - ) - group.add_argument( - "--use-flashinfer-fused-rope", - action='store_true', - default=False, - help='Use flashinfer fused rope implementation.', - ) - group.add_argument( - "--no-record-throughput", - action='store_false', - dest="record_throughput", - help="Disable throughput recording in --output-file" - - ) - - return parser +from megatron.core.transformer.module import MegatronModule +from megatron.training import get_args def get_default_sampling_params(termination_id: int = None): @@ -162,9 +27,10 @@ def get_default_sampling_params(termination_id: int = None): top_p=0.0, return_log_probs=False, num_tokens_to_generate=30, - termination_id = termination_id, + termination_id=termination_id, ) + def get_curr_time() -> float: """Get synchronized time across ranks.""" curr_time = torch.cuda.LongTensor([time.time_ns()]) @@ -188,7 +54,13 @@ class Request: tokenizer (Any): Tokenizer for tokenizing the prompt. 
""" - def __init__(self, prompt_text: str, time_offset: float, tokenizer: Any, sampling_params: SamplingParams = None): + def __init__( + self, + prompt_text: str, + time_offset: float, + tokenizer: Any, + sampling_params: SamplingParams = None, + ): self.prompt_text = prompt_text self.prompt_tokens = tokenizer.tokenize(prompt_text) self.output_text = None @@ -198,7 +70,11 @@ def __init__(self, prompt_text: str, time_offset: float, tokenizer: Any, samplin self.time_start = None self.time_end = None self.state = "not-started" - self.sampling_params: SamplingParams = sampling_params if sampling_params is not None else get_default_sampling_params(tokenizer.eod) + self.sampling_params: SamplingParams = ( + sampling_params + if sampling_params is not None + else get_default_sampling_params(tokenizer.eod) + ) self.sampling_params = copy.deepcopy(self.sampling_params) def __str__(self) -> str: @@ -225,10 +101,10 @@ def get_time_offsets( # if num_requests is not None: incoming_requests_duration = num_requests / incoming_requests_per_sec - incoming_requests_duration *= 2 # extra margin, to accomodate time sampling + incoming_requests_duration *= 2 # extra margin, to accomodate time sampling random.seed(seed) - + import simpy # Guard against this import in test case # Generate random time offsets. @@ -241,14 +117,14 @@ def arrival(r): env = simpy.Environment() env.process(arrival(incoming_requests_per_sec)) env.run(incoming_requests_duration) - + # Ensure at least a single request. if len(time_offsets) == 0: time_offsets = [0.0] # Ensure first time is 0. time_offsets = [to - time_offsets[0] for to in time_offsets] - + # Truncate to num_requests. 
assert len(time_offsets) >= num_requests time_offsets = time_offsets[:num_requests] @@ -257,7 +133,7 @@ def arrival(r): def get_cli_requests( - args: Namespace, tokenizer: Any, sampling_params: Optional[SamplingParams] = None + args: Namespace, tokenizer: Any, sampling_params: Optional[SamplingParams] = None ) -> list[Request]: # Get time offsets. @@ -269,7 +145,7 @@ def get_cli_requests( ) # Init requests. - requests = [Request(p, t, tokenizer, sampling_params) for p,t in zip(args.prompts, t_offsets)] + requests = [Request(p, t, tokenizer, sampling_params) for p, t in zip(args.prompts, t_offsets)] return requests @@ -289,18 +165,14 @@ def get_synthetic_requests( # Build prompts with expected lengths. assert ( len(args.num_tokens_to_prompt) == 2 - and - args.num_tokens_to_prompt[1] >= args.num_tokens_to_prompt[0] + and args.num_tokens_to_prompt[1] >= args.num_tokens_to_prompt[0] ) max_prompt_length = args.num_tokens_to_prompt[1] max_prompt_text = "hi " * max_prompt_length max_prompt_tokens = tokenizer.tokenize(max_prompt_text) - prompt_lengths = [ - random.randint(*args.num_tokens_to_prompt) - for _ in time_offsets - ] - prompt_tokens_list = [ max_prompt_tokens[:l] for l in prompt_lengths ] - prompt_texts = [ tokenizer.detokenize(tt) for tt in prompt_tokens_list ] + prompt_lengths = [random.randint(*args.num_tokens_to_prompt) for _ in time_offsets] + prompt_tokens_list = [max_prompt_tokens[:l] for l in prompt_lengths] + prompt_texts = [tokenizer.detokenize(tt) for tt in prompt_tokens_list] # Init requests. assert len(prompt_texts) == len(time_offsets) @@ -340,16 +212,15 @@ def get_requests_from_file( # Get time offsets. time_offsets: list[float] = get_time_offsets( - args.seed, - args.incoming_requests_per_step, - args.incoming_requests_per_sec, - len(prompts), + args.seed, args.incoming_requests_per_step, args.incoming_requests_per_sec, len(prompts) ) # Init requests. 
requests = [ Request(p, t, tokenizer, sp) - for p, t, sp in tqdm(zip(prompts, time_offsets, sampling_params_list), "init requests", total=len(prompts)) + for p, t, sp in tqdm( + zip(prompts, time_offsets, sampling_params_list), "init requests", total=len(prompts) + ) ] return requests @@ -411,19 +282,21 @@ def build_dynamic_engine_setup_prefix( # Prompt description prompt_src_str = ( - "cli" if args.prompts else - "file" if args.prompt_file else - f"synth({', '.join(map(str, args.num_tokens_to_prompt))})" + "cli" + if args.prompts + else ( + "file" + if args.prompt_file + else f"synth({', '.join(map(str, args.num_tokens_to_prompt))})" + ) ) request_str = ( - f"requests: {prompt_src_str}, " - f"n {len(requests):d}, g {args.num_tokens_to_generate:d}, " + f"requests: {prompt_src_str}, " f"n {len(requests):d}, g {args.num_tokens_to_generate:d}, " ) request_str += ( - f"dur {args.incoming_requests_duration:.1e} " - f"r/sec {args.incoming_requests_per_sec:.1e}" - if args.incoming_requests_per_step is None else - f"r/step {args.incoming_requests_per_step}" + f"dur {args.incoming_requests_duration:.1e} " f"r/sec {args.incoming_requests_per_sec:.1e}" + if args.incoming_requests_per_step is None + else f"r/step {args.incoming_requests_per_step}" ) # Buffer limits config @@ -433,14 +306,7 @@ def build_dynamic_engine_setup_prefix( f"[r {context.max_requests}, t {context.max_tokens}]" ) - parts = [ - get_model_size_str(model), - "dynamic", - cg_str, - uvm_str, - request_str, - buffer_limits_str, - ] + parts = [get_model_size_str(model), "dynamic", cg_str, uvm_str, request_str, buffer_limits_str] return " | ".join(parts) @@ -456,4 +322,4 @@ def get_global_peak_memory_stats_bytes() -> dict: t = torch.tensor([peak_alloc], device="cuda", dtype=torch.int64) torch.distributed.all_reduce(t, op=torch.distributed.ReduceOp.MAX) peak_alloc = int(t[0].item()) - return {"mem-max-allocated-bytes": peak_alloc} \ No newline at end of file + return {"mem-max-allocated-bytes": peak_alloc} diff 
--git a/examples/rl/README.md b/examples/rl/README.md index 34b6fafa517..9c2de3ec088 100644 --- a/examples/rl/README.md +++ b/examples/rl/README.md @@ -94,7 +94,7 @@ MODEL_OPTIONS="\ --ckpt-format torch \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-batch-size $MAX_INFERENCE_BS \ + --inference-max-requests $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --untie-embeddings-and-output-weights \ --disable-bias-linear \ diff --git a/examples/rl/model_configs/llama3p1_8b_instruct.sh b/examples/rl/model_configs/llama3p1_8b_instruct.sh index 24d285a6cf7..5398dad1a4e 100644 --- a/examples/rl/model_configs/llama3p1_8b_instruct.sh +++ b/examples/rl/model_configs/llama3p1_8b_instruct.sh @@ -77,7 +77,7 @@ MODEL_OPTIONS="\ --ckpt-format torch_dist \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-batch-size $MAX_INFERENCE_BS \ + --inference-max-requests $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --add-qkv-bias \ --normalization RMSNorm \ @@ -101,6 +101,7 @@ MODEL_OPTIONS="\ --max-position-embeddings 131072 \ --tokenizer-type HuggingFaceTokenizer \ --tokenizer-model unsloth/Meta-Llama-3.1-8B-Instruct \ + --legacy-tokenizer \ --langrl-inference-server-type "inplace_megatron_chat" \ --langrl-inference-server-conversation-template "unsloth/Meta-Llama-3.1-8B-Instruct" \ --lr 3e-7 \ diff --git a/examples/rl/model_configs/nemotron5_56b.sh b/examples/rl/model_configs/nemotron5_56b.sh index fd2cc4f7212..741cd054b73 100644 --- a/examples/rl/model_configs/nemotron5_56b.sh +++ b/examples/rl/model_configs/nemotron5_56b.sh @@ -58,7 +58,7 @@ MODEL_OPTIONS="\ --calculate-per-token-loss \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-batch-size $MAX_INFERENCE_BS \ + --inference-max-requests $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --fp8-format hybrid \ --fp8-amax-history-len 1 \ diff --git 
a/examples/rl/model_configs/nemotron5_8b.sh b/examples/rl/model_configs/nemotron5_8b.sh index 7b8947ae763..753d4e493a2 100644 --- a/examples/rl/model_configs/nemotron5_8b.sh +++ b/examples/rl/model_configs/nemotron5_8b.sh @@ -58,7 +58,7 @@ MODEL_OPTIONS="\ --calculate-per-token-loss \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-batch-size $MAX_INFERENCE_BS \ + --inference-max-requests $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --hybrid-override-pattern M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M- \ --spec megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \ diff --git a/examples/rl/model_configs/nemotron5p5_12b_H.sh b/examples/rl/model_configs/nemotron5p5_12b_H.sh index 9e97051e087..adbcc8d03f0 100644 --- a/examples/rl/model_configs/nemotron5p5_12b_H.sh +++ b/examples/rl/model_configs/nemotron5p5_12b_H.sh @@ -65,7 +65,7 @@ MODEL_OPTIONS="\ --calculate-per-token-loss \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-batch-size $MAX_INFERENCE_BS \ + --inference-max-requests $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --fp8-recipe blockwise \ --fp8-format e4m3 \ diff --git a/examples/rl/model_configs/nemotron6_3b_moe.sh b/examples/rl/model_configs/nemotron6_3b_moe.sh index eff4f6cf0b3..7d98f4eda63 100644 --- a/examples/rl/model_configs/nemotron6_3b_moe.sh +++ b/examples/rl/model_configs/nemotron6_3b_moe.sh @@ -85,7 +85,7 @@ MODEL_OPTIONS="\ --rl-importance-sampling-truncation-coef 10.0 \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-batch-size $MAX_INFERENCE_BS \ + --inference-max-requests $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --distributed-timeout-minutes 60 \ --use-mcore-models \ diff --git a/examples/rl/model_configs/qwen3_30b_a3b_moe.sh b/examples/rl/model_configs/qwen3_30b_a3b_moe.sh index 775a9587ba4..eb55ba35cc6 100644 --- 
a/examples/rl/model_configs/qwen3_30b_a3b_moe.sh +++ b/examples/rl/model_configs/qwen3_30b_a3b_moe.sh @@ -37,7 +37,7 @@ ENV_DEPENDENT="\ MODEL_OPTIONS=" --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ ---inference-max-batch-size $MAX_INFERENCE_BS \ +--inference-max-requests $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --no-use-tokenizer-model-from-checkpoint-args \ --seq-length 8192 \ diff --git a/examples/rl/model_configs/qwen3_32b.sh b/examples/rl/model_configs/qwen3_32b.sh index cd153a04f3c..c06c5f55b53 100644 --- a/examples/rl/model_configs/qwen3_32b.sh +++ b/examples/rl/model_configs/qwen3_32b.sh @@ -38,7 +38,7 @@ MODEL_OPTIONS="\ --ckpt-format torch_dist \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-batch-size $MAX_INFERENCE_BS \ + --inference-max-requests $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --untie-embeddings-and-output-weights \ --num-layers 64 \ diff --git a/examples/rl/model_configs/qwen3_4b.sh b/examples/rl/model_configs/qwen3_4b.sh index da238511fd3..6f6c6b6bf57 100644 --- a/examples/rl/model_configs/qwen3_4b.sh +++ b/examples/rl/model_configs/qwen3_4b.sh @@ -38,7 +38,7 @@ MODEL_OPTIONS="\ --ckpt-format torch_dist \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-batch-size $MAX_INFERENCE_BS \ + --inference-max-requests $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --num-layers 36 \ --hidden-size 2560 \ diff --git a/examples/rl/model_configs/qwen3_8b.sh b/examples/rl/model_configs/qwen3_8b.sh index 6758cd84c3d..54ff7385331 100644 --- a/examples/rl/model_configs/qwen3_8b.sh +++ b/examples/rl/model_configs/qwen3_8b.sh @@ -38,7 +38,7 @@ MODEL_OPTIONS="\ --ckpt-format torch_dist \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-batch-size $MAX_INFERENCE_BS \ + --inference-max-requests $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ 
--untie-embeddings-and-output-weights \ --num-layers 36 \ diff --git a/examples/rl/model_configs/qwen_2p5_32b.sh b/examples/rl/model_configs/qwen_2p5_32b.sh index d82972ba477..2a2a9ae2420 100644 --- a/examples/rl/model_configs/qwen_2p5_32b.sh +++ b/examples/rl/model_configs/qwen_2p5_32b.sh @@ -59,7 +59,7 @@ MODEL_OPTIONS="\ --ckpt-format torch_dist \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-batch-size $MAX_INFERENCE_BS \ + --inference-max-requests $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --untie-embeddings-and-output-weights \ --disable-bias-linear \ diff --git a/examples/rl/model_configs/qwen_2p5_3b.sh b/examples/rl/model_configs/qwen_2p5_3b.sh index 246afae6ad2..f3250f39ecc 100644 --- a/examples/rl/model_configs/qwen_2p5_3b.sh +++ b/examples/rl/model_configs/qwen_2p5_3b.sh @@ -62,7 +62,7 @@ MODEL_OPTIONS="\ --ckpt-format torch_dist \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-batch-size $MAX_INFERENCE_BS \ + --inference-max-requests $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --disable-bias-linear \ --add-qkv-bias \ diff --git a/examples/rl/model_configs/qwen_2p5_distill_7b.sh b/examples/rl/model_configs/qwen_2p5_distill_7b.sh index 149ac77965f..1438bca0726 100644 --- a/examples/rl/model_configs/qwen_2p5_distill_7b.sh +++ b/examples/rl/model_configs/qwen_2p5_distill_7b.sh @@ -44,7 +44,7 @@ MODEL_OPTIONS="\ --ckpt-format torch \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-batch-size $MAX_INFERENCE_BS \ + --inference-max-requests $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --untie-embeddings-and-output-weights \ --disable-bias-linear \ diff --git a/examples/rl/model_configs/qwen_2p5_math_7b.sh b/examples/rl/model_configs/qwen_2p5_math_7b.sh index 1d631fa80a5..b598bb127bd 100644 --- a/examples/rl/model_configs/qwen_2p5_math_7b.sh +++ 
b/examples/rl/model_configs/qwen_2p5_math_7b.sh @@ -58,7 +58,7 @@ MODEL_OPTIONS="\ --ckpt-format torch \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-batch-size $MAX_INFERENCE_BS \ + --inference-max-requests $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --untie-embeddings-and-output-weights \ --disable-bias-linear \ diff --git a/megatron/core/inference/config.py b/megatron/core/inference/config.py new file mode 100644 index 00000000000..5970b4f14f6 --- /dev/null +++ b/megatron/core/inference/config.py @@ -0,0 +1,186 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +from dataclasses import dataclass +from typing import List, Optional, Tuple + +import torch + +from megatron.core.process_groups_config import ProcessGroupCollection +from megatron.core.transformer.module import MegatronModule +from megatron.core.utils import get_attr_wrapped_model + + +@dataclass +class MambaInferenceStateConfig: + """ + Config for initializing Mamba model inference state tensors. + + Note that we maintain separate metadata for decode, regular prefill, and + chunked prefill requests because the Mamba kernels do not yet support mixing + these. Once the kernels have been updated we can simplify this code. + """ + + layer_type_list: List[str] + """ + A list of strings that indicates the layer type (Mamba / Attention / MLP) for each layer. + See `megatron/core/ssm/mamba_hybrid_layer_allocation.py` for the list of symbols. 
+ """ + + mamba_conv_states_shape: Tuple[int] + """Mamba conv states shape per request.""" + + mamba_ssm_states_shape: Tuple[int] + """Mamba ssm states shape per request.""" + + @classmethod + def from_model(cls, model: MegatronModule) -> Optional["MambaInferenceStateConfig"]: + """Returns Mamba inference state config from the model if it is a hybrid model.""" + from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols + + decoder = get_attr_wrapped_model(model, "decoder") + layer_type_list = getattr(decoder, "layer_type_list", None) + if layer_type_list is not None and Symbols.MAMBA in layer_type_list: + (mamba_conv_states_shape, mamba_ssm_states_shape) = ( + decoder.mamba_state_shapes_per_request() + ) + return cls( + layer_type_list=layer_type_list, + mamba_conv_states_shape=mamba_conv_states_shape, + mamba_ssm_states_shape=mamba_ssm_states_shape, + ) + return None + + +@dataclass +class InferenceConfig: + """ + Config for inference. + + NOTE: Must remain mutually exclusive with the `TransformerConfig`. + """ + + # ================================= + # KV cache config + # ================================= + block_size_tokens: int = 256 + """Size of KV cache block size.""" + + buffer_size_gb: int = 20 + """ + Buffer size reserved on the GPU for the KV cache. + If `unified_memory_level` >= 1, then CPU memory is additionally utilized, resulting in a total + buffer size of `buffer_size_gb + paused_buffer_size_gb`. + """ + + paused_buffer_size_gb: Optional[int] = None + """ + Portion of buffer reserved for paused requests. Active requests are paused when there are not + enough active blocks available to continue generating a request. The total buffer size + (active + paused) depends on `unified_memory_level` (uvm): + - uvm 0: buffer_size_gb (paused buffer is inclusive) + - uvm 1: buffer_size_gb + paused_buffer_size_gb + """ + + max_requests: Optional[int] = None + """ + Max number of active requests to use for decode-only forward passes. 
+ This is primarily limited by the combination of `buffer_size_gb` and `max_sequence_length`. + """ + + max_tokens: Optional[int] = None + """ + Max number of tokens to use for forward passes. This is primarily limited by prefill activation + memory usage. (Defaults to 16384). + """ + + unified_memory_level: int = 0 + """ + Sets unified memory usage within the dynamic inference context. + The levels are: + 0) no unified memory (default) + 1) allocate `memory_buffer` in unified memory. + Eventually, additional levels will be included to control other tensors within the context. + """ + + offload_kv_cache: bool = False + """If True, offload KV cache during RL training.""" + + # ================================= + # CUDA graph config + # ================================= + num_cuda_graphs: Optional[int] = None + """ + Maximum number of cuda graphs to capture, where the cuda graph batch sizes range from 1 to + `max_requests`. Due to rounding, the actual number of cuda graphs may not equal this argument. + """ + + cuda_graph_mixed_prefill_count: Optional[int] = 16 + """ + The number of mixed prefill graphs to capture if mixed prefill/decode graphs are enabled. + """ + + use_cuda_graphs_for_non_decode_steps: bool = True + """ + Whether to use CUDA graphs for non-decode steps. + """ + + persist_cuda_graphs: bool = False + """ + Whether to persist CUDA graphs when the engine is suspended. + If False and `unified_memory_level` is 0, CUDA graphs are deleted on `suspend()` + and re-captured on `resume()` to save memory. 
+ """ + + # ================================= + # Model config + # ================================= + max_sequence_length: int = 2560 + """Max possible sequence length (prompt + output) that will occur.""" + + mamba_inference_state_config: Optional[MambaInferenceStateConfig] = None + """The Mamba inference state config if the model is a hybrid model.""" + + pg_collection: Optional[ProcessGroupCollection] = None + """A `ProcessGroupCollection` for distributed execution.""" + + use_flashinfer_fused_rope: Optional[bool] = False + """ + If True, use flashinfer's fused rope implementation. + If None, defaults to using flash-infer if available. + """ + + materialize_only_last_token_logits: bool = True + """ + Whether to only materialize logits for the last token. This should be set to False + if returning log probs. + """ + + # ================================= + # Engine config + # ================================= + enable_chunked_prefill: bool = False + """Whether to enable chunked prefill.""" + + # ================================= + # Logging config + # ================================= + track_paused_request_events: bool = False + """ + Whether to track paused request events. If True, `add_event_pause()` is called on + requests when they are paused during bookkeeping. + """ + + metrics_writer: Optional["WandbModule"] = None + """Wandb module for writing metrics.""" + + logging_step_interval: int = 0 + """ + The step interval at which to log inference metrics to wandb. + Defaults to 0, which means no logging. + """ + + request_metadata_types: Optional[List[Tuple[str, torch.dtype, bool]]] = None + """ + A list of the per-request metadata types to track. Each entry is a tuple + consisting of the string label, the target dtype, and whether to store the data on GPU. 
+ """ diff --git a/megatron/core/inference/contexts/attention_context/mamba_metadata.py b/megatron/core/inference/contexts/attention_context/mamba_metadata.py index 6cf45aeb9e1..13179483f59 100644 --- a/megatron/core/inference/contexts/attention_context/mamba_metadata.py +++ b/megatron/core/inference/contexts/attention_context/mamba_metadata.py @@ -1,36 +1,12 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -from dataclasses import dataclass -from typing import List, Optional, Tuple +from typing import Optional import torch from megatron.core.inference.batch_dimensions_utils import InferenceBatchDimensions -@dataclass -class MambaInferenceStateConfig: - """ - Config for initializing Mamba model inference state tensors. - - Note that we maintain separate metadata for decode, regular prefill, and - chunked prefill requests because the Mamba kernels do not yet support mixing - these. Once the kernels have been updated we can simplify this code. - """ - - layer_type_list: List[str] - """ - A list of strings that indicates the layer type (Mamba / Attention / MLP) for each layer. - See `megatron/core/ssm/mamba_hybrid_layer_allocation.py` for the list of symbols. - """ - - mamba_conv_states_shape: Tuple[int] - """Mamba conv states shape per request.""" - - mamba_ssm_states_shape: Tuple[int] - """Mamba ssm states shape per request.""" - - class MambaMetadata: """Manages the metadata tensors required for Mamba layers during inference.""" diff --git a/megatron/core/inference/contexts/base_context.py b/megatron/core/inference/contexts/base_context.py index 3dfec6de3ad..4f03726fe3d 100644 --- a/megatron/core/inference/contexts/base_context.py +++ b/megatron/core/inference/contexts/base_context.py @@ -2,6 +2,8 @@ import abc +from megatron.core.inference.config import InferenceConfig + class BaseInferenceContext(abc.ABC): """Base class for inference contexts. 
@@ -10,13 +12,11 @@ class BaseInferenceContext(abc.ABC): Extend this class for any future contexts types. """ - def __init__(self, materialize_only_last_token_logits: bool): + def __init__(self, inference_config: InferenceConfig): """ Args: - materialize_only_last_token_logits (bool): - If True, only the last-token logits will be extracted during decode """ - self.materialize_only_last_token_logits = materialize_only_last_token_logits + self.config = inference_config @abc.abstractmethod def is_static_batching(self) -> bool: diff --git a/megatron/core/inference/contexts/dynamic_context.py b/megatron/core/inference/contexts/dynamic_context.py index 5dc2d503097..915180a5ca2 100644 --- a/megatron/core/inference/contexts/dynamic_context.py +++ b/megatron/core/inference/contexts/dynamic_context.py @@ -4,22 +4,19 @@ import math import warnings from contextlib import nullcontext -from typing import TYPE_CHECKING, List, Optional, Sequence, Tuple +from typing import List, Optional, Sequence, Tuple -import torch -import torch.nn.functional as F -from packaging.version import Version as PkgVersion -from torch import Tensor +import torch # type: ignore +import torch.nn.functional as F # type: ignore +from torch import Tensor # type: ignore from megatron.core import parallel_state from megatron.core.inference.batch_dimensions_utils import ( CUDAGraphBatchDimensionBuilder, InferenceBatchDimensions, ) +from megatron.core.inference.config import InferenceConfig from megatron.core.inference.inference_request import DynamicInferenceRequest -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.unified_memory import ( UnifiedMemoryUnsupportedError, @@ -28,13 +25,13 @@ from megatron.core.inference.utils import tensor_swap from megatron.core.models.common.embeddings.rope_utils import apply_rotary_pos_emb from 
megatron.core.package_info import __version__ as mcore_version -from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.ssm.mamba_hybrid_layer_allocation import get_layer_maps_from_layer_type_list -from megatron.core.transformer import TransformerConfig +from megatron.core.transformer import MLATransformerConfig, TransformerConfig +from megatron.core.utils import deprecate_args from megatron.core.utils import divide as core_divide -from megatron.core.utils import get_attr_wrapped_model, get_pg_size, internal_api +from megatron.core.utils import get_pg_size, internal_api -from .attention_context.mamba_metadata import MambaInferenceStateConfig, MambaMetadata +from .attention_context.mamba_metadata import MambaMetadata from .attention_context.mha_metadata import GraphedMHAMetadata, NonGraphedMHAMetadata from .base_context import BaseInferenceContext from .dynamic_block_allocator import BlockAllocator @@ -45,14 +42,7 @@ triton_append_key_value_cache = None try: - from packaging.version import Version as PkgVersion - - HAVE_PACKAGING = True -except: - HAVE_PACKAGING = False - -try: - import flashinfer # pylint: disable=unused-import + import flashinfer # type: ignore # pylint: disable=unused-import HAVE_FLASHINFER = True except ImportError: @@ -66,16 +56,36 @@ except ImportError: HAVE_TORCH_MEMORY_SAVER = False -try: - import wandb # pylint: disable=unused-import - - HAVE_WANDB = True -except ImportError: - HAVE_WANDB = False - wandb = None - -if TYPE_CHECKING: - import wandb as WandbModule +DEPRECATED_ARGS = [ + "params_dtype", + "num_layers", + "kv_channels", + "num_attention_heads", + "max_sequence_length", + "buffer_size_gb", + "paused_buffer_size_gb", + "max_requests", + "max_tokens", + "block_size_tokens", + "tensor_model_parallel_size", + "pipeline_model_parallel_size", + "pg_collection", + "cache_mla_latent", + "kv_lora_rank", + "qk_pos_emb_head_dim", + "num_cuda_graphs", + "materialize_only_last_token_logits", + 
"mamba_inference_state_config", + "use_cuda_graphs_for_non_decode_steps", + "use_flashinfer_fused_rope", + "unified_memory_level", + "cuda_graph_max_tokens", + "cuda_graph_mixed_prefill_count", + "metrics_writer", + "request_metadata_types", + "persist_cuda_graphs", + "offload_kv_cache", +] class ContextOverflowError(Exception): @@ -213,130 +223,45 @@ class DynamicInferenceContext(BaseInferenceContext): given step, any unassigned blocks equate to unused space. Args: - params_dtype (torch.dtype): Dtype used for KV cache. - num_layers (int): Number of layers on this pipeline parallel rank. - kv_channels (int): Hidden dimension per attention head. - num_attention_heads (int): Number of attention heads. - max_sequence_length (int): Max possible sequence length (prompt + output) - that will occur. - buffer_size_gb (float): Buffer size reserved on the GPU for the KV cache. - if `unified_memory_level` >= 1, then CPU memory is additionally - utilized, resulting in a total buffer size of `buffer_size_gb + - paused_buffer_size_gb`. - paused_buffer_size_gb (float | None): Portion of buffer reserved for - paused requests. Active requests are paused when there are not enough - active blocks available to continue generating a request. The total - buffer size (active + paused) depends on `unified_memory_level` (uvm): - - uvm 0: buffer_size_gb (paused buffer is inclusive) - - uvm 1: buffer_size_gb + paused_buffer_size_gb - max_requests (int): Max number of active requests to use for - decode-only forward passes. This value is primarily limited by the - combination of `buffer_size_gb` and `max_sequence_length`. - max_tokens (int): Max number of tokens to use for forward passes. This is - primarily limited by prefill activation memory usage. (Defaults to - 16384). - block_size_tokens (int): Size of KV cache block size. - tensor_model_parallel_size (Optional[int]): Tensor model parallel size. 
- num_cuda_graphs (Optional[int]): Maximum number of cuda graphs to capture, - where the cuda graph batch sizes range from 1 to `max_requests` - (as computed below). Due to rounding, the actual number of cuda graphs - may not equal this argument. - materialize_only_last_token_logits (Optional[bool]): Whether to only - materialize logits for the last token. This should be set to False - if returning log probs. - mamba_inference_state_config (Optional[MambaInferenceStateConfig]): The Mamba - inference state config if the model is a hybrid model. - use_cuda_graphs_for_non_decode_steps (bool): If True, use cuda graphs for non-decode - engine steps. - unified_memory_level (Optional[int]): Set unified memory usage within the - dynamic inference context. The levels are: 0) no unified memory, 1) - allocate `memory_buffer` in unified memory. Eventually, additional - levels will be included to control other tensors within the context. - use_flashinfer_fused_rope (bool): If True, use flashinfer's fused rope implementation. - If None, defaults to using flash-infer if available. - metrics_writer (Optional['WandbModule']): Wandb module for writing metrics. - request_metadata_types (Optional[List[Tuple[str, torch.dtype, bool]]]): A list of the - per-request metadata types to track. Each entry is a tuple consisting of the string - label, the target dtype, and whether to store the data on GPU. + model_config (TransformerConfig): Model config. + inference_config (InferenceConfig): Inference config. 
""" DEFAULT_MAX_TOKENS = 16384 TOKEN_ROUNDER = 64 REQUEST_ROUNDER = 4 - def __init__( - self, - *, - params_dtype: torch.dtype, - num_layers: int, - kv_channels: int, - num_attention_heads: int, - max_sequence_length: int, - buffer_size_gb: float, - paused_buffer_size_gb: float | None = None, - max_requests: int = None, - max_tokens: int = DEFAULT_MAX_TOKENS, - block_size_tokens: int = 256, - tensor_model_parallel_size: Optional[int] = None, - pipeline_model_parallel_size: Optional[int] = None, - pg_collection: Optional[ProcessGroupCollection] = None, - cache_mla_latent: bool = False, - kv_lora_rank: Optional[int] = None, - qk_pos_emb_head_dim: Optional[int] = None, - num_cuda_graphs: Optional[int] = None, - materialize_only_last_token_logits: Optional[bool] = True, - mamba_inference_state_config: Optional[MambaInferenceStateConfig] = None, - use_cuda_graphs_for_non_decode_steps: bool = True, - use_flashinfer_fused_rope: bool = False, - unified_memory_level: Optional[int] = 0, - cuda_graph_max_tokens: Optional[int] = None, - cuda_graph_mixed_prefill_count: Optional[int] = 16, - metrics_writer: Optional['WandbModule'] = None, - request_metadata_types: Optional[List[Tuple[str, torch.dtype, bool]]] = None, - persist_cuda_graphs: Optional[bool] = False, - offload_kv_cache: Optional[bool] = False, - ): - super().__init__(materialize_only_last_token_logits=materialize_only_last_token_logits) - - self.cache_mla_latent = cache_mla_latent + @deprecate_args( + *DEPRECATED_ARGS, + message=( + "Argument `{name}` has been deprecated. 
" + "Only pass `model_config` and `inference_config`" + ), + ) + def __init__(self, model_config: TransformerConfig, inference_config: InferenceConfig): + super().__init__(inference_config=inference_config) + + self.cache_mla_latent = ( + isinstance(model_config, MLATransformerConfig) and model_config.cache_mla_latents + ) if self.cache_mla_latent: assert ( - block_size_tokens == 64 + inference_config.block_size_tokens == 64 ), "Flash MLA requires a block size of 64. Set --inference-dynamic-batching-block-size 64 to fix this assert" - # give deprecated args warning for cuda_graph_max_tokens - if cuda_graph_max_tokens is not None: - warnings.warn( - "`cuda_graph_max_tokens` is deprecated and will be removed in a future release. " - "The context now automatically sets the max tokens for cuda graphs based on " - "`max_requests`.", - DeprecationWarning, - ) - - self.metrics_writer = metrics_writer - # Per partition num heads and hidden size. - projection_size = kv_channels * num_attention_heads - if tensor_model_parallel_size is None: - tp_size = ( - get_pg_size(pg_collection.tp) - if pg_collection is not None - else parallel_state.get_tensor_model_parallel_world_size() - ) + num_attention_heads = model_config.num_query_groups or model_config.num_attention_heads + projection_size = model_config.kv_channels * num_attention_heads + pg_collection = inference_config.pg_collection + if pg_collection is not None: + tp_size = get_pg_size(pg_collection.tp) + pp_size = get_pg_size(pg_collection.pp) else: - tp_size = tensor_model_parallel_size + tp_size = model_config.tensor_model_parallel_size + pp_size = model_config.pipeline_model_parallel_size self.hidden_size_per_attention_head = core_divide(projection_size, num_attention_heads) self.num_attention_heads_per_partition = core_divide(num_attention_heads, tp_size) - if pipeline_model_parallel_size is None: - pp_size = ( - get_pg_size(pg_collection.pp) - if pg_collection is not None - else 
parallel_state.get_pipeline_model_parallel_world_size() - ) - else: - pp_size = pipeline_model_parallel_size - # Cache the PP group we should use for PP collectives inside the context. # If the model provides a pg_collection with a pp group, prefer it. # Otherwise: @@ -357,6 +282,7 @@ def __init__( self.expert_model_parallel_group = None # Mamba states. + mamba_inference_state_config = inference_config.mamba_inference_state_config self.is_hybrid_model = mamba_inference_state_config is not None if self.is_hybrid_model: mamba_conv_states_shape = mamba_inference_state_config.mamba_conv_states_shape @@ -381,7 +307,7 @@ def __init__( self.layer_map = attention_layer_map | mamba_layer_map else: # The layer map is the identity function for pure Transformer models. - self.num_attention_layers = num_layers + self.num_attention_layers = model_config.num_layers // pp_size self.num_mamba_layers = 0 (self.mamba_conv_states_shape, self.mamba_ssm_states_shape) = (None, None) self.layer_map = {i: i for i in range(self.num_attention_layers)} @@ -392,11 +318,11 @@ def __init__( ) # Block size tokens, bytes. - dtype_size_bytes = params_dtype.itemsize - self.block_size_tokens = block_size_tokens + dtype_size_bytes = model_config.params_dtype.itemsize + self.block_size_tokens = inference_config.block_size_tokens if self.cache_mla_latent: # one vector c_t (rank) + optional RoPE phase slice - self.kv_reduced_dim = kv_lora_rank + qk_pos_emb_head_dim + self.kv_reduced_dim = model_config.kv_lora_rank + model_config.qk_pos_emb_head_dim self.block_size_bytes = ( dtype_size_bytes * self.num_attention_layers @@ -422,9 +348,9 @@ def __init__( mamba_states_memory_per_request *= dtype_size_bytes # Unified memory. 
- self.unified_memory_level = unified_memory_level - self.persist_cuda_graphs = persist_cuda_graphs - if unified_memory_level > 0: + self.unified_memory_level = inference_config.unified_memory_level + self.persist_cuda_graphs = inference_config.persist_cuda_graphs + if self.unified_memory_level > 0: try: self.unified_memory_mempool = create_unified_mempool() except UnifiedMemoryUnsupportedError: @@ -435,9 +361,11 @@ def __init__( self.unified_memory_level = 0 # Initialize block allocator. - buffer_size_bytes = int(buffer_size_gb * 1024**3) + buffer_size_bytes = int(inference_config.buffer_size_gb * 1024**3) paused_buffer_size_bytes = ( - 0 if paused_buffer_size_gb is None else int(paused_buffer_size_gb * 1024**3) + 0 + if inference_config.paused_buffer_size_gb is None + else int(inference_config.paused_buffer_size_gb * 1024**3) ) # TODO: Add parameter to control fraction of memory assigned to KV cache # versus Mamba state. @@ -471,13 +399,14 @@ def __init__( ) # Track request metadata. + request_metadata_types = inference_config.request_metadata_types if request_metadata_types is None: request_metadata_types = DynamicInferenceRequest.get_metadata_types() self.request_metadata_types = request_metadata_types # Initialize context state. - self.params_dtype = params_dtype - self.max_sequence_length = max_sequence_length + self.params_dtype = model_config.params_dtype + self.max_sequence_length = inference_config.max_sequence_length # Request and token counts. self.total_request_count = 0 @@ -497,16 +426,16 @@ def __init__( self.max_kv_block_count = math.ceil(self.max_sequence_length / self.block_size_tokens) # Set max_requests, max_tokens. - if max_requests is None: + if inference_config.max_requests is None: # Maximize compute utilization by defaulting to 1 block per request. 
self.max_requests = self.block_allocator.total_count - 1 # -1 for dummy block self.max_requests = self.max_requests // tp_size * tp_size self.max_requests = self.max_requests // self.REQUEST_ROUNDER * self.REQUEST_ROUNDER else: # User can control request overflow via max_requests. - self.max_requests = max_requests + self.max_requests = inference_config.max_requests - self.max_tokens = max_tokens or self.DEFAULT_MAX_TOKENS + self.max_tokens = inference_config.max_tokens or self.DEFAULT_MAX_TOKENS assert self.max_tokens >= self.max_requests, ( f"max_tokens ({self.max_tokens}) must be >= " @@ -538,37 +467,39 @@ def __init__( ) # CUDA graph config list + self.use_cuda_graphs_for_non_decode_steps = ( + inference_config.use_cuda_graphs_for_non_decode_steps + ) self.cuda_graph_batch_dimensions_list, self.cuda_graph_token_counts = ( CUDAGraphBatchDimensionBuilder.generate_cuda_graph_batch_dimensions_list( tp_size=tp_size, - num_cuda_graphs=num_cuda_graphs, + num_cuda_graphs=inference_config.num_cuda_graphs, cuda_graph_max_tokens=self.max_requests, - cuda_graph_mixed_prefill_count=cuda_graph_mixed_prefill_count, + cuda_graph_mixed_prefill_count=inference_config.cuda_graph_mixed_prefill_count, max_requests=self.max_requests, max_tokens=self.max_tokens, max_sequence_length=self.max_sequence_length, - use_cuda_graphs_for_non_decode_steps=use_cuda_graphs_for_non_decode_steps, + use_cuda_graphs_for_non_decode_steps=self.use_cuda_graphs_for_non_decode_steps, ) ) # Whether to offload the KV cache. Determines where the KV cache is allocated within memory. - self.offload_kv_cache = offload_kv_cache + self.offload_kv_cache = inference_config.offload_kv_cache assert not ( self.offload_kv_cache and self.unified_memory_level ), "The KV cache should not be instantiated in unified memory when it is offloaded during training." 
self._using_cuda_graph_this_step = False - self.use_cuda_graphs_for_non_decode_steps = use_cuda_graphs_for_non_decode_steps # Deal with chunked prefill self.chunked_prefill_request_id = -1 self.has_explicit_chunked_prefill_req = False # FlashInfer. - if use_flashinfer_fused_rope is True: + if inference_config.use_flashinfer_fused_rope is True: assert HAVE_FLASHINFER, "flashinfer is not installed" - elif use_flashinfer_fused_rope is None: - use_flashinfer_fused_rope = HAVE_FLASHINFER - self.use_flashinfer_fused_rope = use_flashinfer_fused_rope + elif inference_config.use_flashinfer_fused_rope is None: + inference_config.use_flashinfer_fused_rope = HAVE_FLASHINFER + self.use_flashinfer_fused_rope = inference_config.use_flashinfer_fused_rope # Allocate GPU state. self.is_tensor_state_allocated = False @@ -756,14 +687,7 @@ def deallocate_all_tensors(self): @classmethod def round_up_tokens(cls, value, tp_size=None): - """Round up to nearest multiple of `TOKEN_ROUNDER` (above) that is also divisible by tensor model parallel size.""" - if not HAVE_PACKAGING: - raise ImportError( - "`packaging` is required for this functionality, please install it with `pip install packaging`" - ) - if PkgVersion(mcore_version) < PkgVersion("0.13"): - return cls.round_up(value) - + """Round up to nearest multiple of `TOKEN_ROUNDER` that is also divisible by tensor model parallel size.""" # Make sure divisible by TP size if tp_size is None: # Check if parallel state is initialized before trying to get TP size @@ -775,72 +699,9 @@ def round_up_tokens(cls, value, tp_size=None): return token_rounder * int(math.ceil(int(value) / token_rounder)) - @classmethod - def from_config( - cls, - inference_config: InferenceWrapperConfig, - model, - max_batch_size: int, - buffer_size_gb: float = 40, - num_cuda_graphs: int = None, - mamba_inference_state_config: Optional[MambaInferenceStateConfig] = None, - unified_memory_level: int = 0, - ): - """ - Instantiate a `DynamicInferenceContext` from a 
`TransformerConfig` and an `InferenceWrapperConfig`. - """ - # TODO: Add other necessary configs from inference_config - - # Max sequence length. - position_embedding_type = get_attr_wrapped_model(model, "position_embedding_type") - model_max_seq_len = get_attr_wrapped_model(model, "max_sequence_length") - inf_max_seq_len = inference_config.inference_max_seq_length - - if position_embedding_type == "learned_absolute": - # When using absolute position embeddings, it is critical that the - # context's `max_sequence_length` is less than or equal to the model's - # `max_sequence_length`. Otherwise, the context's `position_ids` will - # contain ids greater than the dimension of the position embedding - # tensor, which will result in an index error. - if inf_max_seq_len: - max_sequence_length = min(model_max_seq_len, inf_max_seq_len) - else: - max_sequence_length = model_max_seq_len - assert max_batch_size <= model_max_seq_len - else: - max_sequence_length = ( - inference_config.inference_max_seq_length or model_config.max_sequence_length - ) - max_sequence_length = max(max_sequence_length, max_batch_size) - - # Context. 
- model_config = model.config - return cls( - params_dtype=inference_config.params_dtype, - num_layers=model_config.num_layers // model_config.pipeline_model_parallel_size, - kv_channels=model_config.kv_channels, - num_attention_heads=model_config.num_query_groups, - tensor_model_parallel_size=model_config.tensor_model_parallel_size, - pipeline_model_parallel_size=model_config.pipeline_model_parallel_size, - max_sequence_length=max_sequence_length, - buffer_size_gb=buffer_size_gb, - materialize_only_last_token_logits=False, - num_cuda_graphs=num_cuda_graphs, - use_flashinfer_fused_rope=None, - mamba_inference_state_config=mamba_inference_state_config, - unified_memory_level=unified_memory_level, - ) - @classmethod def round_up_requests(cls, value, tp_size=None): - """Round up to nearest multiple of `REQUEST_ROUNDER` (above) that is also divisible by tensor model parallel size.""" - if not HAVE_PACKAGING: - raise ImportError( - "`packaging` is required for this functionality, please install it with `pip install packaging`" - ) - if PkgVersion(mcore_version) < PkgVersion("0.13"): - return cls.round_up(value) - + """Round up to nearest multiple of `REQUEST_ROUNDER` that is also divisible by tensor model parallel size.""" # Make sure divisible by TP size if tp_size is None: # Check if parallel state is initialized before trying to get TP size @@ -852,16 +713,6 @@ def round_up_requests(cls, value, tp_size=None): return request_rounder * int(math.ceil(int(value) / request_rounder)) - @classmethod - def round_up(cls, value): - """Deprecated in favor of round_up_tokens and round_up_requests.""" - warnings.warn( - "`round_up` is deprecated in favor of `round_up_tokens` or `round_up_requests` " - "and will be removed in `megatron-core` 0.14." - ) - ROUNDER = getattr(cls, "ROUNDER", 64) - return ROUNDER * int(math.ceil(int(value) / ROUNDER)) - def is_static_batching(self) -> bool: """Is static batching? 
False.""" return False @@ -882,6 +733,7 @@ def has_unfinished_requests(self) -> bool: def cu_query_lengths(self) -> Tuple[Tensor, int]: """Cumulative query sequence lengths.""" + assert self.active_attn_metadata is not None return ( self.active_attn_metadata["mha_metadata"].state_data["cu_query_seq_lengths"], self.active_attn_metadata["mha_metadata"].state_data["max_seqlen_q"], @@ -889,6 +741,7 @@ def cu_query_lengths(self) -> Tuple[Tensor, int]: def cu_kv_lengths(self) -> Tuple[Tensor, Tensor, int]: """Cumulative key/value sequence lengths.""" + assert self.active_attn_metadata is not None return ( self.active_attn_metadata["mha_metadata"].state_data["cu_kv_seq_lengths"], self.active_attn_metadata["mha_metadata"].state_data["kv_seq_lengths"], @@ -958,18 +811,20 @@ def append_key_value_cache(self, layer_number: int, key: Tensor, value: Tensor) : self.padded_active_token_count ] - def key_value_cache(self, layer_number: int) -> Tuple[Tensor, Tensor]: + def key_value_cache(self, layer_number: int) -> Tuple[Tensor, Optional[Tensor], Tensor]: """Read from KV cache. Args: layer_number (int): Layer number. Return: - (Tuple[Tensor, Tensor]) The key and value pointer tensors that point - to blocks within the block-level memory buffer. + (Tuple[Tensor, Tensor, Tensor]) The key and value pointer tensors that point + to blocks within the block-level memory buffer as well as the block table. """ attention_layer_number = self.layer_map[layer_number - 1] + assert self.active_attn_metadata is not None + if self.cache_mla_latent: return ( self.memory_buffer[attention_layer_number], @@ -1386,9 +1241,9 @@ def initialize_attention_state( ] = 0 self.active_attn_metadata = ( - self.graph_attn_metadata + self.graph_attn_metadata # type: ignore[assignment] if self.using_cuda_graph_this_step() - else self.non_graph_attn_metadata + else self.non_graph_attn_metadata # type: ignore[assignment] ) # Update cu_query_seq_lengths, max_seqlen_q. 
@@ -1413,6 +1268,7 @@ def initialize_attention_state( has_explicit_chunked_prefill_req=False, ) + assert self.active_attn_metadata is not None self.active_attn_metadata["mha_metadata"].update( request_query_lengths=query_lengths_view, request_kv_length_offsets=request_kv_length_offsets_view, @@ -1545,7 +1401,7 @@ def last_token_logits(self, logits: Tensor) -> Tensor: return last_token_logits - def check_availability(self, req: DynamicInferenceRequest) -> (bool, bool, bool): + def check_availability(self, req: DynamicInferenceRequest) -> Tuple[bool, bool, bool]: """ Check if the request can be added to the context. """ @@ -1784,7 +1640,7 @@ def resume_paused_requests( active_request_count: int, newly_paused_request_ids: torch.Tensor, next_tokens: torch.Tensor, - ) -> tuple[int, int, torch.Tensor]: + ) -> tuple[int, torch.Tensor]: """Resume as many paused requests as we have space for in the active buffer. Args: @@ -1863,7 +1719,7 @@ def resume_paused_requests( def evict_overflow_paused_requests( self, active_request_count: int, next_tokens: torch.Tensor - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Optional[tuple[torch.Tensor, torch.Tensor]]: """Evict requests that overflow the paused buffer. Args: diff --git a/megatron/core/inference/contexts/static_context.py b/megatron/core/inference/contexts/static_context.py index 8c83d2f09b3..a15b33c414a 100644 --- a/megatron/core/inference/contexts/static_context.py +++ b/megatron/core/inference/contexts/static_context.py @@ -1,8 +1,6 @@ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
-from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) +from megatron.core.inference.config import InferenceConfig from .base_context import BaseInferenceContext @@ -19,7 +17,8 @@ class StaticInferenceContext(BaseInferenceContext): def __init__( self, max_batch_size: int, max_sequence_length: int, use_flashinfer_fused_rope: bool = None ): - super().__init__(materialize_only_last_token_logits=True) + config = InferenceConfig(materialize_only_last_token_logits=True) + super().__init__(inference_config=config) self.max_sequence_length = max_sequence_length self.max_batch_size = max_batch_size self.sequence_len_offset = 0 @@ -27,13 +26,6 @@ def __init__( self.key_value_memory_dict = {} self.decode_mode = False - @classmethod - def from_config(cls, config: InferenceWrapperConfig) -> "StaticInferenceContext": - """Initialize context from a config.""" - max_batch_size = config.inference_max_requests - max_sequence_length = config.inference_max_seq_length - return cls(max_batch_size, max_sequence_length) - def swap_key_value_dict(self, batch_idx): "swap between batches" if len(self.key_value_memory_dict) == 0: diff --git a/megatron/core/inference/engines/dynamic_engine.py b/megatron/core/inference/engines/dynamic_engine.py index 0a95e8f4a53..882db6b3a6a 100644 --- a/megatron/core/inference/engines/dynamic_engine.py +++ b/megatron/core/inference/engines/dynamic_engine.py @@ -42,6 +42,7 @@ from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.transformer.cuda_graphs import delete_cuda_graphs from megatron.core.utils import ( + deprecate_args, experimental_api, get_asyncio_loop, get_pg_rank, @@ -89,6 +90,14 @@ except ImportError: HAVE_PSUTIL = False +DEPRECATED_ARGS = [ + "enable_cuda_graph", + "random_seed", + "track_paused_request_events", + "enable_chunked_prefill", + "inference_logging_step_interval", + "pg_collection", +] from 
megatron.core.inference.contexts.dynamic_context import HAVE_TORCH_MEMORY_SAVER if HAVE_TORCH_MEMORY_SAVER: @@ -136,24 +145,13 @@ class DynamicInferenceEngine(AbstractEngine): outputs and detokenizer the output tokens. inference_context (DynamicInferenceContext): Context for managing in-flight batching and a dynamic block-level KV cache (similar to paged attention). - random_seed (Optional[int]): Use a random seed if you want deterministic - results. Defaults to None. - inference_logging_step_interval (int): The step interval at which to log - inference metrics to wandb. Defaults to 0, which means no logging. """ - def __init__( - self, - controller: TextGenerationController, - context: DynamicInferenceContext, - enable_cuda_graph: Optional[bool] = None, - random_seed: Optional[int] = None, - *, - track_paused_request_events: bool = False, - enable_chunked_prefill: bool = True, - inference_logging_step_interval: int = 0, - pg_collection: Optional[ProcessGroupCollection] = None, - ): + @deprecate_args( + *DEPRECATED_ARGS, + message="Argument `{name}` has been deprecated. Only pass `controller` and `context`", + ) + def __init__(self, controller: TextGenerationController, context: DynamicInferenceContext): assert isinstance( controller, TextGenerationController @@ -161,40 +159,28 @@ def __init__( assert isinstance( context, DynamicInferenceContext ), f"context must be a DynamicInferenceContext, got {type(context)}" - assert isinstance(random_seed, int), f"random_seed must be an int, got {type(random_seed)}" - - # Deprecate `enable_cuda_graph`. - if enable_cuda_graph is not None: - warnings.warn( - "The `enable_cuda_graph` argument is deprecated and will be " - "removed in `megatron-core 0.15`. `enable_cuda_graph` is now " - "read directly from the transformer config object." 
- ) - self.enable_cuda_graph = enable_cuda_graph - else: - self.enable_cuda_graph = ( - controller.inference_wrapped_model.model.config.enable_cuda_graph - ) - if pg_collection is not None: - self.pg_collection = pg_collection + model_config = controller.inference_wrapped_model.model.config + inference_config = context.config + + if inference_config.pg_collection is not None: + self.pg_collection = inference_config.pg_collection else: self.pg_collection = ProcessGroupCollection.use_mpu_process_groups() # Initialization options. self.controller = controller self.context = context - self.random_seed = random_seed - self.track_paused_request_events = track_paused_request_events - self.enable_chunked_prefill = enable_chunked_prefill - self.inference_logging_step_interval = inference_logging_step_interval - self.unified_memory_level = context.unified_memory_level - self.persist_cuda_graphs = context.persist_cuda_graphs - - if enable_cuda_graph is not None: - self.cuda_graph_impl = "local" if enable_cuda_graph else "none" - else: - self.cuda_graph_impl = controller.inference_wrapped_model.model.config.cuda_graph_impl + self.track_paused_request_events = inference_config.track_paused_request_events + self.enable_chunked_prefill = inference_config.enable_chunked_prefill + self.metrics_writer = inference_config.metrics_writer + self.logging_step_interval = inference_config.logging_step_interval + self.unified_memory_level = inference_config.unified_memory_level + self.persist_cuda_graphs = inference_config.persist_cuda_graphs + self.materialize_only_last_token_logits = ( + inference_config.materialize_only_last_token_logits + ) + self.cuda_graph_impl = model_config.cuda_graph_impl # Initialize engine. 
self.reset() @@ -205,12 +191,12 @@ def __init__( ) # Configure wandb to use separate step counter for inference metrics (only once) - if self.inference_logging_step_interval > 0 and self.context.metrics_writer is not None: + if self.logging_step_interval > 0 and self.metrics_writer is not None: logging.info( f"\033[1;93m[INFERENCE]\033[0m " f"\033[1;95mLogging inference metrics to wandb (rank {self.rank})\033[0m" ) - if HAVE_WANDB and self.context.metrics_writer.__name__ == "wandb": + if HAVE_WANDB and self.metrics_writer.__name__ == "wandb": # Make all inference/* metrics use inference_step as their x-axis # This allows inference and training to have independent step counters context.metrics_writer.define_metric( @@ -288,8 +274,6 @@ def create_cuda_graphs(self, reset_context: bool = True): context = self.context controller = self.controller - config = controller.inference_wrapped_model.inference_wrapper_config - time_start = time.time() mem_stats_start = torch.cuda.memory_stats() @@ -733,7 +717,7 @@ def _add_request( request.sampling_params.return_log_probs and not request.sampling_params.skip_prompt_log_probs ): - assert not self.context.materialize_only_last_token_logits, ( + assert not self.materialize_only_last_token_logits, ( "Prompt log probs cannot be calculated if only last token logits are materialized. " "Set materialize_only_last_token_logits to False in DynamicInferenceContext " "or skip_prompt_log_probs to True in SamplingParams." 
@@ -922,7 +906,7 @@ def post_process_requests( # For chunked prefill with materialize_only_last_token_logits, discard intermediate log probs if ( request_id == self.context.chunked_prefill_request_id - and self.context.materialize_only_last_token_logits + and self.materialize_only_last_token_logits ): request.prompt_log_probs = [] request.generated_log_probs = [] @@ -1202,10 +1186,10 @@ async def async_forward(self) -> Tuple[Dict, Dict, float, int]: range_pop() if ( - self.inference_logging_step_interval > 0 + self.logging_step_interval > 0 and self.step_count > 0 - and self.step_count % self.inference_logging_step_interval == 0 - and self.context.metrics_writer is not None + and self.step_count % self.logging_step_interval == 0 + and self.metrics_writer is not None ): kvcache_util_stats = self.context.get_kvcache_utilization_stats() else: @@ -1338,18 +1322,13 @@ async def async_bookkeep( else: metrics[f'inference/{key}'] = value - if HAVE_WANDB and self.context.metrics_writer.__name__ == "wandb": - self.context.metrics_writer.log(metrics, commit=True) + if HAVE_WANDB and self.metrics_writer.__name__ == "wandb": + self.metrics_writer.log(metrics, commit=True) else: - raise ValueError( - f"Unsupported metrics writer type: {type(self.context.metrics_writer)}" - ) + raise ValueError(f"Unsupported metrics writer type: {type(self.metrics_writer)}") # Print context state. 
- if ( - self.inference_logging_step_interval > 0 - and step_count % self.inference_logging_step_interval == 0 - ): + if self.logging_step_interval > 0 and step_count % self.logging_step_interval == 0: mem = torch.cuda.memory_stats() step_type = "decode" if context_state["is_decode_only"] else "non-decode" output_str = ( diff --git a/megatron/core/inference/engines/static_engine.py b/megatron/core/inference/engines/static_engine.py index d4c61965d2b..5ae37d5967e 100644 --- a/megatron/core/inference/engines/static_engine.py +++ b/megatron/core/inference/engines/static_engine.py @@ -8,7 +8,8 @@ import torch from megatron.core.inference.async_stream import AsyncStream -from megatron.core.inference.contexts.dynamic_context import DynamicInferenceContext +from megatron.core.inference.config import InferenceConfig, MambaInferenceStateConfig +from megatron.core.inference.contexts import DynamicInferenceContext, StaticInferenceContext from megatron.core.inference.engines.abstract_engine import AbstractEngine from megatron.core.inference.engines.dynamic_engine import DynamicInferenceEngine from megatron.core.inference.inference_request import InferenceRequest @@ -17,7 +18,7 @@ from megatron.core.inference.text_generation_controllers.text_generation_controller import ( TextGenerationController, ) -from megatron.core.utils import get_asyncio_loop, get_mamba_inference_state_config_from_model +from megatron.core.utils import get_asyncio_loop try: from tqdm import tqdm @@ -42,8 +43,6 @@ class StaticInferenceEngine(AbstractEngine): controller that will be used to define how to preprocess prompts, generate outputs and detokenizer the output tokens. max_batch_size (int, optional): The maximum number of requests to process at once. - Will be set from the InferenceWrapperConfig in `text_generation_controller` by - default. random_seed (int, optional): Use a random seed if you want deterministic results. Defaults to None. 
""" @@ -69,53 +68,55 @@ def __init__( DeprecationWarning, ) - inference_wrapper_config = ( - text_generation_controller.inference_wrapped_model.inference_wrapper_config - ) self.controller = text_generation_controller + self.inference_wrapped_model = self.controller.inference_wrapped_model + self.config = self.inference_wrapped_model.config self.random_seed = random_seed or 1234 - inference_max_batch_size = inference_wrapper_config.inference_max_requests + # Store original context in case we need to fall back to legacy static engine + original_context = self.inference_wrapped_model.inference_context + assert original_context is not None + assert isinstance(original_context, StaticInferenceContext) + if max_batch_size is None: - max_batch_size = inference_max_batch_size - elif max_batch_size > inference_max_batch_size: + max_batch_size = original_context.max_batch_size + elif max_batch_size > original_context.max_batch_size: warnings.warn( f"Engine `max_batch_size` ({max_batch_size}) > " - f"`inference_max_requests` in `inference_wrapper_config` " - f"({inference_max_batch_size}); setting `max_batch_size` to " - f"{inference_max_batch_size}", + f"`context.max_batch_size` in `inference_wrapped_model.inference_context` " + f"({original_context.max_batch_size}); setting `max_batch_size` to " + f"{original_context.max_batch_size}", UserWarning, ) - max_batch_size = inference_max_batch_size + max_batch_size = original_context.max_batch_size self.scheduler = Scheduler(max_batch_size=max_batch_size) - # Store original context in case we need to fall back to legacy static engine - original_context = text_generation_controller.inference_wrapped_model.inference_context - - mamba_inference_state_config = get_mamba_inference_state_config_from_model( - text_generation_controller.inference_wrapped_model.model + mamba_inference_state_config = MambaInferenceStateConfig.from_model( + self.inference_wrapped_model.model ) try: if not legacy: - dynamic_context = 
DynamicInferenceContext.from_config( - inference_config=inference_wrapper_config, - model=text_generation_controller.inference_wrapped_model.model, - max_batch_size=max_batch_size, - buffer_size_gb=buffer_size_gb, - num_cuda_graphs=1, - mamba_inference_state_config=mamba_inference_state_config, + dynamic_context = DynamicInferenceContext( + model_config=self.config, + inference_config=InferenceConfig( + max_sequence_length=original_context.max_sequence_length, + buffer_size_gb=buffer_size_gb, + mamba_inference_state_config=mamba_inference_state_config, + max_requests=max_batch_size, + num_cuda_graphs=1, + block_size_tokens=256, + unified_memory_level=0, + ), ) + self.controller.inference_wrapped_model.inference_context = dynamic_context self.controller.inference_wrapped_model.prep_model_for_inference() self.controller._init_dynamic_sampling_tensors() self.dynamic_engine = DynamicInferenceEngine( - controller=self.controller, - random_seed=self.random_seed, - context=dynamic_context, - enable_cuda_graph=True, + controller=self.controller, context=dynamic_context ) except Exception as e: # Get exception details for better debugging @@ -229,13 +230,20 @@ def generate_using_dynamic_engine( if prompts: if add_BOS: sampling_params.add_BOS = True - return self.dynamic_engine.generate(prompts=prompts, sampling_params=sampling_params) + request_records = self.dynamic_engine.generate( + prompts=prompts, sampling_params=sampling_params + ) elif inference_requests: prompts = [request.prompt for request in inference_requests] sampling_params = inference_requests[0].sampling_params if add_BOS: sampling_params.add_BOS = True - return self.dynamic_engine.generate(prompts=prompts, sampling_params=sampling_params) + request_records = self.dynamic_engine.generate( + prompts=prompts, sampling_params=sampling_params + ) + + # Return the underlying `InferenceRequest` objects from the `DynamicInferenceRequestRecord`s. 
+ return [record.merge() for record in request_records] def generate_using_legacy_static_engine( self, diff --git a/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py b/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py index 6a17de685bf..6ef5ac3a2e5 100644 --- a/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +++ b/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py @@ -1,8 +1,6 @@ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. import abc -import math -import warnings from typing import Any, Dict, Iterable, Optional, Union import torch @@ -15,27 +13,22 @@ send_to_next_pipeline_rank, ) from megatron.core.inference.contexts import BaseInferenceContext -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) from megatron.core.models.gpt.gpt_model import GPTModel from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.utils import get_attr_wrapped_model, get_model_config -# pylint: disable=line-too-long class AbstractModelInferenceWrapper(abc.ABC): """Abstract inference wrapper Extend this to create a version for your model. - The wrapper prepares the model for inference, provides the required input data and runs the forward pass. + The wrapper prepares the model for inference, provides the required input data and + runs the forward pass. Args: model (Union[GPTModel, LegacyGPTModel]): The actual GPT model (MCore or MLM). - inference_wrapper_config (InferenceWrapperConfig): Has info like - hidden size, vocab size etc. inference_context (BaseInferenceContext): Context for managing KV cache and other inference params. pg_collection (ProcessGroupCollection): Process groups for model communication. 
@@ -44,30 +37,18 @@ class AbstractModelInferenceWrapper(abc.ABC): def __init__( self, model: Union['LegacyGPTModel', GPTModel], # type: ignore[name-defined] - inference_wrapper_config: InferenceWrapperConfig, - inference_context: Optional[BaseInferenceContext] = None, + inference_context: BaseInferenceContext, pg_collection: Optional[ProcessGroupCollection] = None, ): assert not isinstance( model, Iterable ), 'interleaving schedule is not supported for inference' self.model = model - self.inference_wrapper_config = inference_wrapper_config + self.config = get_model_config(self.model) self.pipeline_communication_dtype = ( - torch.float - if self.inference_wrapper_config.fp32_residual_connection - else self.inference_wrapper_config.params_dtype + torch.float if self.config.fp32_residual_connection else self.config.params_dtype ) - model_config = get_model_config(self.model) - self.sequence_parallel = model_config.sequence_parallel - - if inference_context is None: - warnings.warn( - "`inference_context` must be passed in as an argument starting in `megatron-core` 0.13." - ) - from megatron.core.inference.contexts import StaticInferenceContext - - inference_context = StaticInferenceContext.from_config(inference_wrapper_config) + self.sequence_parallel = self.config.sequence_parallel self.inference_context = inference_context @@ -78,40 +59,18 @@ def __init__( self.pp_group = pg_collection.pp self.tp_size = torch.distributed.get_world_size(self.tp_group) - if self.inference_wrapper_config.fp8 is not None: + if self.config.fp8 is not None: self.model = prepare_model_for_fp8_inference(self.model) - @property - def inference_params(self): - """Getter for deprecated `inference_params`.""" - warnings.warn( - "`inference_params` renamed to `inference_context`, and will be removed in `megatron-core` 0.13." 
- ) - return self.inference_context + # TODO(ksanthanam): Add support for fp4 - @inference_params.setter - def inference_params(self, value): - """Setter for deprecated `inference_params`.""" - warnings.warn( - "`inference_params` renamed to `inference_context`, and will be removed in `megatron-core` 0.13." - ) - self.inference_context = value - - def prep_model_for_inference(self, prompts_tokens: Optional[torch.Tensor] = None): + def prep_model_for_inference(self): """A utility function for preparing model for inference The function gets called once before the auto regressive inference loop. It puts the model in eval mode. - Args: - prompts_tokens (torch.Tensor, optional): Deprecated, will be removed in `megatron-core` 0.13 """ - if prompts_tokens is not None: - warnings.warn( - "Passing `prompts_tokens` is deprecated and this argument will be ignored." - "This parameter will be removed in `megatron-core` 0.13." - ) - self.model.eval() # For TP only model both is_pp_first_stage and _is_pp_last_stage returns True @@ -137,7 +96,9 @@ def prep_inference_input(self, prompt_tokens) -> Dict[str, Any]: def get_batch_for_context_window(self, *args, **kwargs) -> Dict[str, Any]: """Returns the input data for inference - This function gets called iteratively in the inference loop . It can be used to extract relevant input from the prompt tokens, attention mask etc. required for each step in inference. + This function gets called iteratively in the inference loop. + It can be used to extract relevant input from the prompt tokens, attention mask etc. + required for each step in inference. """ raise NotImplementedError() @@ -183,15 +144,16 @@ def _get_batch_size_and_seq_len( self, tokens: torch.Tensor, recv_buffer_seq_len: Optional[int] = None ): """ - Returns the batch size and sequence length based on the tokens tensor and recv_buffer_seq_len. + Returns the batch size and sequence length based on the tokens tensor and + recv_buffer_seq_len. 
Args: tokens (torch.Tensor): The input tensor of shape (batch_size, seq_len). recv_buffer_seq_len (int, optional): An optional recv buffer sequence length. Returns: - tuple: A tuple (batch_size, seq_len), where batch_size is the first dimension of tokens - and seq_len is either the second dimension or recv_buffer_seq_len. + tuple: A tuple (batch_size, seq_len), where batch_size is the first dimension of + tokens and seq_len is either the second dimension or recv_buffer_seq_len. """ batch_size = tokens.shape[0] seq_len = recv_buffer_seq_len if recv_buffer_seq_len is not None else tokens.shape[1] @@ -204,7 +166,7 @@ def _allocate_recv_buffer(self, batch_size, seq_len): # sequence parallelism. Static batching does not support sequence parallelism # except for the MoE layers which is handled separately. seq_len = seq_len // self.tp_size - recv_size = (seq_len, batch_size, self.inference_wrapper_config.hidden_size) + recv_size = (seq_len, batch_size, self.config.hidden_size) return torch.empty( recv_size, dtype=self.pipeline_communication_dtype, device=torch.cuda.current_device() ) @@ -214,10 +176,12 @@ def forward_pass_without_pipeline_parallel( ) -> torch.Tensor: """Utility to carry out simple forward pass for TP or no model parallel models - Runs a very simple forward pass for model. Used in the case of models without any parallelism or only tensor parallelism. + Runs a very simple forward pass for model. Used in the case of models without any + parallelism or only tensor parallelism. 
Args: - inference_input (Dict[str, Any]): A dict containg the inputs for the gpt model [tokens, position ids, attention mask] + inference_input (Dict[str, Any]): A dict containg the inputs for the gpt model + [tokens, position ids, attention mask] Returns: torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size] @@ -228,16 +192,18 @@ def forward_pass_without_pipeline_parallel( return logits - def forward_pass_with_pipeline_parallel_small_input_batch( + def forward_pass_with_pipeline_parallel( self, inference_input: Dict[str, Any], recv_buffer_seq_len: Optional[int] = None ) -> torch.Tensor: - """Utility to carry out forward pass for PP models with very small inputs + """Utility to carry out forward pass for PP models - If a model is pipeline parallel, yet, the input global batch is very small, we compute a foward pass on the entire global batch, rather than splitting it up into micro batches and doing something more complex as in the forward_pass_with_pipeline_parallel_large_input_batch method + TODO: Add support for asynchronous microbatches Args: - inference_input (Dict[str, Any]): A dict containing the inputs for the gpt model [tokens, position ids, attention mask] - recv_buffer_seq_len (int): An optional sequence length for the pipeline parallel recv buffer. + inference_input (Dict[str, Any]): A dict containing the inputs for the gpt model + [tokens, position ids, attention mask] + recv_buffer_seq_len (int): An optional sequence length for the pipeline parallel + recv buffer. 
Returns: torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size] @@ -268,98 +234,8 @@ def forward_pass_with_pipeline_parallel_small_input_batch( logits = output_tensor # Explicitly cast logits to expected dtype - logits = logits.to(self.inference_wrapper_config.params_dtype) - - return logits - - def forward_pass_with_pipeline_parallel_large_input_batch( - self, inference_input: Dict[str, Any], recv_buffer_seq_len=None - ) -> torch.Tensor: - """Utility to carry out forward pass PP models. - - Runs the forward pass for models which are pipeline parallel. - This is more complex than forward_pass_with_pipeline_parallel_small_input_batch because - this splits the global batch into small micro batches and runs them through the model. - - Args: - inference_input (Dict[str, Any]): A dict containg the inputs for the gpt model [tokens, position ids, attention mask] - recv_buffer_seq_len (int): An optional sequence length for the pipeline parallel recv buffer. - - Returns: - torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size] - """ - tokens = inference_input["tokens"] - position_ids = inference_input["position_ids"] - attention_mask = inference_input["attention_mask"] - materialize_only_last_token_logits = ( - self.inference_context.materialize_only_last_token_logits - ) - - micro_batch_size = max( - 1, - self.inference_wrapper_config.inference_batch_times_seqlen_threshold // tokens.size(1), - ) - batch_size, seq_len = self._get_batch_size_and_seq_len(tokens, recv_buffer_seq_len) - # Round up to account for the last partial micro batch if present - num_micro_batches = math.ceil(batch_size / micro_batch_size) - - logits = None - # Preallocate memory for output logits. 
- if is_pipeline_last_stage(self.pp_group): - logits_seq_len = 1 if materialize_only_last_token_logits else seq_len - logits = torch.empty( - (batch_size, logits_seq_len, self.inference_wrapper_config.padded_vocab_size), - dtype=self.pipeline_communication_dtype, - device=torch.cuda.current_device(), - ) - - recv_buffer = None - if not is_pipeline_first_stage(self.pp_group): - recv_buffer = self._allocate_recv_buffer(micro_batch_size, seq_len) - for micro_batch_index in range(num_micro_batches): - start = micro_batch_index * micro_batch_size - end = min(start + micro_batch_size, batch_size) - tokens2use = tokens[start:end, ...] - position_ids2use = position_ids[start:end, ...] - current_micro_batch_size = end - start - - # Need to change recv buffer shape for the last partial microbatch (if exists) - if current_micro_batch_size != micro_batch_size: - recv_buffer = self._allocate_recv_buffer(current_micro_batch_size, seq_len) - - if not is_pipeline_first_stage(self.pp_group): - recv_from_prev_pipeline_rank_(recv_buffer, self.pp_group) - - self.model.set_input_tensor(recv_buffer) - - output_tensor = self._forward( - { - "tokens": tokens2use, - "position_ids": position_ids2use, - "attention_mask": attention_mask, - "inference_context": self.inference_context, - } - ) - - if not is_pipeline_last_stage(self.pp_group): - send_to_next_pipeline_rank(output_tensor, self.pp_group) - - self.inference_context.batch_size_offset += current_micro_batch_size - - if is_pipeline_last_stage(self.pp_group): - assert logits is not None - logits[start:end, ...] 
= output_tensor - - # Explicitly cast logits to expected dtype - if is_pipeline_last_stage(self.pp_group): - assert logits is not None - logits = logits.to(self.inference_wrapper_config.params_dtype) - - # Once done with all micro batches, we reset batch size offset and seq len offset - self.inference_context.increment_sequence_len_offset(seq_len) - self.inference_context.reset_batch_size_offset() + logits = logits.to(self.config.params_dtype) - # NOTE: Only returns the logits on the last pipeline stage return logits @torch.inference_mode() @@ -368,14 +244,18 @@ def run_one_forward_step( ) -> torch.Tensor: """The forward pass of the model for inference - Appropriate utility is called for the forward pass depending on the type of model parallelism used + Appropriate utility is called for the forward pass depending on the type of model + parallelism used Args: - inference_input (Dict[str, Any]): A dict containing the inputs for the gpt model [tokens, position ids, attention mask] - recv_buffer_seq_len (int): An optional sequence length for the pipeline parallel recv buffer. + inference_input (Dict[str, Any]): A dict containing the inputs for the gpt model + [tokens, position ids, attention mask] + recv_buffer_seq_len (int): An optional sequence length for the pipeline parallel + recv buffer. Returns: - torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size]. The logits are returned only in the last pipeline stage for PP models. + torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size]. + The logits are returned only in the last pipeline stage for PP models. 
""" # Check if we are in a PP model if not (is_pipeline_first_stage(self.pp_group) and is_pipeline_last_stage(self.pp_group)): @@ -383,19 +263,6 @@ def run_one_forward_step( current_batch_size, seq_len = self._get_batch_size_and_seq_len( tokens, recv_buffer_seq_len ) - # If input batch is large, we need to split into micro batches and run the forward pass - if ( - current_batch_size * seq_len - > self.inference_wrapper_config.inference_batch_times_seqlen_threshold - and self.inference_wrapper_config.inference_batch_times_seqlen_threshold != -1 - ): - return self.forward_pass_with_pipeline_parallel_large_input_batch( - inference_input, recv_buffer_seq_len - ) - else: - # If input batch is very small we can do a simple forward pass on the entire global batch - return self.forward_pass_with_pipeline_parallel_small_input_batch( - inference_input, recv_buffer_seq_len - ) + return self.forward_pass_with_pipeline_parallel(inference_input, recv_buffer_seq_len) else: return self.forward_pass_without_pipeline_parallel(inference_input) diff --git a/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py b/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py index ba89fbc2f6c..2a1f10daa1e 100644 --- a/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +++ b/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py @@ -7,9 +7,6 @@ from megatron.core.inference.model_inference_wrappers.abstract_model_inference_wrapper import ( AbstractModelInferenceWrapper, ) -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) from megatron.core.inference.utils import get_attention_mask from megatron.core.models.gpt import GPTModel from megatron.core.process_groups_config import ProcessGroupCollection @@ -25,8 +22,6 @@ class GPTInferenceWrapper(AbstractModelInferenceWrapper): Args: model (GPTModel): The GPT model (MCore or legacy) - 
inference_wrapper_config (InferenceWrapperConfig): Has info like hidden size, vocab - size, etc. inference_context (BaseInferenceContext): Manages KV cache, and tracks sequence/token/batch offsets. pg_collection (ProcessGroupCollection): Process groups for model communication. @@ -36,11 +31,13 @@ class GPTInferenceWrapper(AbstractModelInferenceWrapper): def __init__( self, model: GPTModel, - inference_wrapper_config: InferenceWrapperConfig, inference_context: Optional[BaseInferenceContext] = None, pg_collection: Optional[ProcessGroupCollection] = None, + inference_wrapper_config: Optional[Any] = None, # Deprecated ): - super().__init__(model, inference_wrapper_config, inference_context, pg_collection) + if inference_wrapper_config is not None: + raise TypeError("Passing `inference_wrapper_config` is deprecated.") + super().__init__(model, inference_context, pg_collection) def prep_inference_input(self, prompts_tokens: torch.Tensor) -> Dict[str, Any]: """Prepares the inference input data. diff --git a/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py b/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py deleted file mode 100644 index 5d89085add2..00000000000 --- a/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
-from dataclasses import dataclass -from typing import Optional - -import torch - - -@dataclass -class InferenceWrapperConfig: - """Config for the model inference wrapper - - NOTE : All the arguments here are obtained from arguments.py file - """ - - hidden_size: int - """Receive happens between the layers during PP with size [seq_len, batch_size, hidden_size]""" - - params_dtype: torch.dtype - """Can be torch.float or torch.half if --fp16 is used, or torch.bfloat16 if --bf16 is used""" - - inference_batch_times_seqlen_threshold: int - """if (batch-size * sequence-length) is smaller than this threshold then we will not pipeline - the batch.""" - - padded_vocab_size: int - """The final padded vocab size (Padded to make it divisible by - --make-vocab-size-divisible-by value)""" - - inference_max_requests: int = 8 - """ Maximum number of requests for inference (prefill & decode). Necessary for CUDA graphs. """ - - inference_max_seq_length: int = 2560 - """ Maximum sequence length for inference (prefill & decode). Necessary for CUDA graphs. """ - - fp32_residual_connection: bool = False - """Move residual connections to fp32. Obtained from arguments.py""" - - nccl_all_reduce_for_prefill: bool = False - """When using symmetric all reduce kernels we keep the default all reduces for nccl. - This can be more effecient for large prefill sizes""" - - fp8: Optional[str] = None - """If set, enables the use of FP8 precision through Transformer Engine. There are 2 predefined - choices (1) 'e4m3' uniformly uses e4m3 for all FP8 tensors, (2) 'hybrid' uses e4m3 for all FP8 - activation and weight tensors and e5m2 for all FP8 output activation gradient tensors.""" - - moe_pad_experts_for_cuda_graph_inference: bool = False - """Some MoE routers have a D2H sync that will break cuda graphs. If this flag is set the router - will switch to dropping and padding during decode time which does not have a D2H sync. 
The - capacity factor is set to the max that an expert could see during inference so no tokens are - actually dropped. """ - - def add_attributes(self, attribute_value_pair: dict): - """Utility to add more attributes to inference params - - Use this method to pass in a custom dictionary to add more configs to the instance created. - Use as follows: - c = InferenceWrapperConfig - c.add_attributes({'precision':'fp32'}) - - Args: - attribute_value_pair (dict): A dictionary containing attributes as the key names and - corresponding values. - """ - for key, value in attribute_value_pair.items(): - setattr(self, key, value) diff --git a/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py b/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py index 2ae1e2ade6f..c773ab507a3 100644 --- a/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +++ b/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py @@ -11,9 +11,6 @@ from megatron.core.inference.model_inference_wrappers.abstract_model_inference_wrapper import ( AbstractModelInferenceWrapper, ) -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) from megatron.core.models.T5 import T5Model from megatron.core.utils import get_attr_wrapped_model @@ -27,7 +24,6 @@ class T5InferenceWrapper(AbstractModelInferenceWrapper): Args: model (T5Model): The T5 model (MCore or legacy) - inference_wrapper_config (InferenceWrapperConfig): The command line arguments that were passed inference_context (BaseInferenceContext): Manages KV cache, and tracks sequence/token/batch offsets. 
use_local (bool): Whether the T5 model's transformer impl @@ -37,11 +33,10 @@ class T5InferenceWrapper(AbstractModelInferenceWrapper): def __init__( self, model: T5Model, - inference_wrapper_config: InferenceWrapperConfig, inference_context: Optional[BaseInferenceContext] = None, use_local: bool = False, ): - super().__init__(model, inference_wrapper_config, inference_context) + super().__init__(model, inference_context) self.use_local = use_local def prep_inference_input( diff --git a/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py b/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py deleted file mode 100644 index 340cadb48a9..00000000000 --- a/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. - -from megatron.core.inference.text_generation_controllers.text_generation_controller import ( # noqa: F401 # pylint: disable=unused-import - TextGenerationController as SimpleTextGenerationController, -) diff --git a/megatron/core/inference/text_generation_controllers/text_generation_controller.py b/megatron/core/inference/text_generation_controllers/text_generation_controller.py index a5233983ed0..617883414d4 100644 --- a/megatron/core/inference/text_generation_controllers/text_generation_controller.py +++ b/megatron/core/inference/text_generation_controllers/text_generation_controller.py @@ -11,21 +11,22 @@ import torch import torch.nn.functional as F from torch import Tensor -from torch.distributed import ProcessGroup +from megatron.core import parallel_state from megatron.core.inference.async_stream import AsyncStream from megatron.core.inference.communication_utils import ( broadcast_from_last_pipeline_stage, - is_pipeline_first_stage, is_pipeline_last_stage, ) from megatron.core.inference.contexts.dynamic_context import MaxSequenceLengthOverflowError +from 
megatron.core.inference.contexts.static_context import StaticInferenceContext from megatron.core.inference.inference_request import InferenceRequest, Status from megatron.core.inference.model_inference_wrappers.abstract_model_inference_wrapper import ( AbstractModelInferenceWrapper, ) from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.utils import get_attention_mask, set_decode_expert_padding +from megatron.core.models.multimodal.llava_model import LLaVAModel from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.moe.moe_layer import BaseMoELayer from megatron.core.transformer.utils import set_model_to_sequence_parallel @@ -52,28 +53,32 @@ class TextGenerationController: inference_wrapped_model (AbstractModelInferenceWrapper): A model that is wrapped using the specs given in the abstract_model_inference_wrapper.py tokenizer (_type_): Tokenizer used for tokenizing and detokenizing the prompts - pp_group (ProcessGroup): Process group for pipeline parallelism """ - def __init__( - self, - inference_wrapped_model: AbstractModelInferenceWrapper, - tokenizer, - pp_group: ProcessGroup = None, - ): + def __init__(self, inference_wrapped_model: AbstractModelInferenceWrapper, tokenizer): self.inference_wrapped_model = inference_wrapped_model + self.model_config = self.inference_wrapped_model.model.config + inference_config = self.inference_wrapped_model.inference_context.config self.tokenizer = tokenizer - self.pp_group = pp_group + pg_collection = inference_config.pg_collection + if pg_collection is not None: + self.pp_group = pg_collection.pp + else: + self.pp_group = parallel_state.get_pipeline_model_parallel_group() + + self.model_is_pipeline_parallel = self.model_config.pipeline_model_parallel_size > 1 - # For models without pipeline parallelism, is_first_stage and is_last_stage returns True - self.model_is_pipeline_parallel = not ( - is_pipeline_first_stage(self.pp_group) and 
is_pipeline_last_stage(self.pp_group) - ) + # Use padded vocab size because tokenizer vocab size might pad to nearest power of 2. + # TODO(ksanthanam): Consider deprecating this check if LLaVAModel is no longer used + unwrapped_model = unwrap_model(self.inference_wrapped_model.model) + if isinstance(unwrapped_model, LLaVAModel): + self.vocab_size = unwrapped_model.language_model.vocab_size + else: + self.vocab_size = unwrapped_model.vocab_size - model_config = get_model_config(self.inference_wrapped_model.model) self.sampling_rng = torch.Generator(device=torch.cuda.current_device()) - self.sampling_rng.manual_seed(model_config.inference_sampling_seed) + self.sampling_rng.manual_seed(self.model_config.inference_sampling_seed) if self.inference_wrapped_model.inference_context.is_dynamic_batching(): self._init_dynamic_sampling_tensors() @@ -98,9 +103,7 @@ def _init_dynamic_sampling_tensors(self): self._get_stop_word_finished_ids_callback = None device = torch.cuda.current_device() - logits_dtype = self.inference_wrapped_model.inference_wrapper_config.params_dtype - # Use padded vocab size because tokenizer vocab size might pad to nearest power of 2. - vocab_size = self.inference_wrapped_model.inference_wrapper_config.padded_vocab_size + logits_dtype = self.inference_wrapped_model.config.params_dtype self._sampling_backend = "torch" self._sampled_tokens_cuda = torch.empty(max_requests, dtype=torch.int64, device=device) @@ -505,7 +508,6 @@ def _dynamic_step_context_init( position_ids (Tensor): The active position IDs. 
""" context = self.inference_wrapped_model.inference_context - inference_wrapper_config = self.inference_wrapped_model.inference_wrapper_config active_request_slice = slice(context.paused_request_count, context.total_request_count) # Remove Float16Module wrapper if it exists @@ -517,11 +519,11 @@ def _dynamic_step_context_init( # If using symmetric kernels and we are using using nccl # for prefill turn off symmetric kernels - symmetric_ar_type = model_config.symmetric_ar_type - nccl_all_reduce_for_prefill = inference_wrapper_config.nccl_all_reduce_for_prefill + symmetric_ar_type = self.model_config.symmetric_ar_type + nccl_all_reduce_for_prefill = self.model_config.nccl_all_reduce_for_prefill # Turning on/off MoE padding for cuda-graphs moe_pad_experts_for_cuda_graph_inference = ( - inference_wrapper_config.moe_pad_experts_for_cuda_graph_inference + self.model_config.moe_pad_experts_for_cuda_graph_inference ) if moe_pad_experts_for_cuda_graph_inference: if context.using_cuda_graph_this_step(): @@ -569,8 +571,6 @@ def _dynamic_step_forward_logits(self, input_ids: Tensor, position_ids: Tensor) input_ids (Tensor): The input token IDs. position_ids (Tensor): The position IDs. 
""" - inference_wrapper_config = self.inference_wrapped_model.inference_wrapper_config - context = self.inference_wrapped_model.inference_context active_request_count = context.total_request_count - context.paused_request_count @@ -582,18 +582,17 @@ def _dynamic_step_forward_logits(self, input_ids: Tensor, position_ids: Tensor) if self.model_is_pipeline_parallel: logits_seq_len = ( active_request_count - if context.materialize_only_last_token_logits + if context.config.materialize_only_last_token_logits else input_ids.shape[1] ) - vocab_size = inference_wrapper_config.padded_vocab_size - logits_shape = [1, logits_seq_len, vocab_size] + logits_shape = [1, logits_seq_len, self.vocab_size] if is_pipeline_last_stage(self.pp_group): assert logits is not None and torch.Size(logits_shape) == logits.shape logits = broadcast_from_last_pipeline_stage( logits_shape, - dtype=inference_wrapper_config.params_dtype, + dtype=self.model_config.params_dtype, tensor=logits, pp_group=self.pp_group, ) @@ -639,7 +638,7 @@ def _dynamic_step_sample_logits(self, logits: Tensor): # Last token logits. context = self.inference_wrapped_model.inference_context - if context.materialize_only_last_token_logits: + if context.config.materialize_only_last_token_logits: # When materialize_only_last_token_logits is true, last_token_logits is # already called in the forward pass of GPT. 
last_token_logits = logits.squeeze(0) @@ -684,7 +683,7 @@ def _dynamic_step_calculate_log_probs(self, logits: Tensor) -> Optional[Tensor]: return context.calculate_log_probs( logits, self._sampled_tokens_cuda[:active_request_count], - only_last_token_logits=context.materialize_only_last_token_logits, + only_last_token_logits=context.config.materialize_only_last_token_logits, ) def _dynamic_step_calculate_top_n_logprobs( @@ -712,7 +711,7 @@ def _dynamic_step_calculate_top_n_logprobs( active_request_slice = slice(context.paused_request_count, context.total_request_count) # Handle decode-only mode (only last token) - if context.materialize_only_last_token_logits or context.is_decode_only(): + if context.config.materialize_only_last_token_logits or context.is_decode_only(): # In decode mode or when only last token logits are materialized, # logits already represent only the last tokens log_probs = log_probs_tensor[:active_request_count] @@ -1024,9 +1023,10 @@ def generate_all_output_tokens_static_batch( # Pad batch tokens if necessary batch_size = len(active_requests) max_sequence_length = max_prompt_length_in_batch + sampling_params.num_tokens_to_generate - inference_wrapper_config = self.inference_wrapped_model.inference_wrapper_config - inference_max_batch_size = inference_wrapper_config.inference_max_requests - inference_max_sequence_length = inference_wrapper_config.inference_max_seq_length + context = self.inference_wrapped_model.inference_context + assert isinstance(context, StaticInferenceContext) + inference_max_batch_size = context.max_batch_size + inference_max_sequence_length = context.max_sequence_length padded_batch_size = inference_max_batch_size if enable_cuda_graph else batch_size if padded_batch_size > inference_max_batch_size: raise ValueError( @@ -1066,10 +1066,6 @@ def generate_all_output_tokens_static_batch( batch_size, device=torch.cuda.current_device() ).cuda() - # Use padded vocab size because tokenizer vocab size might not include padding - # 
to nearest power of 2 - vocab_size = inference_wrapper_config.padded_vocab_size - # Check whether early termination is enabled no_early_termination = getattr(sampling_params, "no_early_termination", False) termination_id = -1 if no_early_termination else self.tokenizer.eod @@ -1130,14 +1126,14 @@ def generate_all_output_tokens_static_batch( # If using symmetric kernels and we are using using nccl # for prefill turn off symmetric kernels - symmetric_ar_type = model_config.symmetric_ar_type - nccl_all_reduce_for_prefill = inference_wrapper_config.nccl_all_reduce_for_prefill + symmetric_ar_type = self.model_config.symmetric_ar_type + nccl_all_reduce_for_prefill = self.model_config.nccl_all_reduce_for_prefill if symmetric_ar_type is not None and nccl_all_reduce_for_prefill: unwrapped_model.set_symmetric_ar(None) # Turning off MoE padding for prefill moe_pad_experts_for_cuda_graph_inference = ( - inference_wrapper_config.moe_pad_experts_for_cuda_graph_inference + self.model_config.moe_pad_experts_for_cuda_graph_inference ) if moe_pad_experts_for_cuda_graph_inference: set_decode_expert_padding(unwrapped_model, False) @@ -1191,7 +1187,7 @@ def generate_all_output_tokens_static_batch( or not (sampling_params.return_log_probs or sampling_params.top_n_logprobs > 0) ) inference_context = self.inference_wrapped_model.inference_context - inference_context.materialize_only_last_token_logits = ( + inference_context.config.materialize_only_last_token_logits = ( materialize_only_last_token_logits ) @@ -1212,14 +1208,14 @@ def generate_all_output_tokens_static_batch( if self.model_is_pipeline_parallel: context_length = context_end_position - context_start_position logits_seq_len = 1 if materialize_only_last_token_logits else context_length - logits_shape = [batch_size, logits_seq_len, vocab_size] + logits_shape = [batch_size, logits_seq_len, self.vocab_size] if is_pipeline_last_stage(self.pp_group): assert logits is not None and torch.Size(logits_shape) == logits.shape # 
TODO(ksanthanam): Evaluate whether it makes more sense to sample on 1 rank # and then broadcast the sampled tokens rather than broadcasting the raw logits. logits = broadcast_from_last_pipeline_stage( - [batch_size, logits_seq_len, vocab_size], - dtype=inference_wrapper_config.params_dtype, + [batch_size, logits_seq_len, self.vocab_size], + dtype=self.model_config.params_dtype, tensor=logits, pp_group=self.pp_group, ) @@ -1248,7 +1244,7 @@ def generate_all_output_tokens_static_batch( sampled_logits = self.sample_from_logits( last_token_logits, sampling_params, - vocab_size, + self.vocab_size, generation_started=generation_started, top_n_logprobs_dict=top_n_logprobs_dict, logits=logits_for_top_n_prompt_logprobs, diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index e287344c13d..f44aed613e7 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -661,7 +661,7 @@ def _postprocess( ) sequence_parallel_override = False - if in_inference_mode and inference_context.materialize_only_last_token_logits: + if in_inference_mode and inference_context.config.materialize_only_last_token_logits: if inference_context.is_static_batching(): hidden_states = hidden_states[-1:, :, :] else: @@ -691,7 +691,7 @@ def _postprocess( assert ( in_inference_mode and inference_context.is_dynamic_batching() - and inference_context.materialize_only_last_token_logits + and inference_context.config.materialize_only_last_token_logits ) self.output_layer.sequence_parallel = True diff --git a/megatron/core/models/mamba/mamba_model.py b/megatron/core/models/mamba/mamba_model.py index 8d45e1d0147..6d43f5583df 100644 --- a/megatron/core/models/mamba/mamba_model.py +++ b/megatron/core/models/mamba/mamba_model.py @@ -267,7 +267,7 @@ def forward( output_weight = self.shared_embedding_or_output_weight() sequence_parallel_override = False - if in_inference_mode and inference_context.materialize_only_last_token_logits: + if 
in_inference_mode and inference_context.config.materialize_only_last_token_logits: if inference_context.is_static_batching(): hidden_states = hidden_states[-1:, :, :] else: @@ -297,7 +297,7 @@ def forward( assert ( in_inference_mode and inference_context.is_dynamic_batching() - and inference_context.materialize_only_last_token_logits + and inference_context.config.materialize_only_last_token_logits ) self.output_layer.sequence_parallel = True diff --git a/megatron/core/ssm/mamba_layer.py b/megatron/core/ssm/mamba_layer.py index ac6e8b5bf40..0b4ef42457d 100644 --- a/megatron/core/ssm/mamba_layer.py +++ b/megatron/core/ssm/mamba_layer.py @@ -193,6 +193,9 @@ def _should_call_local_cudagraph(self, *args, **kwargs): and kwargs.get('attention_mask') is None and kwargs.get('inference_context') is not None ): - using_cuda_graph = kwargs['inference_context'].using_cuda_graph_this_step() + context = kwargs['inference_context'] + using_cuda_graph = (context.is_static_batching() and context.is_decode_only()) or ( + not context.is_static_batching() and context.using_cuda_graph_this_step() + ) return using_cuda_graph return False diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index eaae585905e..48b04c35134 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -696,6 +696,12 @@ class TransformerConfig(ModelParallelConfig): the expert capacity length, effective only after the moe_expert_capacity_factor is set. The default setting is False.""" + moe_pad_experts_for_cuda_graph_inference: bool = False + """moe_pad_experts_for_cuda_graph_inference (bool): If True, the router will switch to dropping + and padding during decode time which does not have a D2H sync. The capacity factor is set to the + max that an expert could see during inference so no tokens are actually dropped. 
The default + setting is False.""" + moe_token_drop_policy: Literal['probs', 'position'] = "probs" """The policy to drop tokens. Can be either "probs" or "position". If "probs", the tokens with the lowest probabilities will be dropped. If "position", tokens at the end of each batch will @@ -830,6 +836,9 @@ class TransformerConfig(ModelParallelConfig): which is no use of symmetric memory. """ + nccl_all_reduce_for_prefill: bool = False + """If True, use NCCL all-reduce kernels when symmetric all-reduce is enabled.""" + use_inference_optimized_layers: bool = False """If True, use inference optimized transformer layers during inference.""" diff --git a/megatron/core/utils.py b/megatron/core/utils.py index d7b702f25ec..cb2f7d34128 100644 --- a/megatron/core/utils.py +++ b/megatron/core/utils.py @@ -496,17 +496,6 @@ def divide(numerator, denominator): return numerator // denominator -def deprecate_inference_params(inference_context, inference_params): - """Print warning for deprecated `inference_params`.""" - if inference_context is None and inference_params is not None: - warnings.warn( - "`inference_params` renamed to `inference_context`, and will be " - "removed in `megatron-core` 0.13." - ) - return inference_params - return inference_context - - def get_tensor_model_parallel_group_if_none(tp_group, is_expert=False, check_initialized=True): """Issue a deprecation warning if tp_group is None and return the default tp group.""" # TODO(zijiey): remove this function later. 
@@ -2405,25 +2394,6 @@ async def wrapper(*args, **kwargs): return _decorate if func is None else _decorate(func) -def get_mamba_inference_state_config_from_model(model) -> Optional["MambaInferenceStateConfig"]: - """Returns Mamba inference state config from the model if it is a hybrid model.""" - from megatron.core.inference.contexts.attention_context.mamba_metadata import ( - MambaInferenceStateConfig, - ) - from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols - - decoder = get_attr_wrapped_model(model, "decoder") - layer_type_list = getattr(decoder, "layer_type_list", None) - if layer_type_list is not None and Symbols.MAMBA in layer_type_list: - (mamba_conv_states_shape, mamba_ssm_states_shape) = decoder.mamba_state_shapes_per_request() - return MambaInferenceStateConfig( - layer_type_list=layer_type_list, - mamba_conv_states_shape=mamba_conv_states_shape, - mamba_ssm_states_shape=mamba_ssm_states_shape, - ) - return None - - # ============================================================================ # Backward Compatibility Decorators # ============================================================================ @@ -2558,3 +2528,43 @@ class ExperimentalModel: """ func._experimental_api = True return func + + +def deprecate_args( + *deprecated_keys, message="Argument '{name}' has been deprecated and should not be used." +): + """ + Intercepts specific keyword arguments to raise a custom TypeError. + + Args: + *deprecated_keys: Strings representing the argument names to block. + message: Custom error message string. Use {name} as a placeholder. 
+ """ + + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + # Check if any deprecated key is present in kwargs + found_deprecated = set(deprecated_keys) & set(kwargs.keys()) + + if found_deprecated: + bad_key = list(found_deprecated)[0] + raise TypeError(message.format(name=bad_key)) + + # Send args to the real function + return func(*args, **kwargs) + + return wrapper + + return decorator + + +def deprecate_inference_params(inference_context, inference_params): + """Print warning for deprecated `inference_params`.""" + if inference_context is None and inference_params is not None: + warnings.warn( + "`inference_params` renamed to `inference_context`, and will be " + "removed in `megatron-core` 0.13." + ) + return inference_params + return inference_context diff --git a/megatron/inference/__init__.py b/megatron/inference/__init__.py new file mode 100644 index 00000000000..26496bfed70 --- /dev/null +++ b/megatron/inference/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/inference/utils.py b/megatron/inference/utils.py new file mode 100644 index 00000000000..145af726c4f --- /dev/null +++ b/megatron/inference/utils.py @@ -0,0 +1,320 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ +import logging +from argparse import ArgumentParser +from functools import partial +from typing import Optional + +from gpt_builders import gpt_builder +from mamba_builders import mamba_builder +from megatron.core.inference.config import InferenceConfig, MambaInferenceStateConfig +from megatron.core.inference.contexts import DynamicInferenceContext +from megatron.core.inference.engines import DynamicInferenceEngine +from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( + GPTInferenceWrapper, +) +from megatron.core.inference.text_generation_controllers.text_generation_controller import ( + TextGenerationController, +) +from megatron.core.tokenizers.text.utils.build_tokenizer import build_tokenizer +from megatron.core.transformer.module import MegatronModule +from megatron.core.utils import get_attr_wrapped_model, log_single_rank +from megatron.training import get_args +from megatron.training import get_model as _get_model +from megatron.training import get_tokenizer, get_wandb_writer +from megatron.training.checkpointing import load_checkpoint +from model_provider import model_provider + +logger = logging.getLogger(__name__) + + +def get_model_for_inference() -> MegatronModule: + """Initialize model and load checkpoint for inference.""" + + args = get_args() + + if args.model_provider == "gpt": + model_builder = gpt_builder + elif args.model_provider == "mamba": + model_builder = mamba_builder + else: + raise ValueError(f"Invalid model provider {args.model_provider}") + + # Build model. + model = _get_model(partial(model_provider, model_builder), wrap_with_ddp=False) + + # Load checkpoint. + assert args.load is not None + args.exit_on_missing_checkpoint = True + load_checkpoint( + ddp_model=model, + optimizer=None, + opt_param_scheduler=None, + strict=not args.inference_ckpt_non_strict, + ) + + # No virtual PP. + assert len(model) == 1, "Above condition should have caught this" + model = model[0] + + # Eval mode. 
+ model.eval() + + return model + + +def add_inference_args(parser: ArgumentParser) -> ArgumentParser: + """Add inference command line arguments to the parser.""" + + group = parser.add_argument_group(title='Inference') + + group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.') + group.add_argument("--top_k", type=int, default=1, help='Top k sampling.') + group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.') + group.add_argument( + "--return-log-probs", + action='store_true', + default=False, + help='Return the log probabilities of the final output tokens', + ) + group.add_argument( + "--prompts", + metavar='N', + type=str, + nargs='+', + help='Input prompts with each prompt within quotes and separated by space', + ) + group.add_argument( + "--num-tokens-to-prompt", + type=int, + nargs="+", + default=[64, 1024], + help='Number of tokens to use for simulated prompts. This should be a ' + 'space-separated pair of integers, and the generated prompt lengths will ' + 'be uniformly sampled within this range.', + ) + group.add_argument( + "--num-tokens-to-generate", + type=int, + default=30, + help='Number of tokens to generate for each prompt', + ) + group.add_argument( + "--num-tokens-from-file", + action='store_true', + default=False, + help='Use per-prompt num_tokens_to_generate from prompt file', + ) + group.add_argument( + "--top-n-logprobs", + type=int, + default=0, + help=( + "Return the top n logprobs for the generated tokens and their " + "corresponding token as a dictionary" + ), + ) + group.add_argument( + "--incoming-requests-per-step", + type=int, + default=None, + help="Add a deterministic number of requests per step. This arg is " + "prioritized over `--incoming-requests-per-sec` below (which is non-" + "deterministic).
Note that the number of requests added per step is " + "additionally limited by the inference context's `max_requests`, " + "`max_tokens`, and KV buffer size.", + ) + group.add_argument( + "--incoming-requests-per-sec", + type=float, + default=100.0, + help="Simulated number of requests per second. Set to -1 to add all requests together.", + ) + group.add_argument( + "--incoming-requests-duration", + type=float, + default=10.0, + help="Total amount of time to simulate that requests are " + "arriving. Multiply this value with " + "`--incoming-requests-per-sec` to get the approximate " + "total number of requests. Set to -1 to add all requests together.", + ) + group.add_argument( + "--model-provider", choices=["mamba", "gpt"], default="gpt", help="Model provider" + ) + group.add_argument( + "--skip-prompt-log-probs", action='store_true', default=False, help='Skip prompt log probs.' + ) + group.add_argument( + "--stop-words", + metavar='WORD', + type=str, + nargs='+', + default=None, + help='Stop words to terminate generation. Each word should be quoted and ' + 'separated by space. Example: --stop-words "\\n\\n" "END" "###"', + ) + group.add_argument( + "--output-path", type=str, default=None, help="Path to save generations as JSON" + ) + group.add_argument( + "--output-every-n-results", + type=int, + default=1, + help="To minimize the output file size of larger runs, only write the " + "results of every `n` requests.", + ) + group.add_argument( + "--prompt-file", + help='Jsonl file containing input prompts, where each item (i.e., line) ' + 'contains the field \'text\' where the value is the prompt. All other ' + 'fields within each item are ignored, and may be customized for each ' + 'application.', + ) + group.add_argument( + "--prompt-file-num-truncate", + type=int, + help='Number of samples to use from the loaded prompt file (see ' + '`--prompt-file` above). 
The first `--prompt-file-num-truncate` samples ' + 'will be used, in order.', + ) + group.add_argument( + "--use-flashinfer-fused-rope", + action='store_true', + default=False, + help='Use flashinfer fused rope implementation.', + ) + group.add_argument( + "--no-record-throughput", + action='store_false', + dest="record_throughput", + help="Disable throughput recording in --output-path", + ) + group.add_argument( + "--inference-ckpt-non-strict", + action="store_true", + help="Load checkpoint with `strict=False`.", + ) + group.add_argument( + "--termination-id", + type=int, + default=None, + help="Termination ID that overrides `tokenizer.eod`.", + ) + group.add_argument( + "--suspend-resume-interval", + type=int, + default=None, + help="Suspend and resume the dynamic engine every " + "`suspend_resume_interval` steps. This is used to test the suspend/resume " + "system.", + ) + group.add_argument( + "--inference-repeat-n", + type=int, + default=1, + help="Repeat inference iterations N times for benchmarking.", + ) + group.add_argument( + "--throughput-check-only", + action='store_true', + default=False, + help="If true, only run throughput check without verifying outputs.", + ) + + return parser + + +def get_inference_config_from_model_and_args(model: MegatronModule, args): + """Returns an `InferenceConfig` constructed from the model and command line arguments.""" + + # Max sequence length. + position_embedding_type = get_attr_wrapped_model(model, "position_embedding_type") + model_max_seq_len = get_attr_wrapped_model(model, "max_sequence_length") + inf_max_seq_len = args.inference_max_seq_length + max_batch_size = args.inference_dynamic_batching_max_requests + + if position_embedding_type == "learned_absolute": + # When using absolute position embeddings, it is critical that the + # context's `max_sequence_length` is less than or equal to the model's + # `max_sequence_length`.
Otherwise, the context's `position_ids` will + # contain ids greater than the dimension of the position embedding + # tensor, which will result in an index error. + if inf_max_seq_len: + max_sequence_length = min(model_max_seq_len, inf_max_seq_len) + else: + max_sequence_length = model_max_seq_len + assert max_batch_size is None or max_batch_size <= model_max_seq_len + else: + max_sequence_length = inf_max_seq_len + if args.inference_dynamic_batching_max_requests is not None: + max_sequence_length = max(max_sequence_length, max_batch_size) + + mamba_inference_state_config = MambaInferenceStateConfig.from_model(model) + pg_collection = get_attr_wrapped_model(model, "pg_collection") + + # Get inference logging configuration from args + log_inference_wandb = args.inference_wandb_logging + inference_logging_step_interval = args.inference_logging_step_interval + + # Get metrics writer if logging is enabled and on the logging rank + # Use the same rank convention as training (last rank logs) + metrics_writer = None + if ( + inference_logging_step_interval > 0 + and log_inference_wandb + and args.rank == (args.world_size - 1) + ): + metrics_writer = get_wandb_writer() + if metrics_writer is None: + log_single_rank( + logger, + logging.WARNING, + "WARNING: --rl-inference-logging-step-interval is set but no metrics writer " + "wandb module is available. 
Inference logging will be disabled.", + ) + + return InferenceConfig( + block_size_tokens=args.inference_dynamic_batching_block_size, + buffer_size_gb=args.inference_dynamic_batching_buffer_size_gb, + paused_buffer_size_gb=args.inference_dynamic_batching_paused_buffer_size_gb, + num_cuda_graphs=( + args.inference_dynamic_batching_num_cuda_graphs + if args.cuda_graph_impl == "local" + else None + ), + max_requests=args.inference_dynamic_batching_max_requests, + max_tokens=args.inference_dynamic_batching_max_tokens, + unified_memory_level=args.inference_dynamic_batching_unified_memory_level, + offload_kv_cache=args.rl_offload_kv_cache_during_training, + cuda_graph_mixed_prefill_count=args.inference_dynamic_batching_cuda_graph_mixed_prefill_count, # pylint: disable=line-too-long + use_cuda_graphs_for_non_decode_steps=not args.decode_only_cuda_graphs, + persist_cuda_graphs=args.rl_training_cuda_graphs, + max_sequence_length=max_sequence_length, + mamba_inference_state_config=mamba_inference_state_config, + pg_collection=pg_collection, + use_flashinfer_fused_rope=args.use_flashinfer_fused_rope, + materialize_only_last_token_logits=not args.return_log_probs, + track_paused_request_events=args.inference_dynamic_batching_track_paused_request_events, + enable_chunked_prefill=args.enable_chunked_prefill, + metrics_writer=metrics_writer, + logging_step_interval=args.inference_logging_step_interval, + ) + + +def get_dynamic_inference_engine(model: Optional[MegatronModule] = None) -> DynamicInferenceEngine: + """Builds a `DynamicInferenceEngine`.""" + args = get_args() + if model is None: + model = get_model_for_inference() + if args.legacy_tokenizer: + tokenizer = get_tokenizer() + else: + tokenizer = build_tokenizer(args) + + inference_config = get_inference_config_from_model_and_args(model, args) + context = DynamicInferenceContext(model.config, inference_config) + inference_wrapped_model = GPTInferenceWrapper(model, context) + controller = 
TextGenerationController(inference_wrapped_model, tokenizer) + engine = DynamicInferenceEngine(controller, context) + return engine diff --git a/megatron/rl/inference/megatron.py b/megatron/rl/inference/megatron.py index 4e9364b3ae9..602ff4f7450 100644 --- a/megatron/rl/inference/megatron.py +++ b/megatron/rl/inference/megatron.py @@ -7,7 +7,6 @@ import torch.distributed as dist from pydantic import PrivateAttr -from megatron.core import parallel_state from megatron.core.inference.contexts.dynamic_context import DynamicInferenceContext from megatron.core.inference.engines.abstract_engine import AbstractEngine from megatron.core.inference.engines.dynamic_engine import DynamicInferenceEngine @@ -16,23 +15,13 @@ from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, ) -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) from megatron.core.inference.sampling_params import SamplingParams -from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import ( - SimpleTextGenerationController, +from megatron.core.inference.text_generation_controllers.text_generation_controller import ( + TextGenerationController, ) from megatron.core.models.gpt.gpt_model import GPTModel -from megatron.core.pipeline_parallel.utils import is_pp_first_stage, is_pp_last_stage -from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols from megatron.core.transformer.module import MegatronModule -from megatron.core.utils import ( - get_attr_wrapped_model, - get_mamba_inference_state_config_from_model, - get_pg_size, - log_single_rank, -) +from megatron.core.utils import get_attr_wrapped_model, log_single_rank from megatron.training import get_wandb_writer from megatron.training.global_vars import get_args, get_tokenizer @@ -66,134 +55,20 @@ def get_static_inference_engine(args: Namespace, model: MegatronModule) -> Abstr """ tokenizer = 
get_tokenizer() - inference_wrapper_config = InferenceWrapperConfig( - hidden_size=args.hidden_size, - inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold, - fp32_residual_connection=args.fp32_residual_connection, - params_dtype=args.params_dtype, - padded_vocab_size=args.padded_vocab_size, - inference_max_seq_length=args.inference_max_seq_length, - inference_max_requests=( - args.inference_max_batch_size if args.inference_max_batch_size is not None else 1 - ), - nccl_all_reduce_for_prefill=args.nccl_all_reduce_for_prefill, - ) - - inference_wrapped_model = GPTInferenceWrapper(model, inference_wrapper_config) + inference_wrapped_model = GPTInferenceWrapper(model) pg_collection = get_attr_wrapped_model(model, "pg_collection") pp_group = pg_collection.pp - text_generation_controller = SimpleTextGenerationController( - inference_wrapped_model=inference_wrapped_model, - tokenizer=tokenizer, - pp_group=pp_group, + text_generation_controller = TextGenerationController( + inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer, pp_group=pp_group ) return MCoreEngine( text_generation_controller=text_generation_controller, max_batch_size=( - args.inference_max_batch_size if args.inference_max_batch_size is not None else 1 + args.inference_max_requests if args.inference_max_requests is not None else 1 ), ) -## This code is copied from tools/run_text_generation_server.py -def get_dynamic_inference_engine( - args: Namespace, - model: MegatronModule, - inference_logging_step_interval: int = 0, - metrics_writer = None -) -> AbstractEngine: - """Get the relevant backend for running inference. - - This function will automatically choose the TRTLLMBackend when possible, - and default to Mcore backend if the user does not specify any backends. - TRTLLMBackend is not implmented yet. - - Args: - args (Namespace): The user arguments parsed from command line - model (MegatronModule): The megatron model. 
- inference_logging_step_interval (int): Step interval for logging inference metrics. - metrics_writer: Metrics writer (wandb module) for logging. - - Returns: - AbstractBackend: The chosen backend - """ - tokenizer = get_tokenizer() - - enable_cuda_graph = args.cuda_graph_impl == "local" - - mamba_inference_state_config = get_mamba_inference_state_config_from_model(model) - - # DynamicInferenceContext must use the inference model's TP / PP size, not the - # training TP / PP size from global args. The inference model may have a custom - # ProcessGroupCollection with a different TP / PP size. - pg_collection = get_attr_wrapped_model(model, "pg_collection") - tp_group = getattr(pg_collection, 'tp', None) if pg_collection is not None else None - if tp_group is not None: - inference_tp_size = get_pg_size(tp_group) - else: - inference_tp_size = args.tensor_model_parallel_size - pp_group = getattr(pg_collection, 'pp', None) if pg_collection is not None else None - if pp_group is not None: - inference_pp_size = get_pg_size(pp_group) - else: - inference_pp_size = args.pipeline_model_parallel_size - - # Inference context. - inference_context = DynamicInferenceContext( - params_dtype=args.params_dtype, - num_layers=args.num_layers // inference_pp_size, - kv_channels=args.kv_channels, - num_attention_heads=( - args.num_query_groups if args.group_query_attention else args.num_attention_heads - ), - max_sequence_length=args.inference_max_seq_length, - num_cuda_graphs=( - args.inference_dynamic_batching_num_cuda_graphs if enable_cuda_graph else None - ), - block_size_tokens=args.inference_dynamic_batching_block_size, - buffer_size_gb=args.inference_dynamic_batching_buffer_size_gb, - max_requests=args.inference_dynamic_batching_max_requests, - max_tokens=args.inference_dynamic_batching_max_tokens, - pg_collection=pg_collection, # TP/PP sizes are derived from the model's pg_collection. 
- materialize_only_last_token_logits=True, - mamba_inference_state_config=mamba_inference_state_config, - cache_mla_latent=args.multi_latent_attention and args.cache_mla_latents, - kv_lora_rank=args.kv_lora_rank if args.multi_latent_attention else None, - qk_pos_emb_head_dim=args.qk_pos_emb_head_dim, - use_cuda_graphs_for_non_decode_steps=not args.decode_only_cuda_graphs, - use_flashinfer_fused_rope=None, - unified_memory_level=args.inference_dynamic_batching_unified_memory_level, - cuda_graph_max_tokens=args.inference_dynamic_batching_cuda_graph_max_tokens, - cuda_graph_mixed_prefill_count=args.inference_dynamic_batching_cuda_graph_mixed_prefill_count, - metrics_writer=metrics_writer, - persist_cuda_graphs=args.rl_training_cuda_graphs, - offload_kv_cache=args.rl_offload_kv_cache_during_training - ) - - inference_wrapped_model = GPTInferenceWrapper(model, args, inference_context, pg_collection=pg_collection) - - inference_wrapped_model.model_is_pipeline_parallel = not ( - is_pp_first_stage(pg_collection.pp) and is_pp_last_stage(pg_collection.pp) - ) - - pp_group = getattr(pg_collection, "pp", None) - text_generation_controller = SimpleTextGenerationController( - inference_wrapped_model=inference_wrapped_model, - tokenizer=tokenizer, - pp_group=pp_group, - ) - - return DynamicInferenceEngine( - controller=text_generation_controller, - context=inference_context, - random_seed=args.seed, - track_paused_request_events=args.inference_dynamic_batching_track_paused_request_events, - enable_chunked_prefill=not args.disable_chunked_prefill, - inference_logging_step_interval=inference_logging_step_interval, - pg_collection=pg_collection, - ) - - class MegatronLocal(InferenceServer, ReturnsTokens, ReturnsRaw): """Interface to use MCoreEngine directly as an inference engine.""" @@ -246,6 +121,9 @@ async def base_generate(self, request: InferenceRequest): @classmethod async def launch(cls, model: GPTModel, **kwargs): + # Import here to avoid circular imports + from 
megatron.inference.utils import get_dynamic_inference_engine + args = get_args() tokenizer = get_tokenizer() @@ -256,30 +134,7 @@ async def launch(cls, model: GPTModel, **kwargs): "WARNING: Tokenizer has no BOS token so prompt will not have BOS token", ) - # Get inference logging configuration from args - log_inference_wandb = args.inference_wandb_logging - inference_logging_step_interval = args.inference_logging_step_interval - - # Get metrics writer if logging is enabled and on the logging rank - # Use the same rank convention as training (last rank logs) - metrics_writer = None - if ( - inference_logging_step_interval > 0 - and log_inference_wandb - and args.rank == (args.world_size - 1) - ): - metrics_writer = get_wandb_writer() - if metrics_writer is None: - log_single_rank( - logger, - logging.WARNING, - "WARNING: --rl-inference-logging-step-interval is set but no metrics writer " - "wandb module is available. Inference logging will be disabled.", - ) - - inference_engine: DynamicInferenceEngine = get_dynamic_inference_engine( - args, model, inference_logging_step_interval, metrics_writer - ) + inference_engine: DynamicInferenceEngine = get_dynamic_inference_engine(model=model) dp_addr = await inference_engine.start_listening_to_data_parallel_coordinator( inference_coordinator_port=41521, launch_inference_coordinator=True, ) diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 46f3c28b1da..f56bc6c5e2f 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -1446,13 +1446,10 @@ def _add_inference_args(parser): dest='use_legacy_static_engine') group.add_argument('--inference-max-requests', type=int, default=8, help='Maximum number of requests for inference.', - dest='inference_max_batch_size') + dest='inference_max_requests') group.add_argument('--inference-max-seq-length', type=int, default=2560, help='Maximum sequence length expected for inference (prefill + decode).', dest='inference_max_seq_length') 
- group.add_argument('--inference-max-batch-size', type=int, default=None, - help='Maximum batch size for inference.', - dest='inference_max_batch_size') group.add_argument('--inference-dynamic-batching', action='store_true', default=False, help='Enable dynamic batching mode.') @@ -1508,15 +1505,10 @@ def _add_inference_args(parser): '1) allocate `memory_buffer` in unified memory. ' 'Eventually, additional levels will be included to ' 'control other tensors within the context.') - group.add_argument('--nccl-all-reduce-for-prefill', - action='store_true', default=False, - help='When using symmeric all reduce kernels this will use regular nccl kernels for prefill. This can be more effecient when prefill is large as the nccl kernels can be more bandwith optimized') # TODO(ksanthanam): Clean this up in future PR - group.add_argument('--enable-chunked-prefill', dest='disable_chunked_prefill', - action='store_false', default=True, + group.add_argument('--enable-chunked-prefill', dest='enable_chunked_prefill', + action='store_true', default=False, help="Enable chunked prefill (disabled by default)") - group.add_argument('--disable-chunked-prefill', dest='disable_chunked_prefill', - action='store_true', help=argparse.SUPPRESS) group.add_argument('--inference-dynamic-batching-cuda-graph-max-tokens', type=int, default=16384, help='Maximum number of tokens to capture in a cuda graph.') @@ -2714,10 +2706,6 @@ def _add_moe_args(parser): group.add_argument('--moe-upcycling-granularity', type=int, default=1, help='This param sepecifics how many times smaller is the expert hidden size compared with the original dense FFN hidden size. ' 'For using granular upcycling strategy, please set this param as a positive integer. If this param is set to 1, it means using the default upcycling strategy.') - group.add_argument('--moe-pad-experts-for-cuda-graph-inference', action='store_true', - help="some MoE routers have a D2H sync that will break cuda graphs. 
If this flag is set the router will switch" \ - " to dropping and padding during decode time which does not have a D2H sync. The capacity factor is set to the" \ - " max that an expert could see during inference so no tokens are actually dropped.") return parser def _add_mla_args(parser): diff --git a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_logitsmatch/model_config.yaml index be00e4b3ce7..1c78b466b1e 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_logitsmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_logitsmatch/model_config.yaml @@ -44,6 +44,7 @@ MODEL_ARGS: --flash-decode: true --dist-ckpt-strictness: log_unexpected --output-path: ${INFERENCE_OUTPUT_PATH} + --use-legacy-static-engine: true --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies." --incoming-requests-per-sec: -1 # all requests arrive up front. METRICS: diff --git a/tests/unit_tests/inference/contexts/test_dynamic_context.py b/tests/unit_tests/inference/contexts/test_dynamic_context.py index 05e0306bfd8..f3ef0910f58 100644 --- a/tests/unit_tests/inference/contexts/test_dynamic_context.py +++ b/tests/unit_tests/inference/contexts/test_dynamic_context.py @@ -1,14 +1,13 @@ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+import contextlib import math import pytest import torch from megatron.core import parallel_state -from megatron.core.inference.contexts.attention_context.mamba_metadata import ( - MambaInferenceStateConfig, -) +from megatron.core.inference.config import InferenceConfig, MambaInferenceStateConfig from megatron.core.inference.contexts.dynamic_context import ( DynamicInferenceContext, RequestOverflowError, @@ -18,14 +17,21 @@ from megatron.core.inference.sampling_params import SamplingParams from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig from tests.unit_tests.test_utilities import Utils -def set_rounder(value): - """Utility function to set the DynamicInferenceContext rounder.""" - DynamicInferenceContext.ROUNDER = value # For backwards compatibility - DynamicInferenceContext.TOKEN_ROUNDER = value - DynamicInferenceContext.REQUEST_ROUNDER = value +@contextlib.contextmanager +def rounder_override(n): + original_token_rounder = DynamicInferenceContext.TOKEN_ROUNDER + original_request_rounder = DynamicInferenceContext.REQUEST_ROUNDER + try: + DynamicInferenceContext.TOKEN_ROUNDER = n + DynamicInferenceContext.REQUEST_ROUNDER = n + yield + finally: + DynamicInferenceContext.TOKEN_ROUNDER = original_token_rounder + DynamicInferenceContext.REQUEST_ROUNDER = original_request_rounder class TestDynamicContext: @@ -52,11 +58,8 @@ def _get_dynamic_context( max_tokens, is_hybrid_model=False, layer_type_list=None, - rounder=64, paused_buffer_size_gb=None, ): - set_rounder(rounder) - if is_hybrid_model: if layer_type_list is None: layer_type_list = [Symbols.MAMBA, Symbols.MLP, Symbols.ATTENTION, Symbols.MLP] @@ -69,23 +72,27 @@ def _get_dynamic_context( mamba_inference_state_config = None dynamic_context = DynamicInferenceContext( - params_dtype=params_dtype, - num_layers=num_layers // self.pp_size, - 
kv_channels=kv_channels, - num_attention_heads=num_attention_heads, - max_sequence_length=max_sequence_length, - num_cuda_graphs=None, - use_cuda_graphs_for_non_decode_steps=True, - buffer_size_gb=buffer_size_gb, - paused_buffer_size_gb=( - 0.2 * buffer_size_gb if paused_buffer_size_gb is None else paused_buffer_size_gb + model_config=TransformerConfig( + params_dtype=params_dtype, + num_layers=num_layers, + kv_channels=kv_channels, + num_attention_heads=num_attention_heads, + ), + inference_config=InferenceConfig( + max_sequence_length=max_sequence_length, + num_cuda_graphs=None, + use_cuda_graphs_for_non_decode_steps=True, + buffer_size_gb=buffer_size_gb, + paused_buffer_size_gb=( + 0.2 * buffer_size_gb if paused_buffer_size_gb is None else paused_buffer_size_gb + ), + block_size_tokens=block_size_tokens, + max_tokens=max_tokens, + mamba_inference_state_config=mamba_inference_state_config, + use_flashinfer_fused_rope=None, # default to using flash-infer if available + # this is for compatibility with the LTS environment + unified_memory_level=0, # unit tests currently broken with UVM ), - block_size_tokens=block_size_tokens, - max_tokens=max_tokens, - mamba_inference_state_config=mamba_inference_state_config, - use_flashinfer_fused_rope=None, # default to using flash-infer if available - # this is for compatibility with the LTS environment - unified_memory_level=0, # unit tests currently broken with UVM ) return dynamic_context @@ -93,6 +100,7 @@ def teardown_method(self, method): Utils.destroy_model_parallel() @pytest.mark.internal + @rounder_override(64) @pytest.mark.parametrize("is_hybrid_model", [False, True]) def test_initialize_dynamic_context(self, is_hybrid_model: bool): self._setup_model_parallel_group(1, 1) @@ -107,7 +115,6 @@ def test_initialize_dynamic_context(self, is_hybrid_model: bool): block_size_tokens=128, max_tokens=None, is_hybrid_model=is_hybrid_model, - rounder=64, ) if not is_hybrid_model: @@ -145,6 +152,7 @@ def 
test_is_static_batching(self): assert not dynamic_context.is_static_batching() @pytest.mark.internal + @rounder_override(64) @pytest.mark.parametrize("is_hybrid_model", [False, True]) def test_is_memory_available(self, is_hybrid_model): self._setup_model_parallel_group(1, 1) @@ -168,6 +176,7 @@ def test_is_memory_available(self, is_hybrid_model): assert not dynamic_context.block_allocator.is_memory_available(1) @pytest.mark.internal + @rounder_override(1) @pytest.mark.parametrize("is_hybrid_model", [False, True]) def test_request_overflow(self, is_hybrid_model: bool): self._setup_model_parallel_group(1, 1) @@ -181,7 +190,6 @@ def test_request_overflow(self, is_hybrid_model: bool): buffer_size_gb=0.01, block_size_tokens=32, max_tokens=None, - rounder=1, is_hybrid_model=is_hybrid_model, ) dynamic_context.max_requests //= 2 @@ -198,6 +206,7 @@ def test_request_overflow(self, is_hybrid_model: bool): ) # Adding more than allowed requests @pytest.mark.internal + @rounder_override(1) @pytest.mark.parametrize("is_hybrid_model", [False, True]) def test_token_overflow_error(self, is_hybrid_model: bool): self._setup_model_parallel_group(1, 1) @@ -211,7 +220,6 @@ def test_token_overflow_error(self, is_hybrid_model: bool): buffer_size_gb=0.1, block_size_tokens=128, max_tokens=200, # setting low, but >= context.max_requests. 
- rounder=1, is_hybrid_model=is_hybrid_model, ) @@ -227,6 +235,7 @@ def test_token_overflow_error(self, is_hybrid_model: bool): ) # Exceeding max token count @pytest.mark.internal + @rounder_override(64) @pytest.mark.parametrize("is_hybrid_model", [False, True]) def test_reset(self, is_hybrid_model: bool): self._setup_model_parallel_group(1, 1) @@ -301,6 +310,7 @@ def test_reset(self, is_hybrid_model: bool): assert torch.all(dynamic_context.mamba_metadata.request_to_mamba_state_idx == -1) @pytest.mark.internal + @rounder_override(64) @pytest.mark.parametrize("is_hybrid_model", [False, True]) def test_allocate_and_release_memory_blocks(self, is_hybrid_model): self._setup_model_parallel_group(1, 1) @@ -349,6 +359,7 @@ def test_allocate_and_release_memory_blocks(self, is_hybrid_model): ) @pytest.mark.internal + @rounder_override(64) @pytest.mark.parametrize("is_hybrid_model", [False, True]) def test_add_request(self, is_hybrid_model: bool): self._setup_model_parallel_group(1, 1) @@ -422,6 +433,7 @@ def test_add_request(self, is_hybrid_model: bool): ) @pytest.mark.internal + @rounder_override(64) def test_add_dummy_requests_parallel_populates_state(self): self._setup_model_parallel_group(1, 1) @@ -520,6 +532,7 @@ def test_add_dummy_requests_parallel_populates_state(self): ) @pytest.mark.internal + @rounder_override(64) def test_add_dummy_requests_parallel_hybrid_allocates_mamba(self): self._setup_model_parallel_group(1, 1) @@ -550,6 +563,7 @@ def test_add_dummy_requests_parallel_hybrid_allocates_mamba(self): assert torch.all(dynamic_context.mamba_ssm_states[:, mamba_idx] == 0) @pytest.mark.internal + @rounder_override(64) def test_add_dummy_requests_parallel_decode_does_not_count_as_prefill(self): self._setup_model_parallel_group(1, 1) @@ -575,6 +589,7 @@ def test_add_dummy_requests_parallel_decode_does_not_count_as_prefill(self): assert dynamic_context.num_prefill_requests == 0 @pytest.mark.internal + @rounder_override(64) @pytest.mark.parametrize("is_hybrid_model", 
[False, True]) def test_update_request(self, is_hybrid_model: bool): self._setup_model_parallel_group(1, 1) @@ -774,6 +789,7 @@ def test_update_request(self, is_hybrid_model: bool): ) @pytest.mark.internal + @rounder_override(64) @pytest.mark.parametrize("is_hybrid_model", [False, True]) def test_release_memory_blocks_for_finished_requests(self, is_hybrid_model): """Test that memory blocks are correctly released for finished requests.""" @@ -846,6 +862,7 @@ def test_release_memory_blocks_for_finished_requests(self, is_hybrid_model): assert mamba_idx[4] == -1 @pytest.mark.internal + @rounder_override(64) @pytest.mark.parametrize("is_hybrid_model", [False, True]) def test_finished_requests_with_multiple_blocks(self, is_hybrid_model): """Test that all memory blocks are correctly released for finished requests that use multiple blocks.""" @@ -913,6 +930,7 @@ def test_finished_requests_with_multiple_blocks(self, is_hybrid_model): assert dynamic_context.block_allocator.total_avail == initial_available_blocks + 6 @pytest.mark.internal + @rounder_override(64) @pytest.mark.parametrize("is_hybrid_model", [False, True]) def test_mamba_states_cache(self, is_hybrid_model: bool): self._setup_model_parallel_group(1, 1) @@ -988,6 +1006,7 @@ def test_mamba_states_cache(self, is_hybrid_model: bool): assert torch.all(ssm_state_layer3 == 40.0) @pytest.mark.internal + @rounder_override(64) def test_calculate_and_store_log_probs(self): self._setup_model_parallel_group(1, 1) dynamic_context = self._get_dynamic_context( @@ -1205,6 +1224,7 @@ def test_calculate_and_store_log_probs(self): current_global_token_offset += expected_len @pytest.mark.internal + @rounder_override(64) def test_pipeline_parallel_uneven_layers(self): """ Test that DynamicInferenceContext synchronizes the total block count across @@ -1215,23 +1235,39 @@ def test_pipeline_parallel_uneven_layers(self): rank = parallel_state.get_pipeline_model_parallel_rank() + mamba_conv_states_shape = (544, 4) + mamba_ssm_states_shape 
= (8, 64, 16) + if rank == 0: - local_num_layers = 12 + mamba_inference_state_config = MambaInferenceStateConfig( + [Symbols.MAMBA] + [Symbols.ATTENTION] * 4, + mamba_conv_states_shape, + mamba_ssm_states_shape, + ) else: - local_num_layers = 4 + mamba_inference_state_config = MambaInferenceStateConfig( + [Symbols.MAMBA] * 4 + [Symbols.ATTENTION], + mamba_conv_states_shape, + mamba_ssm_states_shape, + ) context = DynamicInferenceContext( - params_dtype=torch.float32, - num_layers=local_num_layers, - kv_channels=64, - num_attention_heads=8, - max_sequence_length=128, - buffer_size_gb=0.1, - block_size_tokens=16, - max_tokens=1024, - pipeline_model_parallel_size=pp_size, - tensor_model_parallel_size=1, - unified_memory_level=0, + model_config=TransformerConfig( + params_dtype=torch.float32, + num_layers=10, + kv_channels=64, + num_attention_heads=8, + pipeline_model_parallel_size=pp_size, + tensor_model_parallel_size=1, + pipeline_dtype=torch.float32, + ), + inference_config=InferenceConfig( + max_sequence_length=128, + buffer_size_gb=0.1, + block_size_tokens=16, + max_tokens=1024, + unified_memory_level=0, + ), ) # Collect the total block counts on each rank diff --git a/tests/unit_tests/inference/engines/test_dynamic_engine.py b/tests/unit_tests/inference/engines/test_dynamic_engine.py index d5803b3638e..2e935cab4bd 100644 --- a/tests/unit_tests/inference/engines/test_dynamic_engine.py +++ b/tests/unit_tests/inference/engines/test_dynamic_engine.py @@ -13,9 +13,7 @@ from transformer_engine.pytorch.fp8 import check_fp8_support from megatron.core import parallel_state -from megatron.core.inference.contexts.attention_context.mamba_metadata import ( - MambaInferenceStateConfig, -) +from megatron.core.inference.config import InferenceConfig, MambaInferenceStateConfig from megatron.core.inference.contexts.dynamic_context import ( ActiveRequestCountOverflowError, BlockOverflowError, @@ -28,9 +26,6 @@ from 
megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, ) -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.text_generation_controllers.text_generation_controller import ( TextGenerationController, @@ -48,11 +43,7 @@ from megatron.core.transformer.cuda_graphs import CudaGraphManager, _CudagraphGlobalRecord from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.transformer_config import TransformerConfig -from megatron.core.utils import ( - get_mamba_inference_state_config_from_model, - is_fa_min_version, - is_te_min_version, -) +from megatron.core.utils import is_fa_min_version, is_te_min_version from tests.unit_tests.test_utilities import Utils @@ -223,26 +214,22 @@ def _build_inference_context( # Inference context. context = DynamicInferenceContext( - params_dtype=transformer_config.params_dtype, - num_layers=transformer_config.num_layers - // transformer_config.pipeline_model_parallel_size, - kv_channels=transformer_config.kv_channels, - num_attention_heads=transformer_config.num_query_groups, - max_sequence_length=test_config.max_sequence_length, - num_cuda_graphs=test_config.num_cuda_graphs, - use_cuda_graphs_for_non_decode_steps=True, - buffer_size_gb=test_config.context_buffer_size_gb, - paused_buffer_size_gb=test_config.context_paused_buffer_size_gb, - block_size_tokens=test_config.context_block_size_tokens, - max_requests=test_config.context_max_requests, - max_tokens=test_config.context_max_tokens, - tensor_model_parallel_size=transformer_config.tensor_model_parallel_size, - pipeline_model_parallel_size=transformer_config.pipeline_model_parallel_size, - mamba_inference_state_config=mamba_inference_state_config, - materialize_only_last_token_logits=test_config.materialize_only_last_token_logits, - 
use_flashinfer_fused_rope=None, # default to using flash-infer if available - # this is for compatibility with the LTS environment - unified_memory_level=0, # unit tests currently broken with UVM + model_config=transformer_config, + inference_config=InferenceConfig( + max_sequence_length=test_config.max_sequence_length, + num_cuda_graphs=test_config.num_cuda_graphs, + use_cuda_graphs_for_non_decode_steps=True, + buffer_size_gb=test_config.context_buffer_size_gb, + paused_buffer_size_gb=test_config.context_paused_buffer_size_gb, + block_size_tokens=test_config.context_block_size_tokens, + max_requests=test_config.context_max_requests, + max_tokens=test_config.context_max_tokens, + mamba_inference_state_config=mamba_inference_state_config, + materialize_only_last_token_logits=test_config.materialize_only_last_token_logits, + use_flashinfer_fused_rope=None, # default to using flash-infer if available + # this is for compatibility with the LTS environment + unified_memory_level=0, # unit tests currently broken with UVM + ), ) return context @@ -382,17 +369,7 @@ def _build_test_env(cls, test_config): model.eval() - mamba_inference_state_config = get_mamba_inference_state_config_from_model(model) - - # Inference config. - inference_config = InferenceWrapperConfig( - hidden_size=transformer_config.hidden_size, - inference_batch_times_seqlen_threshold=400, - fp32_residual_connection=False, - params_dtype=transformer_config.params_dtype, - fp8=transformer_config.fp8, - padded_vocab_size=test_config.vocab_size, - ) + mamba_inference_state_config = MambaInferenceStateConfig.from_model(model) # Inference context. inference_context = cls._build_inference_context( @@ -403,7 +380,7 @@ def _build_test_env(cls, test_config): ) # Inference model wrapper. 
- inference_wrapped_model = GPTInferenceWrapper(model, inference_config, inference_context) + inference_wrapped_model = GPTInferenceWrapper(model, inference_context) # Note: the following is taken from AbstractModelInferenceWrapper.prep_model_for_inference(). inference_wrapped_model.model_is_pipeline_parallel = not ( @@ -424,13 +401,7 @@ def _build_test_env(cls, test_config): CudaGraphManager.global_mempool = None # Inference engine. - engine = DynamicInferenceEngine( - text_generation_controller, - inference_context, - random_seed=test_config.random_seed, - enable_cuda_graph=transformer_config.cuda_graph_impl == "local", - enable_chunked_prefill=test_config.enable_chunked_prefill, - ) + engine = DynamicInferenceEngine(text_generation_controller, inference_context) # Test env. env = DynamicEngineTestEnv(config=test_config, requests=requests, engine=engine) diff --git a/tests/unit_tests/inference/engines/test_static_engine.py b/tests/unit_tests/inference/engines/test_static_engine.py index 03b3712e39a..483a21d13bd 100644 --- a/tests/unit_tests/inference/engines/test_static_engine.py +++ b/tests/unit_tests/inference/engines/test_static_engine.py @@ -20,9 +20,6 @@ from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, ) -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.text_generation_controllers.text_generation_controller import ( TextGenerationController, @@ -85,20 +82,11 @@ def setup_engine( ).cuda() gpt_model.to(inference_config_params_dtype) - inference_wrapper_config = InferenceWrapperConfig( - hidden_size=self.hidden_size, - inference_batch_times_seqlen_threshold=400, - inference_max_requests=self.batch_size, - fp32_residual_connection=False, - params_dtype=inference_config_params_dtype, - padded_vocab_size=self.vocab_size, + inference_context = 
StaticInferenceContext( + max_batch_size=self.batch_size, max_sequence_length=self.sequence_length ) - inference_context = StaticInferenceContext.from_config(inference_wrapper_config) - - inference_wrapped_model = GPTInferenceWrapper( - gpt_model, inference_wrapper_config, inference_context - ) + inference_wrapped_model = GPTInferenceWrapper(gpt_model, inference_context) self.mock_tokenizer = mock.Mock() # Set required tokenizer attributes before engine creation self.mock_tokenizer.vocab_size = self.vocab_size @@ -200,8 +188,6 @@ def test_generate_dynamic(self, batch_size: int, num_trials: int, empty_prompt: assert len(results) == batch_size for result in results: - if isinstance(result, DynamicInferenceRequestRecord): - result = result.merge() assert isinstance(result, InferenceRequest), ( "expected ; found <%s>." % type(result).__name__ ) diff --git a/tests/unit_tests/inference/model_inference_wrappers/gpt/test_gpt_inference_wrapper.py b/tests/unit_tests/inference/model_inference_wrappers/gpt/test_gpt_inference_wrapper.py index 07afebe1067..d7ddaa1e680 100644 --- a/tests/unit_tests/inference/model_inference_wrappers/gpt/test_gpt_inference_wrapper.py +++ b/tests/unit_tests/inference/model_inference_wrappers/gpt/test_gpt_inference_wrapper.py @@ -10,9 +10,6 @@ from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, ) -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) from megatron.core.models.gpt.gpt_layer_specs import ( get_gpt_layer_local_spec, get_gpt_layer_with_transformer_engine_spec, @@ -53,27 +50,15 @@ def setup_model(self, tensor_parallel_size, pipeline_parallel_size): post_process=parallel_state.is_pipeline_last_stage(), ).cuda() - inference_wrapper_config = InferenceWrapperConfig( - hidden_size=hidden_size, - inference_batch_times_seqlen_threshold=20, - inference_max_requests=self.batch_size, - fp32_residual_connection=False, - 
params_dtype=torch.float, - padded_vocab_size=self.vocab_size, - ) + inference_context = StaticInferenceContext(self.batch_size, self.sequence_length) - inference_context = StaticInferenceContext.from_config(inference_wrapper_config) - - self.inference_wrapped_model = GPTInferenceWrapper( - gpt_model, inference_wrapper_config, inference_context - ) + self.inference_wrapped_model = GPTInferenceWrapper(gpt_model, inference_context) def teardown_method(self, method): Utils.destroy_model_parallel() - # This will call the inference_wrapped_model.forward_pass_with_pipeline_parallel_small_input_batch() @pytest.mark.parametrize("materialize_only_last_token_logits", [True, False]) - def test_inference_pipeline_parallel_small_size(self, materialize_only_last_token_logits): + def test_inference_pipeline_parallel(self, materialize_only_last_token_logits): self.setup_model(tensor_parallel_size=2, pipeline_parallel_size=2) batch_prompt_tokens = ( @@ -82,7 +67,7 @@ def test_inference_pipeline_parallel_small_size(self, materialize_only_last_toke .cuda() ) self.inference_wrapped_model.prep_model_for_inference() - self.inference_wrapped_model.inference_context.materialize_only_last_token_logits = ( + self.inference_wrapped_model.inference_context.config.materialize_only_last_token_logits = ( materialize_only_last_token_logits ) @@ -107,42 +92,6 @@ def test_inference_pipeline_parallel_small_size(self, materialize_only_last_toke self.vocab_size, ), f"Shape mismatch . 
Expected {(self.batch_size, logits_seq_len, self.vocab_size)}, but got {logits.shape}" - # This will call the inference_wrapped_model.forward_pass_with_pipeline_parallel_large_input_batch() - @pytest.mark.parametrize("materialize_only_last_token_logits", [True, False]) - def test_inference_pipeline_parallel_large_size(self, materialize_only_last_token_logits): - self.setup_model(tensor_parallel_size=2, pipeline_parallel_size=2) - - batch_prompt_tokens = ( - torch.randint(low=0, high=self.vocab_size, size=(self.batch_size, self.sequence_length)) - .int() - .cuda() - ) - self.inference_wrapped_model.prep_model_for_inference() - self.inference_wrapped_model.inference_context.materialize_only_last_token_logits = ( - materialize_only_last_token_logits - ) - - inference_input = self.inference_wrapped_model.prep_inference_input( - prompts_tokens=batch_prompt_tokens - ) - - inference_input_for_context_window = ( - self.inference_wrapped_model.get_batch_for_context_window(inference_input, 0, 10) - ) - - logits_seq_len = 1 if materialize_only_last_token_logits else 10 - - logits = self.inference_wrapped_model.run_one_forward_step( - inference_input_for_context_window - ) - - if parallel_state.is_pipeline_last_stage(): - assert logits.shape == ( - self.batch_size, - logits_seq_len, - self.vocab_size, - ), f"Shape mismatch . 
Expected {(self.batch_size, logits_seq_len, self.vocab_size)}, but got {logits.shape}" - @pytest.mark.parametrize("materialize_only_last_token_logits", [True, False]) def test_inference_only_tensor_parallel(self, materialize_only_last_token_logits): self.setup_model(tensor_parallel_size=4, pipeline_parallel_size=1) @@ -153,7 +102,7 @@ def test_inference_only_tensor_parallel(self, materialize_only_last_token_logits .cuda() ) self.inference_wrapped_model.prep_model_for_inference() - self.inference_wrapped_model.inference_context.materialize_only_last_token_logits = ( + self.inference_wrapped_model.inference_context.config.materialize_only_last_token_logits = ( materialize_only_last_token_logits ) diff --git a/tests/unit_tests/inference/model_inference_wrappers/t5/test_t5_inference_wrapper.py b/tests/unit_tests/inference/model_inference_wrappers/t5/test_t5_inference_wrapper.py index 36d5187b5eb..eb06f6ed78b 100644 --- a/tests/unit_tests/inference/model_inference_wrappers/t5/test_t5_inference_wrapper.py +++ b/tests/unit_tests/inference/model_inference_wrappers/t5/test_t5_inference_wrapper.py @@ -1,3 +1,5 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ from argparse import Namespace from copy import deepcopy from unittest import mock @@ -7,9 +9,6 @@ from megatron.core import parallel_state from megatron.core.inference.contexts import StaticInferenceContext -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) from megatron.core.inference.model_inference_wrappers.t5.t5_inference_wrapper import ( T5InferenceWrapper, ) @@ -77,19 +76,9 @@ def setup_model(self, tensor_parallel_size, pipeline_parallel_size): add_decoder=True, ).cuda() - inference_wrapper_config = InferenceWrapperConfig( - hidden_size=hidden_size, - inference_batch_times_seqlen_threshold=-1, - fp32_residual_connection=False, - params_dtype=torch.float, - padded_vocab_size=self.vocab_size, - ) - - inference_context = StaticInferenceContext.from_config(inference_wrapper_config) + inference_context = StaticInferenceContext(max_batch_size=8, max_sequence_length=2560) - self.inference_wrapped_model = T5InferenceWrapper( - t5_model, inference_wrapper_config, inference_context - ) + self.inference_wrapped_model = T5InferenceWrapper(t5_model, inference_context) def teardown_method(self, method): Utils.destroy_model_parallel() diff --git a/tests/unit_tests/inference/model_inference_wrappers/test_model_inference_wrapper_config.py b/tests/unit_tests/inference/model_inference_wrappers/test_model_inference_wrapper_config.py deleted file mode 100644 index 794634760d0..00000000000 --- a/tests/unit_tests/inference/model_inference_wrappers/test_model_inference_wrapper_config.py +++ /dev/null @@ -1,21 +0,0 @@ -import torch - -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) - - -class TestModelInferenceWrapperConfig: - - def test_inference_config(self): - inference_config = InferenceWrapperConfig( - hidden_size=10, - inference_batch_times_seqlen_threshold=10, - padded_vocab_size=10, - params_dtype=torch.float, - 
fp32_residual_connection=False, - ) - inference_config.add_attributes({"abc": 45}) - assert ( - inference_config.abc == 45 - ), f"min tokens not set correctly. it is {inference_config.min_tokens}" diff --git a/tests/unit_tests/inference/test_inference_config.py b/tests/unit_tests/inference/test_inference_config.py new file mode 100644 index 00000000000..6d58328dade --- /dev/null +++ b/tests/unit_tests/inference/test_inference_config.py @@ -0,0 +1,17 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +import dataclasses + +from megatron.core.inference.config import InferenceConfig +from megatron.core.transformer.transformer_config import TransformerConfig + + +class TestInferenceConfig: + def test_mutual_exclusivity_with_transformer_config(self): + """ + Ensure mutual exclusivity between fields in `InferenceConfig` and + `TransformerConfig`. + """ + dynamic_inference_config_fields = set(dataclasses.fields(InferenceConfig)) + transformer_config_fields = set(dataclasses.fields(TransformerConfig)) + assert len(dynamic_inference_config_fields.intersection(transformer_config_fields)) == 0 diff --git a/tests/unit_tests/inference/test_wandb_logging.py b/tests/unit_tests/inference/test_wandb_logging.py index cab464af503..1417926f13b 100644 --- a/tests/unit_tests/inference/test_wandb_logging.py +++ b/tests/unit_tests/inference/test_wandb_logging.py @@ -7,6 +7,7 @@ import pytest import torch +from megatron.core.inference.config import InferenceConfig from megatron.core.inference.contexts.dynamic_context import DynamicInferenceContext from megatron.core.inference.engines import DynamicInferenceEngine from megatron.core.inference.inference_request import DynamicInferenceRequest @@ -15,6 +16,7 @@ TextGenerationController, ) from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig from tests.unit_tests.test_utilities import Utils @@ -50,20 +52,26 @@ 
def _get_dynamic_context( max_sequence_length=512, buffer_size_gb=0.03, block_size_tokens=128, + logging_step_interval=0, metrics_writer=None, ): """Helper to create a DynamicInferenceContext.""" return DynamicInferenceContext( - params_dtype=params_dtype, - num_layers=num_layers, - kv_channels=kv_channels, - num_attention_heads=num_attention_heads, - max_sequence_length=max_sequence_length, - num_cuda_graphs=None, - buffer_size_gb=buffer_size_gb, - block_size_tokens=block_size_tokens, - metrics_writer=metrics_writer, - unified_memory_level=0, # unit tests currently broken with UVM + model_config=TransformerConfig( + params_dtype=params_dtype, + num_layers=num_layers, + kv_channels=kv_channels, + num_attention_heads=num_attention_heads, + ), + inference_config=InferenceConfig( + max_sequence_length=max_sequence_length, + num_cuda_graphs=None, + buffer_size_gb=buffer_size_gb, + block_size_tokens=block_size_tokens, + unified_memory_level=0, # unit tests currently broken with UVM + logging_step_interval=logging_step_interval, + metrics_writer=metrics_writer, + ), ) @pytest.mark.internal @@ -195,12 +203,14 @@ def test_kvcache_utilization_stats_types(self): @pytest.mark.internal @patch('megatron.core.inference.engines.dynamic_engine.HAVE_WANDB', True) def test_engine_logging_step_interval_zero(self): - """Test that no logging occurs when inference_logging_step_interval is 0.""" + """Test that no logging occurs when logging_step_interval is 0.""" mock_wandb = Mock() mock_wandb.__name__ = "wandb" mock_wandb.log = Mock() - dynamic_context = self._get_dynamic_context(metrics_writer=mock_wandb) + dynamic_context = self._get_dynamic_context( + logging_step_interval=0, metrics_writer=mock_wandb + ) # Create mock controller with proper spec to pass isinstance checks mock_controller = create_autospec(TextGenerationController, instance=True) @@ -210,12 +220,7 @@ def test_engine_logging_step_interval_zero(self): mock_controller.inference_wrapped_model.model.config = Mock() 
mock_controller.inference_wrapped_model.model.config.cuda_graph_impl = "none" - engine = DynamicInferenceEngine( - controller=mock_controller, - context=dynamic_context, - random_seed=123, - inference_logging_step_interval=0, # Disabled - ) + engine = DynamicInferenceEngine(controller=mock_controller, context=dynamic_context) # Verify log was never called mock_wandb.log.assert_not_called() @@ -225,15 +230,16 @@ def test_paused_requests_in_stats(self): """Test that paused requests are correctly reflected in stats.""" set_rounder(1) dynamic_context = DynamicInferenceContext( - params_dtype=torch.float32, - num_layers=2, - kv_channels=64, - num_attention_heads=8, - max_sequence_length=128, - num_cuda_graphs=None, - buffer_size_gb=0.01, # Small buffer to force pausing - block_size_tokens=32, - unified_memory_level=0, # unit tests currently broken with UVM + model_config=TransformerConfig( + params_dtype=torch.float32, num_layers=2, kv_channels=64, num_attention_heads=8 + ), + inference_config=InferenceConfig( + max_sequence_length=128, + num_cuda_graphs=None, + buffer_size_gb=0.01, # Small buffer to force pausing + block_size_tokens=32, + unified_memory_level=0, # unit tests currently broken with UVM + ), ) # Add multiple requests to potentially trigger pausing @@ -257,7 +263,7 @@ def test_paused_requests_in_stats(self): @pytest.mark.internal def test_metrics_writer_none_handling(self): """Test that engine handles None metrics_writer gracefully.""" - dynamic_context = self._get_dynamic_context(metrics_writer=None) + dynamic_context = self._get_dynamic_context(logging_step_interval=10, metrics_writer=None) # Create mock controller with proper spec to pass isinstance checks mock_controller = create_autospec(TextGenerationController, instance=True) @@ -268,13 +274,8 @@ def test_metrics_writer_none_handling(self): mock_controller.inference_wrapped_model.model.config.cuda_graph_impl = "none" # Should not raise error even with logging interval set - engine = 
DynamicInferenceEngine( - controller=mock_controller, - context=dynamic_context, - random_seed=123, - inference_logging_step_interval=10, - ) + engine = DynamicInferenceEngine(controller=mock_controller, context=dynamic_context) # Verify engine was created successfully - assert engine.inference_logging_step_interval == 10 - assert engine.context.metrics_writer is None + assert engine.logging_step_interval == 10 + assert engine.metrics_writer is None diff --git a/tests/unit_tests/inference/text_generation_controllers/test_encoder_decoder_text_generation_controller.py b/tests/unit_tests/inference/text_generation_controllers/test_encoder_decoder_text_generation_controller.py index 93a208710fc..5bd39ec1324 100644 --- a/tests/unit_tests/inference/text_generation_controllers/test_encoder_decoder_text_generation_controller.py +++ b/tests/unit_tests/inference/text_generation_controllers/test_encoder_decoder_text_generation_controller.py @@ -1,3 +1,5 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ import random import string import time @@ -12,9 +14,6 @@ from megatron.core.inference.contexts import StaticInferenceContext from megatron.core.inference.inference_request import InferenceRequest, Status -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) from megatron.core.inference.model_inference_wrappers.t5.t5_inference_wrapper import ( T5InferenceWrapper, ) @@ -85,19 +84,9 @@ def setup_method(self, method): add_decoder=True, ).cuda() - inference_wrapper_config = InferenceWrapperConfig( - hidden_size=hidden_size, - inference_batch_times_seqlen_threshold=-1, - fp32_residual_connection=False, - params_dtype=torch.float, - padded_vocab_size=self.vocab_size, - ) - - inference_context = StaticInferenceContext.from_config(inference_wrapper_config) + inference_context = StaticInferenceContext(max_batch_size=8, max_sequence_length=2560) - inference_wrapped_model = T5InferenceWrapper( - t5_model, inference_wrapper_config, inference_context - ) + inference_wrapped_model = T5InferenceWrapper(t5_model, inference_context) self.mock_tokenizer = mock.Mock() diff --git a/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py b/tests/unit_tests/inference/text_generation_controllers/test_text_generation_controller.py similarity index 96% rename from tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py rename to tests/unit_tests/inference/text_generation_controllers/test_text_generation_controller.py index 0885401e7a0..bdf95c2d9bf 100644 --- a/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py +++ b/tests/unit_tests/inference/text_generation_controllers/test_text_generation_controller.py @@ -14,6 +14,7 @@ from transformer_engine.pytorch.fp8 import check_fp8_support from megatron.core import parallel_state +from megatron.core.inference.config import InferenceConfig from 
megatron.core.inference.contexts import DynamicInferenceContext, StaticInferenceContext from megatron.core.inference.contexts.dynamic_context import MaxSequenceLengthOverflowError from megatron.core.inference.inference_request import ( @@ -24,9 +25,6 @@ from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, ) -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.text_generation_controllers.text_generation_controller import ( TextGenerationController, @@ -100,37 +98,24 @@ def setup_model( if dtype == torch.bfloat16: gpt_model = Float16Module(gpt_model.config, gpt_model) - inference_wrapper_config = InferenceWrapperConfig( - hidden_size=self.hidden_size, - inference_batch_times_seqlen_threshold=-1, - inference_max_seq_length=2048, - inference_max_requests=16 if fp8 else self.batch_size, - fp32_residual_connection=False, - params_dtype=dtype, - padded_vocab_size=self.vocab_size, - ) - if static: - inference_context = StaticInferenceContext.from_config(inference_wrapper_config) + inference_context = StaticInferenceContext( + max_batch_size=16 if fp8 else self.batch_size, max_sequence_length=2048 + ) else: inference_context = DynamicInferenceContext( - params_dtype=dtype, - num_layers=transformer_config.num_layers // pipeline_model_parallel_size, - kv_channels=transformer_config.kv_channels, - num_attention_heads=transformer_config.num_attention_heads, - tensor_model_parallel_size=transformer_config.tensor_model_parallel_size, - pipeline_model_parallel_size=transformer_config.pipeline_model_parallel_size, - max_sequence_length=2048, - buffer_size_gb=0.2, - materialize_only_last_token_logits=False, - use_flashinfer_fused_rope=None, # default to using flash-infer if available - # this is for compatibility with the LTS environment - unified_memory_level=0, # 
unit tests currently broken with UVM + model_config=transformer_config, + inference_config=InferenceConfig( + max_sequence_length=2048, + buffer_size_gb=0.2, + materialize_only_last_token_logits=False, + use_flashinfer_fused_rope=None, # default to using flash-infer if available + # this is for compatibility with the LTS environment + unified_memory_level=0, # unit tests currently broken with UVM + ), ) - inference_wrapped_model = GPTInferenceWrapper( - gpt_model, inference_wrapper_config, inference_context - ) + inference_wrapped_model = GPTInferenceWrapper(gpt_model, inference_context) inference_wrapped_model.model_is_pipeline_parallel = not ( parallel_state.is_pipeline_first_stage() and parallel_state.is_pipeline_last_stage() diff --git a/tests/unit_tests/inference/text_generation_controllers/test_vlm_text_generation_controller.py b/tests/unit_tests/inference/text_generation_controllers/test_vlm_text_generation_controller.py index 31bf415ba56..50db5cc0afc 100644 --- a/tests/unit_tests/inference/text_generation_controllers/test_vlm_text_generation_controller.py +++ b/tests/unit_tests/inference/text_generation_controllers/test_vlm_text_generation_controller.py @@ -1,3 +1,5 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ import copy import os import random @@ -13,9 +15,6 @@ from megatron.core.inference.contexts import StaticInferenceContext from megatron.core.inference.inference_request import InferenceRequest, Status, VLMInferenceRequest -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) from megatron.core.inference.model_inference_wrappers.multimodal.vlm_inference_wrapper import ( VLMInferenceWrapper, ) @@ -92,19 +91,9 @@ def setup_method(self, method): self.image_token_index = self.model.image_token_index self.model = Float16Module(self.model.config, self.model) - inference_wrapper_config = InferenceWrapperConfig( - hidden_size=self.language_hidden_size, - inference_batch_times_seqlen_threshold=-1, - fp32_residual_connection=False, - params_dtype=torch.float, - padded_vocab_size=self.language_vocab_size, - ) - - inference_context = StaticInferenceContext.from_config(inference_wrapper_config) + inference_context = StaticInferenceContext(max_batch_size=8, max_sequence_length=2560) - inference_wrapped_model = VLMInferenceWrapper( - self.model, inference_wrapper_config, inference_context - ) + inference_wrapped_model = VLMInferenceWrapper(self.model, inference_context) self.mock_tokenizer = mock.Mock() diff --git a/tests/unit_tests/models/test_gpt_model.py b/tests/unit_tests/models/test_gpt_model.py index cf3bd40ee4b..87aba9c6ed9 100644 --- a/tests/unit_tests/models/test_gpt_model.py +++ b/tests/unit_tests/models/test_gpt_model.py @@ -12,6 +12,7 @@ from megatron.core import parallel_state from megatron.core.hyper_comm_grid import HyperCommGrid +from megatron.core.inference.config import InferenceConfig from megatron.core.inference.contexts.dynamic_context import DynamicInferenceContext from megatron.core.inference.inference_request import DynamicInferenceRequest from megatron.core.inference.sampling_params import SamplingParams @@ -392,14 +393,18 @@ def test_dynamic_inference_padding_with_fp8(self): config = 
self.gpt_model.config inference_context = DynamicInferenceContext( - params_dtype=config.params_dtype, - num_layers=config.num_layers, - kv_channels=config.hidden_size // config.num_attention_heads, - num_attention_heads=config.num_attention_heads, - max_sequence_length=self.gpt_model.module.max_sequence_length, - buffer_size_gb=1.0, - block_size_tokens=256, - materialize_only_last_token_logits=False, + model_config=TransformerConfig( + params_dtype=config.params_dtype, + num_layers=config.num_layers, + kv_channels=config.hidden_size // config.num_attention_heads, + num_attention_heads=config.num_attention_heads, + ), + inference_config=InferenceConfig( + max_sequence_length=self.gpt_model.module.max_sequence_length, + buffer_size_gb=1.0, + block_size_tokens=256, + materialize_only_last_token_logits=False, + ), ) # Add a request with 10 tokens. Since 10 is not a multiple of 64, diff --git a/tests/unit_tests/models/test_gpt_model_batch_invariant.py b/tests/unit_tests/models/test_gpt_model_batch_invariant.py index ead9125e5ec..9ab7e445c0d 100644 --- a/tests/unit_tests/models/test_gpt_model_batch_invariant.py +++ b/tests/unit_tests/models/test_gpt_model_batch_invariant.py @@ -5,17 +5,15 @@ import torch import torch.distributed as dist +from megatron.core.inference.config import InferenceConfig from megatron.core.inference.contexts.dynamic_context import DynamicInferenceContext from megatron.core.inference.engines.dynamic_engine import DynamicInferenceEngine from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, ) -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) from megatron.core.inference.sampling_params import SamplingParams -from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import ( - SimpleTextGenerationController, +from megatron.core.inference.text_generation_controllers.text_generation_controller 
import ( + TextGenerationController, ) from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec from megatron.core.models.gpt.gpt_model import GPTModel @@ -91,6 +89,8 @@ def _build_flash_attn_bik_model(seq_len: int, vocab_size: int, hidden_size: int normalization="RMSNorm", params_dtype=torch.bfloat16, attention_backend=AttnBackend.flash, + fp32_residual_connection=False, + nccl_all_reduce_for_prefill=False, ) cfg.fp16 = False cfg.bf16 = True @@ -184,32 +184,21 @@ def test_dynamic_engine_matches_batched_forward_rl(self): inference_model = Float16Module(base_model.config, base_model).cuda().eval() ctx = DynamicInferenceContext( - params_dtype=torch.bfloat16, - num_layers=base_model.config.num_layers, - kv_channels=base_model.config.kv_channels, - num_attention_heads=base_model.config.num_attention_heads, - max_sequence_length=seq_len, - buffer_size_gb=0.125, - block_size_tokens=16, - num_cuda_graphs=None, - materialize_only_last_token_logits=False, - use_cuda_graphs_for_non_decode_steps=False, - unified_memory_level=0, + model_config=base_model.config, + inference_config=InferenceConfig( + max_sequence_length=seq_len, + buffer_size_gb=0.125, + block_size_tokens=16, + num_cuda_graphs=None, + materialize_only_last_token_logits=False, + use_cuda_graphs_for_non_decode_steps=False, + unified_memory_level=0, + ), ) - wrapper_cfg = InferenceWrapperConfig( - hidden_size=base_model.config.hidden_size, - inference_batch_times_seqlen_threshold=-1, - fp32_residual_connection=False, - params_dtype=torch.bfloat16, - padded_vocab_size=vocab_size, - inference_max_seq_length=seq_len, - inference_max_requests=8, - nccl_all_reduce_for_prefill=False, - ) - wrapper = GPTInferenceWrapper(inference_model, wrapper_cfg, ctx) + wrapper = GPTInferenceWrapper(inference_model, ctx) tokenizer = DummyTokenizer(vocab_size=vocab_size, bos=None, eod=vocab_size - 1, pad=0) - controller = SimpleTextGenerationController(wrapper, tokenizer) + controller = 
TextGenerationController(wrapper, tokenizer) engine = DynamicInferenceEngine( controller=controller, context=ctx, enable_cuda_graph=False, random_seed=123 ) @@ -273,32 +262,21 @@ def test_dynamic_engine_is_batch_invariant(self): def _run_engine_with_order(order): ctx = DynamicInferenceContext( - params_dtype=torch.bfloat16, - num_layers=base_model.config.num_layers, - kv_channels=base_model.config.kv_channels, - num_attention_heads=base_model.config.num_attention_heads, - max_sequence_length=seq_len, - buffer_size_gb=0.125, - block_size_tokens=16, - num_cuda_graphs=None, - materialize_only_last_token_logits=False, - use_cuda_graphs_for_non_decode_steps=False, - unified_memory_level=0, + model_config=base_model.config, + inference_config=InferenceConfig( + max_sequence_length=seq_len, + buffer_size_gb=0.125, + block_size_tokens=16, + num_cuda_graphs=None, + materialize_only_last_token_logits=False, + use_cuda_graphs_for_non_decode_steps=False, + unified_memory_level=0, + ), ) - wrapper_cfg = InferenceWrapperConfig( - hidden_size=base_model.config.hidden_size, - inference_batch_times_seqlen_threshold=-1, - fp32_residual_connection=False, - params_dtype=torch.bfloat16, - padded_vocab_size=vocab_size, - inference_max_seq_length=seq_len, - inference_max_requests=8, - nccl_all_reduce_for_prefill=False, - ) - wrapper = GPTInferenceWrapper(inference_model, wrapper_cfg, ctx) + wrapper = GPTInferenceWrapper(inference_model, ctx) tokenizer = DummyTokenizer(vocab_size=vocab_size, bos=None, eod=vocab_size - 1, pad=0) - controller = SimpleTextGenerationController(wrapper, tokenizer) + controller = TextGenerationController(wrapper, tokenizer) engine = DynamicInferenceEngine( controller=controller, context=ctx, enable_cuda_graph=False, random_seed=123 ) diff --git a/tests/unit_tests/models/test_mamba_model.py b/tests/unit_tests/models/test_mamba_model.py index 9eb7b2dea9a..29e3630d7bb 100644 --- a/tests/unit_tests/models/test_mamba_model.py +++ 
b/tests/unit_tests/models/test_mamba_model.py @@ -10,6 +10,7 @@ from megatron.core import parallel_state from megatron.core.hyper_comm_grid import HyperCommGrid +from megatron.core.inference.config import InferenceConfig, MambaInferenceStateConfig from megatron.core.inference.contexts import BaseInferenceContext, StaticInferenceContext from megatron.core.inference.contexts.dynamic_context import DynamicInferenceContext from megatron.core.inference.inference_request import DynamicInferenceRequest @@ -21,12 +22,7 @@ from megatron.core.transformer import TransformerConfig from megatron.core.transformer.enums import AttnBackend from megatron.core.transformer.module import Float16Module -from megatron.core.utils import ( - divide, - get_mamba_inference_state_config_from_model, - is_fa_min_version, - is_torch_min_version, -) +from megatron.core.utils import divide, is_fa_min_version, is_torch_min_version from tests.unit_tests.test_utilities import Utils @@ -344,20 +340,17 @@ def test_dynamic_inference_padding_with_fp8(self): self.model.eval() config = self.model.config - mamba_inference_state_config = get_mamba_inference_state_config_from_model( - self.model.module - ) + mamba_inference_state_config = MambaInferenceStateConfig.from_model(self.model.module) inference_context = DynamicInferenceContext( - params_dtype=config.params_dtype, - num_layers=config.num_layers, - kv_channels=config.hidden_size // config.num_attention_heads, - num_attention_heads=config.num_attention_heads, - max_sequence_length=self.model.module.max_sequence_length, - buffer_size_gb=1.0, - block_size_tokens=256, - materialize_only_last_token_logits=False, - mamba_inference_state_config=mamba_inference_state_config, + model_config=self.model.config, + inference_config=InferenceConfig( + max_sequence_length=self.model.module.max_sequence_length, + buffer_size_gb=1.0, + block_size_tokens=256, + materialize_only_last_token_logits=False, + mamba_inference_state_config=mamba_inference_state_config, + ), 
) # Add a request with 10 tokens. Since 10 is not a multiple of 64 (TOKEN_ROUNDER), diff --git a/tests/unit_tests/models/test_mamba_moe_model.py b/tests/unit_tests/models/test_mamba_moe_model.py index 3c7ae93a17c..a5590a0ffad 100644 --- a/tests/unit_tests/models/test_mamba_moe_model.py +++ b/tests/unit_tests/models/test_mamba_moe_model.py @@ -166,6 +166,7 @@ "moe_layer_freq": 1, "moe_layer_recompute": False, "moe_pad_expert_input_to_capacity": False, + "moe_pad_experts_for_cuda_graph_inference": False, "moe_per_layer_logging": False, "moe_permute_fusion": False, "moe_router_bias_update_rate": 0.001, @@ -197,6 +198,7 @@ "mtp_num_layers": None, "mtp_standalone": False, "multi_latent_attention": False, + "nccl_all_reduce_for_prefill": False, "no_rope_freq": None, "no_sync_func": None, "normalization": "RMSNorm", diff --git a/tools/run_dynamic_text_generation_server.py b/tools/run_dynamic_text_generation_server.py index 615073b8fd0..74f1e69679e 100644 --- a/tools/run_dynamic_text_generation_server.py +++ b/tools/run_dynamic_text_generation_server.py @@ -5,25 +5,19 @@ import torch -from examples.inference.gpt.gpt_dynamic_inference import ( - add_dynamic_inference_args, - get_inference_context, - get_inference_controller, - get_model, -) from megatron.core.inference.engines import DynamicInferenceEngine from megatron.core.inference.text_generation_server.dynamic_text_gen_server import run_flask_server -from megatron.core.tokenizers.text.utils.build_tokenizer import build_tokenizer -from megatron.core.utils import get_mamba_inference_state_config_from_model, trace_async_exceptions +from megatron.core.utils import trace_async_exceptions +from megatron.inference.utils import add_inference_args, get_dynamic_inference_engine from megatron.post_training.arguments import add_modelopt_args -from megatron.training import get_args, get_tokenizer +from megatron.training import get_args from megatron.training.initialize import initialize_megatron def 
add_text_generation_server_args(parser: argparse.ArgumentParser): """Adds the required command line arguments for running the text generation server.""" parser = add_modelopt_args(parser) - parser = add_dynamic_inference_args(parser) + parser = add_inference_args(parser) parser.add_argument("--port", type=int, default=5000, help="Port for Flask server to run on") return parser @@ -74,36 +68,12 @@ async def run_text_generation_server( args_defaults={'no_load_rng': True, 'no_load_optim': True}, ) - args = get_args() - model = get_model() - - if args.legacy_tokenizer: - tokenizer = get_tokenizer() - else: - tokenizer = build_tokenizer(args) - - mamba_inference_state_config = get_mamba_inference_state_config_from_model(model) - # Enable return_log_probs to allow prompt logprobs computation for echo=True requests # This sets materialize_only_last_token_logits=False in the inference context, # which is required for lm-eval compatibility (loglikelihood evaluation tasks) + args = get_args() args.return_log_probs = True - context = get_inference_context( - None, - None, - calculate_max_sequence_length_from_requests=False, - mamba_inference_state_config=mamba_inference_state_config, - ) - - controller = get_inference_controller(model, context) - - engine = DynamicInferenceEngine( - controller, - context, - enable_cuda_graph=args.cuda_graph_impl == "local", - random_seed=args.seed, - enable_chunked_prefill=not args.disable_chunked_prefill, - ) + engine = get_dynamic_inference_engine() asyncio.run(run_text_generation_server(engine, args.inference_coordinator_port, args.port)) diff --git a/tools/run_inference_performance_test.py b/tools/run_inference_performance_test.py index 32d61444530..430bb7ebb9a 100644 --- a/tools/run_inference_performance_test.py +++ b/tools/run_inference_performance_test.py @@ -10,33 +10,31 @@ from gpt_builders import gpt_builder from mamba_builders import mamba_builder -from megatron.core.inference.contexts import DynamicInferenceContext +from 
megatron.core.inference.contexts import StaticInferenceContext from megatron.core.inference.engines import DynamicInferenceEngine, StaticInferenceEngine from megatron.core.inference.engines.abstract_engine import AbstractEngine -from megatron.core.inference.inference_request import InferenceRequest +from megatron.core.inference.inference_request import ( + DynamicInferenceRequestRecord, + InferenceRequest, +) from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, ) -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.text_generation_controllers.text_generation_controller import ( TextGenerationController, ) from megatron.core.transformer.module import MegatronModule -from megatron.core.utils import get_mamba_inference_state_config_from_model +from megatron.inference.utils import add_inference_args, get_dynamic_inference_engine from model_provider import model_provider sys.path.append( os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)) ) -import asyncio from functools import partial -from typing import List, Union +from typing import List -from examples.inference.gpt.utils import add_common_inference_args from megatron.core import mpu from megatron.training import get_args, get_model, get_tokenizer from megatron.training.checkpointing import load_checkpoint @@ -47,7 +45,7 @@ def add_inference_benchmarking_args(parser): """Inference benchmarking arguments.""" - parser = add_common_inference_args(parser) + parser = add_inference_args(parser) group = parser.add_argument_group(title='inference_benchmarking') @@ -60,7 +58,6 @@ def add_inference_benchmarking_args(parser): group.add_argument( "--benchmark-profile", action="store_true", default=False, help="If set, profile" ) - group.add_argument('--stream', 
action="store_true", default=False, help="If set, stream tokens") return parser @@ -74,24 +71,13 @@ def get_inference_engine(args: argparse.Namespace, model: MegatronModule) -> Abs Returns: AbstractBackend: The chosen backend """ - tokenizer = get_tokenizer() - - inference_wrapper_config = InferenceWrapperConfig( - hidden_size=args.hidden_size, - inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold, - fp32_residual_connection=args.fp32_residual_connection, - params_dtype=args.params_dtype, - padded_vocab_size=args.padded_vocab_size, - inference_max_requests=args.inference_max_batch_size, - inference_max_seq_length=args.inference_max_seq_length, - nccl_all_reduce_for_prefill=args.nccl_all_reduce_for_prefill, - moe_pad_experts_for_cuda_graph_inference=args.moe_pad_experts_for_cuda_graph_inference, - ) - - mamba_inference_state_config = get_mamba_inference_state_config_from_model(model) if args.engine_type == "static": - inference_wrapped_model = GPTInferenceWrapper(model, inference_wrapper_config) + tokenizer = get_tokenizer() + context = StaticInferenceContext( + args.inference_max_requests, args.inference_max_sequence_length + ) + inference_wrapped_model = GPTInferenceWrapper(model, context) inference_wrapped_model.model_is_pipeline_parallel = not ( mpu.is_pipeline_first_stage() and mpu.is_pipeline_last_stage() ) @@ -100,98 +86,7 @@ def get_inference_engine(args: argparse.Namespace, model: MegatronModule) -> Abs ) return StaticInferenceEngine(text_generation_controller=text_generation_controller) elif args.engine_type == "dynamic": - context = DynamicInferenceContext( - params_dtype=args.params_dtype, - num_layers=args.num_layers, - kv_channels=args.kv_channels, - num_attention_heads=( - args.num_query_groups if args.group_query_attention else args.num_attention_heads - ), - max_sequence_length=args.inference_max_seq_length, - num_cuda_graphs=( - args.inference_dynamic_batching_num_cuda_graphs - if args.cuda_graph_impl == "local" - 
else None - ), - buffer_size_gb=args.inference_dynamic_batching_buffer_size_gb, - buffer_guaranteed_fraction=args.inference_dynamic_batching_buffer_guaranteed_fraction, - buffer_overflow_factor=args.inference_dynamic_batching_buffer_overflow_factor, - max_requests_override=args.inference_dynamic_batching_max_requests_override, - max_tokens_override=args.inference_dynamic_batching_max_tokens_override, - block_size_tokens=args.inference_dynamic_batching_block_size, - tensor_model_parallel_size=args.tensor_model_parallel_size, - pipeline_model_parallel_size=args.pipeline_model_parallel_size, - materialize_only_last_token_logits=not args.return_log_probs, - mamba_inference_state_config=mamba_inference_state_config, - cache_mla_latent=args.multi_latent_attention and args.cache_mla_latents, - kv_lora_rank=args.kv_lora_rank if args.multi_latent_attention else None, - qk_pos_emb_head_dim=args.qk_pos_emb_head_dim, - use_cuda_graphs_for_non_decode_steps=not args.decode_only_cuda_graphs, - use_flashinfer_fused_rope=args.use_flashinfer_fused_rope, - unified_memory_level=args.inference_dynamic_batching_unified_memory_level, - ) - inference_wrapped_model = GPTInferenceWrapper( - model, inference_wrapper_config, inference_context=context - ) - inference_wrapped_model.model_is_pipeline_parallel = not ( - mpu.is_pipeline_first_stage() and mpu.is_pipeline_last_stage() - ) - text_generation_controller = TextGenerationController( - inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer - ) - return DynamicInferenceEngine( - text_generation_controller, - context, - termination_id=-1, - enable_cuda_graph=args.cuda_graph_impl == "local", - random_seed=args.seed, - ) - - -async def generate( - inference_engine: Union[StaticInferenceEngine, DynamicInferenceEngine], - sampling_params: SamplingParams, - prompts: List[str], - inference_requests: List[InferenceRequest] = None, -) -> List[InferenceRequest]: - async def collect_stream(prompt, request_id, stream_generator): - async 
for output in stream_generator: - pass - - if inference_requests is None: - assert prompts is not None - inference_requests = [None for _ in range(len(prompts))] - elif prompts is None: - assert inference_requests is not None - tokenizer = get_tokenizer() - prompts = [tokenizer.detokenize(request.prompt_tokens) for request in inference_requests] - - request_ids: List[int] = [ - inference_engine.add_request( - prompt=prompt, - inference_request=inference_request, - inference_parameters=sampling_params, - streaming=True, - ) - for prompt, inference_request in zip(prompts, inference_requests) - ] - stream_generators = [ - inference_engine.get_stream_generator(request_id) for request_id in request_ids - ] - - tasks = [ - asyncio.create_task(collect_stream(prompt, request_id, stream_generator)) - for (prompt, request_id, stream_generator) in zip(prompts, request_ids, stream_generators) - ] - - await inference_engine.run_engine_async() - await asyncio.gather(*tasks) - - results: List[InferenceRequest] = [ - inference_engine.scheduler.completed_request_pool[request_id] for request_id in request_ids - ] - - return results + return get_dynamic_inference_engine(model=model) def get_random_prompt_tokens(tokenizer, num_input_tokens) -> List[int]: @@ -232,14 +127,12 @@ def generate_dynamic( request_id = REQUEST_ID REQUEST_ID += 1 prompt_tokens = request.prompt_tokens - inference_engine.add_request( - request_id, prompt_tokens, request.inference_parameters, - ) + inference_engine.add_request(request_id, prompt_tokens, request.inference_parameters) start_time = time.perf_counter() all_finished_requests = [] while inference_engine.has_unfinished_requests(): - result = inference_engine.step(verbose=False) + result = inference_engine.step() finished_requests = result["finished_requests"] for request in finished_requests: req_id = request.request_id @@ -257,8 +150,6 @@ def generate_dynamic( def main(): """Main program.""" - # Note: The default args passed here can be overwritten by 
using appropriate params (check arguments.py file) - # Micro batch size is not needed to be set by user. (It is calculated based on inference-batch-times-seqlen-threshold argument) initialize_megatron( extra_args_provider=add_inference_benchmarking_args, args_defaults={ @@ -298,13 +189,14 @@ def main(): return_log_probs=args.return_log_probs, top_n_logprobs=args.top_n_logprobs, num_tokens_to_generate=args.num_tokens_to_generate, + termination_id=-1, ) sampling_params.add_attributes({"no_early_termination": True}) requests = [] if args.num_input_tokens is not None: assert args.prompts is None - batch_size = args.inference_max_batch_size + batch_size = args.inference_max_requests for i in range(batch_size): prompt_tokens = get_random_prompt_tokens(tokenizer, args.num_input_tokens) requests.append( @@ -327,33 +219,27 @@ def main(): ) ) - if args.cuda_graph_impl == "local": - print(f"Running warmup for CUDA graphs...") - warmup_sampling_params = SamplingParams(num_tokens_to_generate=10) - warmup_sampling_params.add_attributes({"no_early_termination": True}) + # TODO(ksanthanam): Use a command line argument for warmup iterations + for i in range(3): + print(f"Running warmup iteration {i+1}...") + warmup_sampling_params = SamplingParams(num_tokens_to_generate=10, termination_id=-1) inference_engine.generate(prompts=["warmup"], sampling_params=warmup_sampling_params) if args.benchmark_profile: torch.cuda.cudart().cudaProfilerStart() start_time = time.perf_counter() - if args.stream: - if args.engine_type == "dynamic": - raise NotImplementedError("Streaming not supported with DynamicInferenceEngine") - results: List[InferenceRequest] = asyncio.run( - generate( - inference_engine, sampling_params, prompts=args.prompts, inference_requests=requests - ) + if args.engine_type == "static": + results: List[InferenceRequest] = inference_engine.generate( + prompts=args.prompts, inference_requests=requests, sampling_params=sampling_params ) else: - if args.engine_type == "static": - 
results: List[InferenceRequest] = inference_engine.generate( - prompts=args.prompts, inference_requests=requests, sampling_params=sampling_params - ) - elif args.engine_type == "dynamic": - results: List[InferenceRequest] = generate_dynamic( - args, requests, inference_engine, - ) + prompts = [request.prompt_tokens for request in requests] + records: List[DynamicInferenceRequestRecord] = inference_engine.generate( + prompts=prompts, sampling_params=sampling_params + ) + results: List[InferenceRequest] = [record.merge() for record in records] + end_time = time.perf_counter() latency = end_time - start_time @@ -378,6 +264,10 @@ def main(): result_dict['generated_output'] = tokenizer.detokenize(result.generated_tokens) print(result_dict) + total_output_tokens = args.num_tokens_to_generate * args.inference_max_requests + throughput = total_output_tokens / latency + print(f"Throughput: {throughput} output tokens / second") + if __name__ == "__main__": main() diff --git a/tools/run_text_generation_server.py b/tools/run_text_generation_server.py index 350173dc16f..89c1cfa5b86 100644 --- a/tools/run_text_generation_server.py +++ b/tools/run_text_generation_server.py @@ -22,9 +22,6 @@ from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, ) -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.text_generation_controllers.text_generation_controller import ( TextGenerationController, @@ -63,27 +60,15 @@ def get_inference_engine(args: Namespace, model: MegatronModule) -> AbstractEngi tokenizer = get_tokenizer() - inference_wrapper_config = InferenceWrapperConfig( - hidden_size=args.hidden_size, - inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold, - fp32_residual_connection=args.fp32_residual_connection, - 
params_dtype=args.params_dtype, - padded_vocab_size=args.padded_vocab_size, - inference_max_seq_length=args.inference_max_seq_length, - inference_max_requests=args.inference_max_batch_size, - nccl_all_reduce_for_prefill=args.nccl_all_reduce_for_prefill, - moe_pad_experts_for_cuda_graph_inference = args.moe_pad_experts_for_cuda_graph_inference - ) - inference_context = StaticInferenceContext.from_config(inference_wrapper_config) + inference_context = StaticInferenceContext(args.inference_max_requests, args.inference_max_sequence_length) inference_wrapped_model = GPTInferenceWrapper( - model, inference_wrapper_config, inference_context + model, inference_context ) text_generation_controller = TextGenerationController( inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer ) return StaticInferenceEngine( text_generation_controller=text_generation_controller, - max_batch_size=args.inference_max_batch_size, ) @@ -166,14 +151,6 @@ def main(model_type: str = "gpt"): model = model[0] model.eval() - if args.max_batch_size is not None: - assert args.inference_max_batch_size is not None - args.inference_max_batch_size = max(args.inference_max_batch_size, args.max_batch_size) - warnings.warn( - "`--max-batch-size` has been deprecated in favor of `--inference-max-requests`, " - f"setting maximum batch size to {args.inference_max_batch_size}" - ) - inference_engine = get_inference_engine(args, model) if args.cuda_graph_impl == "local": diff --git a/train_rl.py b/train_rl.py index cfc010b3c04..4b5cec5fcc8 100644 --- a/train_rl.py +++ b/train_rl.py @@ -370,6 +370,8 @@ def __getitem__(self, idx): if __name__ == "__main__": + from megatron.inference.utils import add_inference_args + # Temporary for transition to core datasets train_valid_test_datasets_provider.is_distributed = True @@ -401,4 +403,5 @@ def _model_builder( ModelType.encoder_or_decoder, forward_step, args_defaults={}, + extra_args_provider=add_inference_args, ) From 
0fe323267fdbb698cd8c435390e5d1fc5b9383ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 31 Jan 2026 13:31:23 +0000 Subject: [PATCH 016/231] Revert "Miscellaneous inference cleanup (#2955)" This reverts commit ffbc43fa352ec29ccb02436e0249de3bb979e2f3. --- .../inference/gpt/gpt_dynamic_inference.py | 339 +++++++++++++---- .../gpt_dynamic_inference_with_coordinator.py | 117 ++++-- .../inference/gpt/gpt_static_inference.py | 71 +++- examples/inference/gpt/utils.py | 232 ++++++++--- examples/rl/README.md | 2 +- .../rl/model_configs/llama3p1_8b_instruct.sh | 3 +- examples/rl/model_configs/nemotron5_56b.sh | 2 +- examples/rl/model_configs/nemotron5_8b.sh | 2 +- .../rl/model_configs/nemotron5p5_12b_H.sh | 2 +- examples/rl/model_configs/nemotron6_3b_moe.sh | 2 +- .../rl/model_configs/qwen3_30b_a3b_moe.sh | 2 +- examples/rl/model_configs/qwen3_32b.sh | 2 +- examples/rl/model_configs/qwen3_4b.sh | 2 +- examples/rl/model_configs/qwen3_8b.sh | 2 +- examples/rl/model_configs/qwen_2p5_32b.sh | 2 +- examples/rl/model_configs/qwen_2p5_3b.sh | 2 +- .../rl/model_configs/qwen_2p5_distill_7b.sh | 2 +- examples/rl/model_configs/qwen_2p5_math_7b.sh | 2 +- megatron/core/inference/config.py | 186 --------- .../attention_context/mamba_metadata.py | 26 +- .../core/inference/contexts/base_context.py | 8 +- .../inference/contexts/dynamic_context.py | 360 ++++++++++++------ .../core/inference/contexts/static_context.py | 14 +- .../core/inference/engines/dynamic_engine.py | 101 +++-- .../core/inference/engines/static_engine.py | 72 ++-- .../abstract_model_inference_wrapper.py | 209 ++++++++-- .../gpt/gpt_inference_wrapper.py | 11 +- .../inference_wrapper_config.py | 66 ++++ .../t5/t5_inference_wrapper.py | 7 +- .../simple_text_generation_controller.py | 5 + .../text_generation_controller.py | 90 ++--- megatron/core/models/gpt/gpt_model.py | 4 +- megatron/core/models/mamba/mamba_model.py | 4 +- megatron/core/ssm/mamba_layer.py | 5 +- 
.../core/transformer/transformer_config.py | 9 - megatron/core/utils.py | 70 ++-- megatron/inference/__init__.py | 1 - megatron/inference/utils.py | 320 ---------------- megatron/rl/inference/megatron.py | 167 +++++++- megatron/training/arguments.py | 18 +- .../model_config.yaml | 1 - .../contexts/test_dynamic_context.py | 122 +++--- .../inference/engines/test_dynamic_engine.py | 71 +++- .../inference/engines/test_static_engine.py | 20 +- .../gpt/test_gpt_inference_wrapper.py | 61 ++- .../t5/test_t5_inference_wrapper.py | 19 +- .../test_model_inference_wrapper_config.py | 21 + .../inference/test_inference_config.py | 17 - .../inference/test_wandb_logging.py | 73 ++-- ...oder_decoder_text_generation_controller.py | 19 +- ...test_simple_text_generation_controller.py} | 43 ++- .../test_vlm_text_generation_controller.py | 19 +- tests/unit_tests/models/test_gpt_model.py | 21 +- .../models/test_gpt_model_batch_invariant.py | 80 ++-- tests/unit_tests/models/test_mamba_model.py | 29 +- .../unit_tests/models/test_mamba_moe_model.py | 2 - tools/run_dynamic_text_generation_server.py | 42 +- tools/run_inference_performance_test.py | 180 +++++++-- tools/run_text_generation_server.py | 27 +- train_rl.py | 3 - 60 files changed, 2065 insertions(+), 1346 deletions(-) delete mode 100644 megatron/core/inference/config.py create mode 100644 megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py create mode 100644 megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py delete mode 100644 megatron/inference/__init__.py delete mode 100644 megatron/inference/utils.py create mode 100644 tests/unit_tests/inference/model_inference_wrappers/test_model_inference_wrapper_config.py delete mode 100644 tests/unit_tests/inference/test_inference_config.py rename tests/unit_tests/inference/text_generation_controllers/{test_text_generation_controller.py => test_simple_text_generation_controller.py} (96%) diff --git 
a/examples/inference/gpt/gpt_dynamic_inference.py b/examples/inference/gpt/gpt_dynamic_inference.py index 7fcac70c11a..88b744b3ac0 100644 --- a/examples/inference/gpt/gpt_dynamic_inference.py +++ b/examples/inference/gpt/gpt_dynamic_inference.py @@ -1,31 +1,40 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# pylint: disable=bad-builtin - import hashlib import io import json +import math import os +import pickle import sys import warnings -from collections import defaultdict -from typing import Dict, List, Optional - import torch +from argparse import ArgumentParser +from collections import defaultdict +from functools import partial from tqdm import tqdm +from typing import Dict, List, Tuple, Optional sys.path.append( os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)) ) +import megatron from examples.inference.gpt.utils import ( Request, + add_common_inference_args, build_dynamic_engine_setup_prefix, build_requests, get_curr_time, get_global_peak_memory_stats_bytes, ) -from megatron.core.inference.contexts.dynamic_context import DynamicInferenceContext +from megatron.core.inference.contexts.dynamic_context import ( + ContextOverflowError, + DynamicInferenceContext, +) +from megatron.core.inference.contexts.attention_context.mamba_metadata import ( + MambaInferenceStateConfig, +) from megatron.core.inference.engines import DynamicInferenceEngine, EngineSuspendedError from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, @@ -35,26 +44,194 @@ TextGenerationController, ) from megatron.core.tokenizers.text.utils.build_tokenizer import build_tokenizer -from megatron.inference.utils import ( - add_inference_args, - get_inference_config_from_model_and_args, - get_model_for_inference, -) +from megatron.core.transformer.module import MegatronModule +from megatron.core.utils import get_mamba_inference_state_config_from_model sys.path.append( 
os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)) ) -import logging +from megatron.training import get_args, get_model as _get_model, get_tokenizer, initialize_megatron +from megatron.training.checkpointing import load_checkpoint +from model_provider import model_provider +from gpt_builders import gpt_builder +from mamba_builders import mamba_builder -import megatron from megatron.core.utils import configure_nvtx_profiling -from megatron.training import get_args, get_tokenizer, initialize_megatron +import logging torch.serialization.add_safe_globals([io.BytesIO]) torch.serialization.add_safe_globals([megatron.core.rerun_state_machine.RerunState]) torch.serialization.add_safe_globals([megatron.core.rerun_state_machine.RerunDiagnostic]) +def add_dynamic_inference_args(parser: ArgumentParser) -> ArgumentParser: + """Dynamic inference arguments.""" + + add_common_inference_args(parser) + + group = parser.add_argument_group(title='Dynamic inference') + group.add_argument( + "--inference-ckpt-non-strict", + action="store_true", + help="Load checkpoint with `strict=False`.", + ) + group.add_argument( + "--termination-id", type=int, default=None, + help="Termination ID that overrides `tokenizer.eod`.", + ) + group.add_argument( + "--suspend-resume-interval", type=int, default=None, + help="Suspend and resume the dynamic engine every " + "`suspend_resume_interval` steps. This is used to tet the suspend/resume " + "system.", + ) + group.add_argument( + "--inference-repeat-n", type=int, default=1, + help="Repeat inference iterations N times for benchmarking." + ) + group.add_argument( + "--throughput-check-only", + action='store_true', + default=False, + help="If true, only run throughput check without verifying outputs." 
+ ) + + return parser + + +def get_model() -> MegatronModule: + """Initialize model and load checkpoint.""" + + args = get_args() + + if args.model_provider == "gpt": + model_builder = gpt_builder + elif args.model_provider == "mamba": + model_builder = mamba_builder + else: + raise ValueError(f"Invalid model provider {args.model_provider}") + + # Build model. + model = _get_model( + partial(model_provider, model_builder), + wrap_with_ddp=False + ) + + # Load checkpoint. + assert args.load is not None + args.exit_on_missing_checkpoint = True + load_checkpoint( + ddp_model=model, + optimizer=None, + opt_param_scheduler=None, + strict=not args.inference_ckpt_non_strict, + ) + + # No virtual PP. + assert len(model) == 1, "Above condition should have caught this" + model = model[0] + + # Eval mode. + model.eval() + + return model + + +def get_inference_context( + requests: List[Request], + sampling_params: Optional[SamplingParams] = None, + calculate_max_sequence_length_from_requests: bool = True, + mamba_inference_state_config: Optional[MambaInferenceStateConfig] = None, +): + """The inference context manages the KV cache and other inference state.""" + + args = get_args() + + # Max sequence length. + if calculate_max_sequence_length_from_requests: + max_gen_length = sampling_params.num_tokens_to_generate + max_context_length = max(len(r.prompt_tokens) for r in requests) + max_sequence_length = max_context_length + max_gen_length + else: + max_sequence_length = args.inference_max_seq_length + + metrics_writer = None + if args.inference_logging_step_interval > 0 and args.inference_wandb_logging: + metrics_writer = get_wandb_writer() + + # Inference context. 
+ context = DynamicInferenceContext( + params_dtype=args.params_dtype, + num_layers=args.num_layers // args.pipeline_model_parallel_size, + kv_channels=args.kv_channels, + num_attention_heads=( + args.num_query_groups if args.group_query_attention else args.num_attention_heads + ), + max_sequence_length=max_sequence_length, + num_cuda_graphs=( + args.inference_dynamic_batching_num_cuda_graphs + if args.cuda_graph_impl == "local" + else None + ), + block_size_tokens=args.inference_dynamic_batching_block_size, + buffer_size_gb=args.inference_dynamic_batching_buffer_size_gb, + paused_buffer_size_gb=args.inference_dynamic_batching_paused_buffer_size_gb, + max_requests=args.inference_dynamic_batching_max_requests, + max_tokens=args.inference_dynamic_batching_max_tokens, + tensor_model_parallel_size=args.tensor_model_parallel_size, + pipeline_model_parallel_size=args.pipeline_model_parallel_size, + materialize_only_last_token_logits=not args.return_log_probs, + mamba_inference_state_config=mamba_inference_state_config, + cache_mla_latent=args.multi_latent_attention and args.cache_mla_latents, + kv_lora_rank=args.kv_lora_rank if args.multi_latent_attention else None, + qk_pos_emb_head_dim=args.qk_pos_emb_head_dim, + use_cuda_graphs_for_non_decode_steps=not args.decode_only_cuda_graphs, + use_flashinfer_fused_rope=args.use_flashinfer_fused_rope, + unified_memory_level=args.inference_dynamic_batching_unified_memory_level, + cuda_graph_max_tokens=args.inference_dynamic_batching_cuda_graph_max_tokens, + cuda_graph_mixed_prefill_count=args.inference_dynamic_batching_cuda_graph_mixed_prefill_count, + metrics_writer=metrics_writer, + offload_kv_cache=args.rl_offload_kv_cache_during_training + ) + + return context + + +def get_inference_controller( + model: MegatronModule, context: DynamicInferenceContext +) -> TextGenerationController: + """Buid text generation controller, which manages the model inference context. + + Args: + model (MegatronModule): Megatron GPT model. 
+ context (DynamicInferenceContext): Context for managing KV cache blocks. + + Return: + (TextGenerationController) Inference text generation controller. + """ + + args = get_args() + if args.legacy_tokenizer: + tokenizer = get_tokenizer() + else: + tokenizer = build_tokenizer(args) + + # Wrap model in inference wrapper. + model = GPTInferenceWrapper(model, args, context) + + # Note: the following is taken from AbstractModelInferenceWrapper.prep_model_for_inference(). + from megatron.core import parallel_state + + model.model_is_pipeline_parallel = not ( + parallel_state.is_pipeline_first_stage() and parallel_state.is_pipeline_last_stage() + ) + + # Text generation controller. + controller = TextGenerationController(model, tokenizer) + + return controller + + def run_inference( requests: List[Request], engine: DynamicInferenceEngine, @@ -107,7 +284,11 @@ def _add_request(): """ nonlocal num_requests_added _request = requests[num_requests_added] - engine.add_request(num_requests_added, _request.prompt_text, _request.sampling_params) + engine.add_request( + num_requests_added, + _request.prompt_text, + _request.sampling_params, + ) _request.time_start = get_curr_time() _request.state = "started" num_requests_added += 1 @@ -124,9 +305,10 @@ def _add_request(): _add_request() else: # Add deterministic number of requests (generally used for debugging). - for i in range( - min(args.incoming_requests_per_step, num_requests_total - num_requests_added) - ): + for i in range(min( + args.incoming_requests_per_step, + num_requests_total - num_requests_added, + )): _add_request() add_times.append(get_curr_time() - add_start) @@ -136,12 +318,11 @@ def _add_request(): result = engine.step_modern() except EngineSuspendedError as e: result = e - pass # ignore error in order to call 'engine.resume()' below. + pass # ignore error in order to call 'engine.resume()' below. 
attempted_step_count += 1 - # After step, we lost track of last iteration's is_decode_only, - # so we need to get it from the engine - is_decode_only = engine.is_decode_only + # After step, we lost track of last iteration's is_decode_only, so we need to get it from the engine + is_decode_only = engine.is_decode_only # Test suspending and resuming engine. if args.suspend_resume_interval is not None: @@ -154,9 +335,9 @@ def _add_request(): # Resume, 0+ attempted steps later. if ( attempted_step_count > 0 - and (attempted_step_count - args.suspend_resume_interval // 2) - % args.suspend_resume_interval - == 0 + and + (attempted_step_count - args.suspend_resume_interval // 2) + % args.suspend_resume_interval == 0 ): print("**** step %d/%d ... resume." % (engine.step_count, attempted_step_count)) engine.resume() @@ -168,9 +349,7 @@ def _add_request(): # Record cuda_graph_request_count. cuda_graph_request_count = result["cuda_graph_request_count"] if args.cuda_graph_impl == "local" and cuda_graph_request_count is not None: - cuda_graph_request_count_map[cuda_graph_request_count] = ( - cuda_graph_request_count_map.get(cuda_graph_request_count, 0) + 1 - ) + cuda_graph_request_count_map[cuda_graph_request_count] = cuda_graph_request_count_map.get(cuda_graph_request_count, 0) + 1 # Update requests. active_request_ids = result["active_request_ids"] @@ -229,29 +408,29 @@ def _add_request(): engine.resume() return { - "step_times": step_times, - "add_times": add_times, - "output_times": output_times, - "total_output_tokens": total_output_tokens, - "cuda_graph_request_count_map": cuda_graph_request_count_map, + "step_times" : step_times, + "add_times" : add_times, + "output_times" : output_times, + "total_output_tokens" : total_output_tokens, + "cuda_graph_request_count_map" : cuda_graph_request_count_map, } @torch.inference_mode() def main(): - """Run dynamic inference.""" + # Initialize Megatron. 
initialize_megatron( - extra_args_provider=add_inference_args, + extra_args_provider=add_dynamic_inference_args, args_defaults={'no_load_rng': True, 'no_load_optim': True}, ) # Start Nsight profiler. if os.environ.get("NSIGHT_PREFIX"): torch.cuda.cudart().cudaProfilerStart() - - level_str = os.getenv("LOG_LEVEL", "INFO").upper() - level = getattr(logging, level_str, logging.INFO) + + level_str = os.getenv("LOG_LEVEL", "INFO").upper() + level = getattr(logging, level_str, logging.INFO) logging.basicConfig(level=level, force=True) configure_nvtx_profiling(True) @@ -277,36 +456,42 @@ def main(): termination_id=args.termination_id if args.termination_id is not None else tokenizer.eod, top_n_logprobs=args.top_n_logprobs, stop_words=args.stop_words, - ) + ) + + model = get_model() - model = get_model_for_inference() + mamba_inference_state_config = get_mamba_inference_state_config_from_model(model) # Requests, context, controller. requests = build_requests(args, tokenizer, sampling_params) - inference_config = get_inference_config_from_model_and_args(model, args) - - # Calculate max_sequence_length from requests - max_gen_length = sampling_params.num_tokens_to_generate - max_context_length = max(len(r.prompt_tokens) for r in requests) - inference_config.max_sequence_length = max_context_length + max_gen_length - context = DynamicInferenceContext(model.config, inference_config) - wrapped_model = GPTInferenceWrapper(model, context) - controller = TextGenerationController(wrapped_model, tokenizer) + context = get_inference_context( + requests, + sampling_params, + mamba_inference_state_config=mamba_inference_state_config, + ) + controller = get_inference_controller(model, context) # Validate all context_length's <= max_tokens. 
- if not args.enable_chunked_prefill: + if args.disable_chunked_prefill: invalid_prompt_length_map = {} for request_idx, request in enumerate(requests): if len(request.prompt_tokens) > context.max_tokens: invalid_prompt_length_map[request_idx] = len(request.prompt_tokens) - assert ( - not invalid_prompt_length_map - ), "request idxs with prompts longer than context.max_tokens: " ", ".join( - f"{k}({v})" for k, v in invalid_prompt_length_map.items() + assert not invalid_prompt_length_map, ( + "request idxs with prompts longer than context.max_tokens: " + ", ".join(f"{k}({v})" for k, v in invalid_prompt_length_map.items()) ) # Inference engine. - engine = DynamicInferenceEngine(controller, context) + engine = DynamicInferenceEngine( + controller, + context, + enable_cuda_graph=args.cuda_graph_impl == "local", + random_seed=args.seed, + track_paused_request_events=args.inference_dynamic_batching_track_paused_request_events, + enable_chunked_prefill=not args.disable_chunked_prefill, + inference_logging_step_interval=args.inference_logging_step_interval, + ) setup_prefix = build_dynamic_engine_setup_prefix(args, model, context, requests) print("~~~") @@ -337,13 +522,14 @@ def main(): # Validate all requests finished. for request in requests: - assert request.state == "finished", f"request.state == '{request.state}' != 'finished'." + assert request.state == "finished", ( + f"request.state == '{request.state}' != 'finished'." + ) peak_mem_stats = get_global_peak_memory_stats_bytes() # Print unique prompts + outputs. 
if torch.distributed.get_rank() == 0: - def escape_str(s): return s.replace("\n", "\\n") @@ -361,10 +547,7 @@ def escape_str(s): # ---- Prompt summary line ---- prompt_len = len(requests[request_idxs[0]].prompt_tokens) escaped_prompt_text = escape_str(prompt_text) - print( - f"\n{unique_idx+1}/{len(unique_prompt_map)}" - f"[n {len(request_idxs)}, l {prompt_len}] {escaped_prompt_text}" - ) + print(f"\n{unique_idx+1}/{len(unique_prompt_map)} [n {len(request_idxs)}, l {prompt_len}] {escaped_prompt_text}") # ---- Group all outputs for this prompt ---- output_map = defaultdict(list) @@ -384,17 +567,16 @@ def escape_str(s): # Use hash of prompt + generated text in case engine was # suspended and resumed, which misaligns boundary between # prompt and generated tokens. - o_hash = hashlib.sha256((prompt_text + output_text).encode()).hexdigest()[:6] + o_hash = hashlib.sha256( + (prompt_text + output_text).encode() + ).hexdigest()[:6] o_len = len(requests[output_request_idxs[0]].output_tokens) escaped_output_text = escape_str(output_text) else: o_hash = "--" o_len = 0 escaped_output_text = "--" - print( - f" >>>> [n {len(output_request_idxs)}, {o_len} tokens, hash {o_hash}" - f"{', ' if evicted else ''}] {escaped_output_text}" - ) + print(f" >>>> [n {len(output_request_idxs)}, {o_len} tokens, hash {o_hash}{', ' if evicted else ''}] {escaped_output_text}") text_hashes.append(o_hash) # Write results to JSON. Primarily used for functional testing. 
@@ -410,16 +592,14 @@ def escape_str(s): "generated_text": req.output_text, "generated_tokens": req.output_tokens, "latency": req.time_end - req.time_start, - "cuda_graph_request_count_map": result["cuda_graph_request_count_map"], - "step_count": engine.step_count, - "top_n_logprobs": getattr(req, 'generated_top_n_logprobs', None), - "prompt_top_n_logprobs": getattr(req, 'prompt_top_n_logprobs', None), + "cuda_graph_request_count_map" : result["cuda_graph_request_count_map"], + "step_count" : engine.step_count, + "top_n_logprobs" : getattr(req, 'generated_top_n_logprobs', None), + "prompt_top_n_logprobs" : getattr(req, 'prompt_top_n_logprobs', None), } if req.sampling_params.return_log_probs: result_dict["prompt_logprobs"] = getattr(req, 'prompt_log_probs', None) - result_dict["generated_logprobs"] = getattr( - req, 'generated_log_probs', None - ) + result_dict["generated_logprobs"] = getattr(req, 'generated_log_probs', None) result_dict["logprobs"] = getattr(req, 'logprobs', None) json_results[req.request_id] = result_dict @@ -451,7 +631,7 @@ def escape_str(s): d_count = len(d_times) p_mean = p_total / p_count - d_mean = d_total / d_count if d_count != 0 else 0.0 + d_mean = d_total / d_count if d_count != 0 else 0. # Commented out for now as the step/add/output times are not calculated correctly. # print( @@ -463,13 +643,18 @@ def escape_str(s): # f"mean [ p {p_mean:.3f}s, d {d_mean:.3f}s ], " # f"count [ p {p_count}, d {d_count} ]." 
# ) - capture_str = f"{engine.capture_stats['time']:.2f} sec" if engine.capture_stats else "--" + capture_str = ( + f"{engine.capture_stats['time']:.2f} sec" + if engine.capture_stats else + "--" + ) print( - f"{setup_prefix} … " f"throughput: {throughput:.3f} tok/s … ", + f"{setup_prefix} … " + f"throughput: {throughput:.3f} tok/s … ", f"total time: {total_time:.3f}s … " f"mem {peak_alloc_gb:.1f}/{peak_resvd_gb:.1f} GB … " f"steps: {engine.step_count:d} … " - f"capture {capture_str}", + f"capture {capture_str}" ) print("~~~") diff --git a/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py b/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py index ab84ee5bf5c..cbb7a1aa745 100644 --- a/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py +++ b/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py @@ -2,33 +2,43 @@ import asyncio import json -import logging import os import time -import warnings +import torch +import torch.distributed as dist from collections import defaultdict +from tqdm import tqdm from typing import List +import warnings +import logging -import torch -import torch.distributed as dist +from examples.inference.gpt.gpt_dynamic_inference import ( + add_dynamic_inference_args, + get_inference_context, + get_inference_controller, + get_model, +) +from examples.inference.gpt.utils import ( + Request, + build_dynamic_engine_setup_prefix, + build_requests, + add_common_inference_args +) -from examples.inference.gpt.utils import Request, build_dynamic_engine_setup_prefix, build_requests +from megatron.core import parallel_state from megatron.core.inference.engines import DynamicInferenceEngine from megatron.core.inference.inference_client import InferenceClient from megatron.core.inference.inference_request import DynamicInferenceRequestRecord from megatron.core.inference.sampling_params import SamplingParams -from megatron.inference.utils import ( - add_inference_args, - get_dynamic_inference_engine, - 
get_model_for_inference, -) +from megatron.core.utils import get_mamba_inference_state_config_from_model + from megatron.training import get_args, get_tokenizer, initialize_megatron +from megatron.training.arguments import parse_args # pylint: disable=line-too-long logging.basicConfig(level=logging.INFO, force=True) - async def main( engine: DynamicInferenceEngine, requests: List[Request], @@ -41,11 +51,12 @@ async def main( "Sampling parameters are specified per request.", DeprecationWarning, ) - + # once you call engine.start_listening_to_data_parallel_coordinator, # the engine will start accepting requests from the data parallel coordinator. # and processing them in an asyncio coroutine. # leaving inference_coordinator_port as None will find a free port automatically. + dp_addr = await engine.start_listening_to_data_parallel_coordinator( inference_coordinator_port=port, launch_inference_coordinator=True, @@ -58,11 +69,14 @@ async def main( # Since the client doesn't directly call engine.async_step here, we test # the suspend-resume system ~4 times. suspend_resume_interval = max(1, len(requests) // 4) - suspend_idxs = set( - range(suspend_resume_interval, len(requests) + 1, suspend_resume_interval) - ) + suspend_idxs = set(range( + suspend_resume_interval, + len(requests) + 1, + suspend_resume_interval, + )) resume_idxs = set( - min(len(requests), i + suspend_resume_interval // 2) for i in suspend_idxs + min(len(requests), i + suspend_resume_interval // 2) + for i in suspend_idxs ) else: suspend_idxs = set() @@ -84,10 +98,7 @@ async def main( current_time = time.time_ns() / 10**9 if args.incoming_requests_per_step is None: # Only add requests that have arrived at the current time. 
- while ( - num_requests_added < num_requests_total - and requests[num_requests_added].time_arrival <= current_time - ): + while num_requests_added < num_requests_total and requests[num_requests_added].time_arrival <= current_time: request = requests[num_requests_added] # These add-request calls will queue up the request on a zmq socket and return # instantaneously. They will return an asyncio future which can be awaited for @@ -103,9 +114,10 @@ async def main( else: # Add deterministic number of requests (generally used for debugging). - for i in range( - min(args.incoming_requests_per_step, num_requests_total - num_requests_added) - ): + for i in range(min( + args.incoming_requests_per_step, + num_requests_total - num_requests_added + )): # Change sampling parameters to force different generation lengths. request = requests[num_requests_added] n = request.sampling_params.num_tokens_to_generate @@ -123,7 +135,7 @@ async def main( break # Relinquish control since there are no more requests to add at the moment. This allows the engine to run. await asyncio.sleep(0) - + # While we wait for the requests to complete, the engine runs in the background. results: List[DynamicInferenceRequestRecord] = await asyncio.gather(*futures) @@ -158,19 +170,16 @@ async def main( req = record.merge() unique_prompt_map[req.prompt].append(req) for idx, (prompt_text, reqs) in enumerate(unique_prompt_map.items()): - print( - f"%d/%d. prompt '%s' ... [%d] output '%s'." - % ( - idx, - len(unique_prompt_map), - prompt_text.replace("\n", "\\n"), - len(reqs), - reqs[0].generated_text.replace("\n", "\\n"), - ) - ) + print(f"%d/%d. prompt '%s' ... [%d] output '%s'." % ( + idx, + len(unique_prompt_map), + prompt_text.replace("\n", "\\n"), + len(reqs), + reqs[0].generated_text.replace("\n", "\\n"), + )) # kill the engines and suspend the client - # Right now, we can only call stop when all requests are done. + # Right now, we can only call stop when all requests are done. 
# Todo: Make this explicit in the Client class.... await client.stop_engines() client.stop() @@ -181,11 +190,11 @@ async def main( if __name__ == "__main__": - # enable inference mode in the very beginning as some fp8 optimizations + # enable inference mode in the very beginning as some fp-8 optimizations # check for it. with torch.inference_mode(): initialize_megatron( - extra_args_provider=add_inference_args, + extra_args_provider=add_dynamic_inference_args, args_defaults={'no_load_rng': True, 'no_load_optim': True}, ) @@ -204,16 +213,34 @@ async def main( ), ) - model = get_model_for_inference() - + # Requests, context, conroller. + model = get_model() + mamba_inference_state_config = get_mamba_inference_state_config_from_model(model) requests = ( build_requests(args, tokenizer, sampling_params) if dist.get_rank() == 0 else None ) - engine = get_dynamic_inference_engine(model=model) + context = get_inference_context( + None, + None, + calculate_max_sequence_length_from_requests=False, + mamba_inference_state_config=mamba_inference_state_config, + ) + + controller = get_inference_controller(model, context) + + # Inference engine. + engine = DynamicInferenceEngine( + controller, + context, + enable_cuda_graph=args.cuda_graph_impl == "local", + random_seed=args.seed, + enable_chunked_prefill=not args.disable_chunked_prefill, + inference_logging_step_interval=args.inference_logging_step_interval, + ) if dist.get_rank() == 0: - setup_prefix = build_dynamic_engine_setup_prefix(args, model, engine.context, requests) + setup_prefix = build_dynamic_engine_setup_prefix(args, model, context, requests) print("~~~") print(setup_prefix) print("~~~") @@ -222,7 +249,13 @@ async def main( if os.environ.get("NSIGHT_PREFIX"): torch.cuda.cudart().cudaProfilerStart() - asyncio.run(main(engine, requests, args.inference_coordinator_port)) + asyncio.run( + main( + engine, + requests, + args.inference_coordinator_port, + ) + ) # Stop Nsight profiler. 
if os.environ.get("NSIGHT_PREFIX"): diff --git a/examples/inference/gpt/gpt_static_inference.py b/examples/inference/gpt/gpt_static_inference.py index 298ebfebd86..03a60927ab2 100644 --- a/examples/inference/gpt/gpt_static_inference.py +++ b/examples/inference/gpt/gpt_static_inference.py @@ -1,11 +1,21 @@ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. import os +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) +from model_provider import model_provider +from gpt_builders import gpt_builder +from mamba_builders import mamba_builder +import torch import sys import time +import warnings +from functools import partial from argparse import Namespace import torch +import tqdm from megatron.core.inference.contexts import StaticInferenceContext from megatron.core.inference.engines import StaticInferenceEngine @@ -13,12 +23,17 @@ from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, ) +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.text_generation_controllers.text_generation_controller import ( TextGenerationController, ) from megatron.core.tokenizers.text.utils.build_tokenizer import build_tokenizer from megatron.core.transformer.module import MegatronModule +from pretrain_gpt import model_provider as gpt_model_provider +from pretrain_mamba import model_provider as mamba_model_provider sys.path.append( os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)) @@ -26,18 +41,18 @@ import asyncio import json -from typing import List +from typing import Any, AsyncIterator, List -from examples.inference.gpt.utils import build_requests -from megatron.inference.utils import add_inference_args, get_model_for_inference -from megatron.training import 
get_args, get_tokenizer, print_rank_0 +from examples.inference.gpt.utils import add_common_inference_args, build_requests +from megatron.core import mpu +from megatron.training import get_args, get_model, get_tokenizer, print_rank_0 +from megatron.training.checkpointing import load_checkpoint from megatron.training.initialize import initialize_megatron - def add_static_inference_args(parser): """Static inference arguments.""" - add_inference_args(parser) + add_common_inference_args(parser) group = parser.add_argument_group(title='Static inference') group.add_argument( @@ -68,16 +83,30 @@ def get_inference_engine(args: Namespace, model: MegatronModule) -> StaticInfere tokenizer = get_tokenizer() else: tokenizer = build_tokenizer(args) - inference_context = StaticInferenceContext( - args.inference_max_requests, args.inference_max_seq_length + inference_wrapper_config = InferenceWrapperConfig( + hidden_size=args.hidden_size, + inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold, + fp32_residual_connection=args.fp32_residual_connection, + params_dtype=args.params_dtype, + padded_vocab_size=args.padded_vocab_size, + inference_max_requests=args.inference_max_batch_size, + inference_max_seq_length=args.inference_max_seq_length, + nccl_all_reduce_for_prefill=args.nccl_all_reduce_for_prefill, + fp8=args.fp8, + moe_pad_experts_for_cuda_graph_inference = args.moe_pad_experts_for_cuda_graph_inference + ) + + inference_context = StaticInferenceContext.from_config(inference_wrapper_config) + + inference_wrapped_model = GPTInferenceWrapper( + model, inference_wrapper_config, inference_context ) - inference_wrapped_model = GPTInferenceWrapper(model, inference_context) text_generation_controller = TextGenerationController( inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer ) engine_kwargs = { - "text_generation_controller": text_generation_controller, - "legacy": args.use_legacy_static_engine, + "text_generation_controller" : 
text_generation_controller, + "legacy" : args.use_legacy_static_engine, } if not args.use_legacy_static_engine: engine_kwargs["buffer_size_gb"] = args.inference_dynamic_batching_buffer_size_gb @@ -136,7 +165,22 @@ def main(): args = get_args() - model = get_model_for_inference() + if args.max_batch_size is not None: + warnings.warn( + f"`--max-batch-size` has been deprecated in favor of `--inference-max-requests`." + ) + args.inference_max_batch_size = max(args.max_batch_size, args.inference_max_batch_size) + + # Set up model and load checkpoint + if args.model_provider == "gpt": + model_builder = gpt_builder + elif args.model_provider == "mamba": + model_builder = mamba_builder + else: + raise ValueError(f"Invalid model provider {args.model_provider}") + model = get_model(partial(model_provider, model_builder), wrap_with_ddp=False) + load_checkpoint(model, None, None, strict=False) + model = model[0] inference_engine = get_inference_engine(args, model) @@ -232,7 +276,7 @@ def main(): ) ), len(requests), - args.inference_max_requests, + args.inference_max_batch_size, stats["allocated_bytes.all.peak"] / (1024**3), stats["reserved_bytes.all.peak"] / (1024**3), latency, @@ -249,5 +293,6 @@ def main(): torch.distributed.destroy_process_group() + if __name__ == "__main__": main() diff --git a/examples/inference/gpt/utils.py b/examples/inference/gpt/utils.py index b7a3977605c..a04b856c0a6 100644 --- a/examples/inference/gpt/utils.py +++ b/examples/inference/gpt/utils.py @@ -1,23 +1,158 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
import copy -import itertools import json +import itertools import random import time -from argparse import ArgumentParser, Namespace -from functools import partial -from typing import Any, List, Optional - import torch +from argparse import ArgumentParser, Namespace from tqdm import tqdm +from typing import Any, List, Optional +from megatron.core.inference.inference_request import DynamicInferenceRequest from megatron.core.inference.contexts import DynamicInferenceContext from megatron.core.inference.contexts.dynamic_context import get_mem_size_str -from megatron.core.inference.inference_request import DynamicInferenceRequest -from megatron.core.inference.sampling_params import SamplingParams from megatron.core.transformer.module import MegatronModule -from megatron.training import get_args + +from megatron.core.inference.sampling_params import SamplingParams + + +def add_common_inference_args(parser: ArgumentParser) -> ArgumentParser: + """Common inference arguments.""" + + group = parser.add_argument_group(title='Common inference') + + group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.') + group.add_argument("--top_k", type=int, default=1, help='Top k sampling.') + group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.') + group.add_argument( + "--return-log-probs", + action='store_true', + default=False, + help='Return the log probabilities of the final output tokens', + ) + group.add_argument( + "--prompts", + metavar='N', + type=str, + nargs='+', + help='Input prompts with each prompt within quotes and seperated by space', + ) + group.add_argument( + "--num-tokens-to-prompt", + type=int, + nargs="+", + default=[64, 1024], + help='Number of tokens to use for simulated prompts. 
This should be a ' + 'space-separated pair of integers, and the generated prompt lengths will ' + 'be uniformly sampled within this range.', + ) + group.add_argument( + "--num-tokens-to-generate", + type=int, + default=30, + help='Number of tokens to generate for each prompt', + ) + group.add_argument( + "--num-tokens-from-file", + action='store_true', + default=False, + help='Use per-prompt num_tokens_to_generate from prompt file', + ) + group.add_argument( + "--top-n-logprobs", + type=int, + default=0, + help='Return the top n logprobs for the generated tokens and their corresponding token as a dictionary', + ) + group.add_argument( + "--incoming-requests-per-step", + type=int, default=None, + help="Add a deterministic number of requests per step. This arg is " + "prioritized over `--incoming-requests-per-sec` below (which is non-" + "deterministic). Note that the number of requests added per step is " + "additionally limited by the inference context's `max_requests`, " + "`max_tokens`, and KV buffer size.", + ) + group.add_argument( + "--incoming-requests-per-sec", + type=float, + default=100.0, + help="Simulated number of requests per second. Set to -1 to add all requests together.", + ) + group.add_argument( + "--incoming-requests-duration", + type=float, + default=10.0, + help="Total amount of time to simulate that requests are " + "arriving. Multiply this value with " + "`--incoming-requests-per-sec` to get the approximate " + "total number of requests. Set to -1 to add all requests together.", + ) + group.add_argument( + "--model-provider", + choices=["mamba", "gpt"], + default="gpt", + help="Model provider", + ) + group.add_argument( + "--skip-prompt-log-probs", + action='store_true', + default=False, + help='Skip prompt log probs.', + ) + group.add_argument( + "--stop-words", + metavar='WORD', + type=str, + nargs='+', + default=None, + help='Stop words to terminate generation. Each word should be quoted and ' + 'separated by space. 
Example: --stop-words "\\n\\n" "END" "###"', + ) + group.add_argument( + "--output-path", + type=str, + default=None, + help="Path to save generations as JSON", + ) + group.add_argument( + "--output-every-n-results", + type=int, + default=1, + help="To minimize the output file size of larger runs, only write the " + "results of every `n` requests.", + ) + group.add_argument( + "--prompt-file", + help='Jsonl file containing input prompts, where each item (i.e., line) ' + 'contains the field \'text\' where the value is the prompt. All other ' + 'fields within each item are ignored, and may be customized for each ' + 'application.', + ) + group.add_argument( + "--prompt-file-num-truncate", + type=int, + help='Number of samples to use from the loaded prompt file (see ' + '`--prompt-file` above). The first `--prompt-file-num-truncate` samples ' + 'will be used, in order.', + ) + group.add_argument( + "--use-flashinfer-fused-rope", + action='store_true', + default=False, + help='Use flashinfer fused rope implementation.', + ) + group.add_argument( + "--no-record-throughput", + action='store_false', + dest="record_throughput", + help="Disable throughput recording in --output-file" + + ) + + return parser def get_default_sampling_params(termination_id: int = None): @@ -27,10 +162,9 @@ def get_default_sampling_params(termination_id: int = None): top_p=0.0, return_log_probs=False, num_tokens_to_generate=30, - termination_id=termination_id, + termination_id = termination_id, ) - def get_curr_time() -> float: """Get synchronized time across ranks.""" curr_time = torch.cuda.LongTensor([time.time_ns()]) @@ -54,13 +188,7 @@ class Request: tokenizer (Any): Tokenizer for tokenizing the prompt. 
""" - def __init__( - self, - prompt_text: str, - time_offset: float, - tokenizer: Any, - sampling_params: SamplingParams = None, - ): + def __init__(self, prompt_text: str, time_offset: float, tokenizer: Any, sampling_params: SamplingParams = None): self.prompt_text = prompt_text self.prompt_tokens = tokenizer.tokenize(prompt_text) self.output_text = None @@ -70,11 +198,7 @@ def __init__( self.time_start = None self.time_end = None self.state = "not-started" - self.sampling_params: SamplingParams = ( - sampling_params - if sampling_params is not None - else get_default_sampling_params(tokenizer.eod) - ) + self.sampling_params: SamplingParams = sampling_params if sampling_params is not None else get_default_sampling_params(tokenizer.eod) self.sampling_params = copy.deepcopy(self.sampling_params) def __str__(self) -> str: @@ -101,10 +225,10 @@ def get_time_offsets( # if num_requests is not None: incoming_requests_duration = num_requests / incoming_requests_per_sec - incoming_requests_duration *= 2 # extra margin, to accomodate time sampling + incoming_requests_duration *= 2 # extra margin, to accomodate time sampling random.seed(seed) - + import simpy # Guard against this import in test case # Generate random time offsets. @@ -117,14 +241,14 @@ def arrival(r): env = simpy.Environment() env.process(arrival(incoming_requests_per_sec)) env.run(incoming_requests_duration) - + # Ensure at least a single request. if len(time_offsets) == 0: time_offsets = [0.0] # Ensure first time is 0. time_offsets = [to - time_offsets[0] for to in time_offsets] - + # Truncate to num_requests. assert len(time_offsets) >= num_requests time_offsets = time_offsets[:num_requests] @@ -133,7 +257,7 @@ def arrival(r): def get_cli_requests( - args: Namespace, tokenizer: Any, sampling_params: Optional[SamplingParams] = None + args: Namespace, tokenizer: Any, sampling_params: Optional[SamplingParams] = None ) -> list[Request]: # Get time offsets. 
@@ -145,7 +269,7 @@ def get_cli_requests( ) # Init requests. - requests = [Request(p, t, tokenizer, sampling_params) for p, t in zip(args.prompts, t_offsets)] + requests = [Request(p, t, tokenizer, sampling_params) for p,t in zip(args.prompts, t_offsets)] return requests @@ -165,14 +289,18 @@ def get_synthetic_requests( # Build prompts with expected lengths. assert ( len(args.num_tokens_to_prompt) == 2 - and args.num_tokens_to_prompt[1] >= args.num_tokens_to_prompt[0] + and + args.num_tokens_to_prompt[1] >= args.num_tokens_to_prompt[0] ) max_prompt_length = args.num_tokens_to_prompt[1] max_prompt_text = "hi " * max_prompt_length max_prompt_tokens = tokenizer.tokenize(max_prompt_text) - prompt_lengths = [random.randint(*args.num_tokens_to_prompt) for _ in time_offsets] - prompt_tokens_list = [max_prompt_tokens[:l] for l in prompt_lengths] - prompt_texts = [tokenizer.detokenize(tt) for tt in prompt_tokens_list] + prompt_lengths = [ + random.randint(*args.num_tokens_to_prompt) + for _ in time_offsets + ] + prompt_tokens_list = [ max_prompt_tokens[:l] for l in prompt_lengths ] + prompt_texts = [ tokenizer.detokenize(tt) for tt in prompt_tokens_list ] # Init requests. assert len(prompt_texts) == len(time_offsets) @@ -212,15 +340,16 @@ def get_requests_from_file( # Get time offsets. time_offsets: list[float] = get_time_offsets( - args.seed, args.incoming_requests_per_step, args.incoming_requests_per_sec, len(prompts) + args.seed, + args.incoming_requests_per_step, + args.incoming_requests_per_sec, + len(prompts), ) # Init requests. 
requests = [ Request(p, t, tokenizer, sp) - for p, t, sp in tqdm( - zip(prompts, time_offsets, sampling_params_list), "init requests", total=len(prompts) - ) + for p, t, sp in tqdm(zip(prompts, time_offsets, sampling_params_list), "init requests", total=len(prompts)) ] return requests @@ -282,21 +411,19 @@ def build_dynamic_engine_setup_prefix( # Prompt description prompt_src_str = ( - "cli" - if args.prompts - else ( - "file" - if args.prompt_file - else f"synth({', '.join(map(str, args.num_tokens_to_prompt))})" - ) + "cli" if args.prompts else + "file" if args.prompt_file else + f"synth({', '.join(map(str, args.num_tokens_to_prompt))})" ) request_str = ( - f"requests: {prompt_src_str}, " f"n {len(requests):d}, g {args.num_tokens_to_generate:d}, " + f"requests: {prompt_src_str}, " + f"n {len(requests):d}, g {args.num_tokens_to_generate:d}, " ) request_str += ( - f"dur {args.incoming_requests_duration:.1e} " f"r/sec {args.incoming_requests_per_sec:.1e}" - if args.incoming_requests_per_step is None - else f"r/step {args.incoming_requests_per_step}" + f"dur {args.incoming_requests_duration:.1e} " + f"r/sec {args.incoming_requests_per_sec:.1e}" + if args.incoming_requests_per_step is None else + f"r/step {args.incoming_requests_per_step}" ) # Buffer limits config @@ -306,7 +433,14 @@ def build_dynamic_engine_setup_prefix( f"[r {context.max_requests}, t {context.max_tokens}]" ) - parts = [get_model_size_str(model), "dynamic", cg_str, uvm_str, request_str, buffer_limits_str] + parts = [ + get_model_size_str(model), + "dynamic", + cg_str, + uvm_str, + request_str, + buffer_limits_str, + ] return " | ".join(parts) @@ -322,4 +456,4 @@ def get_global_peak_memory_stats_bytes() -> dict: t = torch.tensor([peak_alloc], device="cuda", dtype=torch.int64) torch.distributed.all_reduce(t, op=torch.distributed.ReduceOp.MAX) peak_alloc = int(t[0].item()) - return {"mem-max-allocated-bytes": peak_alloc} + return {"mem-max-allocated-bytes": peak_alloc} \ No newline at end of file diff 
--git a/examples/rl/README.md b/examples/rl/README.md index 9c2de3ec088..34b6fafa517 100644 --- a/examples/rl/README.md +++ b/examples/rl/README.md @@ -94,7 +94,7 @@ MODEL_OPTIONS="\ --ckpt-format torch \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-requests $MAX_INFERENCE_BS \ + --inference-max-batch-size $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --untie-embeddings-and-output-weights \ --disable-bias-linear \ diff --git a/examples/rl/model_configs/llama3p1_8b_instruct.sh b/examples/rl/model_configs/llama3p1_8b_instruct.sh index 5398dad1a4e..24d285a6cf7 100644 --- a/examples/rl/model_configs/llama3p1_8b_instruct.sh +++ b/examples/rl/model_configs/llama3p1_8b_instruct.sh @@ -77,7 +77,7 @@ MODEL_OPTIONS="\ --ckpt-format torch_dist \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-requests $MAX_INFERENCE_BS \ + --inference-max-batch-size $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --add-qkv-bias \ --normalization RMSNorm \ @@ -101,7 +101,6 @@ MODEL_OPTIONS="\ --max-position-embeddings 131072 \ --tokenizer-type HuggingFaceTokenizer \ --tokenizer-model unsloth/Meta-Llama-3.1-8B-Instruct \ - --legacy-tokenizer \ --langrl-inference-server-type "inplace_megatron_chat" \ --langrl-inference-server-conversation-template "unsloth/Meta-Llama-3.1-8B-Instruct" \ --lr 3e-7 \ diff --git a/examples/rl/model_configs/nemotron5_56b.sh b/examples/rl/model_configs/nemotron5_56b.sh index 741cd054b73..fd2cc4f7212 100644 --- a/examples/rl/model_configs/nemotron5_56b.sh +++ b/examples/rl/model_configs/nemotron5_56b.sh @@ -58,7 +58,7 @@ MODEL_OPTIONS="\ --calculate-per-token-loss \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-requests $MAX_INFERENCE_BS \ + --inference-max-batch-size $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --fp8-format hybrid \ --fp8-amax-history-len 1 \ diff --git 
a/examples/rl/model_configs/nemotron5_8b.sh b/examples/rl/model_configs/nemotron5_8b.sh index 753d4e493a2..7b8947ae763 100644 --- a/examples/rl/model_configs/nemotron5_8b.sh +++ b/examples/rl/model_configs/nemotron5_8b.sh @@ -58,7 +58,7 @@ MODEL_OPTIONS="\ --calculate-per-token-loss \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-requests $MAX_INFERENCE_BS \ + --inference-max-batch-size $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --hybrid-override-pattern M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M- \ --spec megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \ diff --git a/examples/rl/model_configs/nemotron5p5_12b_H.sh b/examples/rl/model_configs/nemotron5p5_12b_H.sh index adbcc8d03f0..9e97051e087 100644 --- a/examples/rl/model_configs/nemotron5p5_12b_H.sh +++ b/examples/rl/model_configs/nemotron5p5_12b_H.sh @@ -65,7 +65,7 @@ MODEL_OPTIONS="\ --calculate-per-token-loss \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-requests $MAX_INFERENCE_BS \ + --inference-max-batch-size $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --fp8-recipe blockwise \ --fp8-format e4m3 \ diff --git a/examples/rl/model_configs/nemotron6_3b_moe.sh b/examples/rl/model_configs/nemotron6_3b_moe.sh index 7d98f4eda63..eff4f6cf0b3 100644 --- a/examples/rl/model_configs/nemotron6_3b_moe.sh +++ b/examples/rl/model_configs/nemotron6_3b_moe.sh @@ -85,7 +85,7 @@ MODEL_OPTIONS="\ --rl-importance-sampling-truncation-coef 10.0 \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-requests $MAX_INFERENCE_BS \ + --inference-max-batch-size $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --distributed-timeout-minutes 60 \ --use-mcore-models \ diff --git a/examples/rl/model_configs/qwen3_30b_a3b_moe.sh b/examples/rl/model_configs/qwen3_30b_a3b_moe.sh index eb55ba35cc6..775a9587ba4 100644 --- 
a/examples/rl/model_configs/qwen3_30b_a3b_moe.sh +++ b/examples/rl/model_configs/qwen3_30b_a3b_moe.sh @@ -37,7 +37,7 @@ ENV_DEPENDENT="\ MODEL_OPTIONS=" --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ ---inference-max-requests $MAX_INFERENCE_BS \ +--inference-max-batch-size $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --no-use-tokenizer-model-from-checkpoint-args \ --seq-length 8192 \ diff --git a/examples/rl/model_configs/qwen3_32b.sh b/examples/rl/model_configs/qwen3_32b.sh index c06c5f55b53..cd153a04f3c 100644 --- a/examples/rl/model_configs/qwen3_32b.sh +++ b/examples/rl/model_configs/qwen3_32b.sh @@ -38,7 +38,7 @@ MODEL_OPTIONS="\ --ckpt-format torch_dist \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-requests $MAX_INFERENCE_BS \ + --inference-max-batch-size $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --untie-embeddings-and-output-weights \ --num-layers 64 \ diff --git a/examples/rl/model_configs/qwen3_4b.sh b/examples/rl/model_configs/qwen3_4b.sh index 6f6c6b6bf57..da238511fd3 100644 --- a/examples/rl/model_configs/qwen3_4b.sh +++ b/examples/rl/model_configs/qwen3_4b.sh @@ -38,7 +38,7 @@ MODEL_OPTIONS="\ --ckpt-format torch_dist \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-requests $MAX_INFERENCE_BS \ + --inference-max-batch-size $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --num-layers 36 \ --hidden-size 2560 \ diff --git a/examples/rl/model_configs/qwen3_8b.sh b/examples/rl/model_configs/qwen3_8b.sh index 54ff7385331..6758cd84c3d 100644 --- a/examples/rl/model_configs/qwen3_8b.sh +++ b/examples/rl/model_configs/qwen3_8b.sh @@ -38,7 +38,7 @@ MODEL_OPTIONS="\ --ckpt-format torch_dist \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-requests $MAX_INFERENCE_BS \ + --inference-max-batch-size $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ 
--untie-embeddings-and-output-weights \ --num-layers 36 \ diff --git a/examples/rl/model_configs/qwen_2p5_32b.sh b/examples/rl/model_configs/qwen_2p5_32b.sh index 2a2a9ae2420..d82972ba477 100644 --- a/examples/rl/model_configs/qwen_2p5_32b.sh +++ b/examples/rl/model_configs/qwen_2p5_32b.sh @@ -59,7 +59,7 @@ MODEL_OPTIONS="\ --ckpt-format torch_dist \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-requests $MAX_INFERENCE_BS \ + --inference-max-batch-size $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --untie-embeddings-and-output-weights \ --disable-bias-linear \ diff --git a/examples/rl/model_configs/qwen_2p5_3b.sh b/examples/rl/model_configs/qwen_2p5_3b.sh index f3250f39ecc..246afae6ad2 100644 --- a/examples/rl/model_configs/qwen_2p5_3b.sh +++ b/examples/rl/model_configs/qwen_2p5_3b.sh @@ -62,7 +62,7 @@ MODEL_OPTIONS="\ --ckpt-format torch_dist \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-requests $MAX_INFERENCE_BS \ + --inference-max-batch-size $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --disable-bias-linear \ --add-qkv-bias \ diff --git a/examples/rl/model_configs/qwen_2p5_distill_7b.sh b/examples/rl/model_configs/qwen_2p5_distill_7b.sh index 1438bca0726..149ac77965f 100644 --- a/examples/rl/model_configs/qwen_2p5_distill_7b.sh +++ b/examples/rl/model_configs/qwen_2p5_distill_7b.sh @@ -44,7 +44,7 @@ MODEL_OPTIONS="\ --ckpt-format torch \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-requests $MAX_INFERENCE_BS \ + --inference-max-batch-size $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --untie-embeddings-and-output-weights \ --disable-bias-linear \ diff --git a/examples/rl/model_configs/qwen_2p5_math_7b.sh b/examples/rl/model_configs/qwen_2p5_math_7b.sh index b598bb127bd..1d631fa80a5 100644 --- a/examples/rl/model_configs/qwen_2p5_math_7b.sh +++ 
b/examples/rl/model_configs/qwen_2p5_math_7b.sh @@ -58,7 +58,7 @@ MODEL_OPTIONS="\ --ckpt-format torch \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-requests $MAX_INFERENCE_BS \ + --inference-max-batch-size $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --untie-embeddings-and-output-weights \ --disable-bias-linear \ diff --git a/megatron/core/inference/config.py b/megatron/core/inference/config.py deleted file mode 100644 index 5970b4f14f6..00000000000 --- a/megatron/core/inference/config.py +++ /dev/null @@ -1,186 +0,0 @@ -# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - -from dataclasses import dataclass -from typing import List, Optional, Tuple - -import torch - -from megatron.core.process_groups_config import ProcessGroupCollection -from megatron.core.transformer.module import MegatronModule -from megatron.core.utils import get_attr_wrapped_model - - -@dataclass -class MambaInferenceStateConfig: - """ - Config for initializing Mamba model inference state tensors. - - Note that we maintain separate metadata for decode, regular prefill, and - chunked prefill requests because the Mamba kernels do not yet support mixing - these. Once the kernels have been updated we can simplify this code. - """ - - layer_type_list: List[str] - """ - A list of strings that indicates the layer type (Mamba / Attention / MLP) for each layer. - See `megatron/core/ssm/mamba_hybrid_layer_allocation.py` for the list of symbols. 
- """ - - mamba_conv_states_shape: Tuple[int] - """Mamba conv states shape per request.""" - - mamba_ssm_states_shape: Tuple[int] - """Mamba ssm states shape per request.""" - - @classmethod - def from_model(cls, model: MegatronModule) -> Optional["MambaInferenceStateConfig"]: - """Returns Mamba inference state config from the model if it is a hybrid model.""" - from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols - - decoder = get_attr_wrapped_model(model, "decoder") - layer_type_list = getattr(decoder, "layer_type_list", None) - if layer_type_list is not None and Symbols.MAMBA in layer_type_list: - (mamba_conv_states_shape, mamba_ssm_states_shape) = ( - decoder.mamba_state_shapes_per_request() - ) - return cls( - layer_type_list=layer_type_list, - mamba_conv_states_shape=mamba_conv_states_shape, - mamba_ssm_states_shape=mamba_ssm_states_shape, - ) - return None - - -@dataclass -class InferenceConfig: - """ - Config for inference. - - NOTE: Must remain mutually exclusive with the `TransformerConfig`. - """ - - # ================================= - # KV cache config - # ================================= - block_size_tokens: int = 256 - """Size of KV cache block size.""" - - buffer_size_gb: int = 20 - """ - Buffer size reserved on the GPU for the KV cache. - If `unified_memory_level` >= 1, then CPU memory is additionally utilized, resulting in a total - buffer size of `buffer_size_gb + paused_buffer_size_gb`. - """ - - paused_buffer_size_gb: Optional[int] = None - """ - Portion of buffer reserved for paused requests. Active requests are paused when there are not - enough active blocks available to continue generating a request. The total buffer size - (active + paused) depends on `unified_memory_level` (uvm): - - uvm 0: buffer_size_gb (paused buffer is inclusive) - - uvm 1: buffer_size_gb + paused_buffer_size_gb - """ - - max_requests: Optional[int] = None - """ - Max number of active requests to use for decode-only forward passes. 
- This is primarily limited by the combination of `buffer_size_gb` and `max_sequence_length`. - """ - - max_tokens: Optional[int] = None - """ - Max number of tokens to use for forward passes. This is primarily limited by prefill activation - memory usage. (Defaults to 16384). - """ - - unified_memory_level: int = 0 - """ - Sets unified memory usage within the dynamic inference context. - The levels are: - 0) no unified memory (default) - 1) allocate `memory_buffer` in unified memory. - Eventually, additional levels will be included to control other tensors within the context. - """ - - offload_kv_cache: bool = False - """If True, offload KV cache during RL training.""" - - # ================================= - # CUDA graph config - # ================================= - num_cuda_graphs: Optional[int] = None - """ - Maximum number of cuda graphs to capture, where the cuda graph batch sizes range from 1 to - `max_requests`. Due to rounding, the actual number of cuda graphs may not equal this argument. - """ - - cuda_graph_mixed_prefill_count: Optional[int] = 16 - """ - The number of mixed prefill graphs to capture if mixed prefill/decode graphs are enabled. - """ - - use_cuda_graphs_for_non_decode_steps: bool = True - """ - Whether to use CUDA graphs for non-decode steps. - """ - - persist_cuda_graphs: bool = False - """ - Whether to persist CUDA graphs when the engine is suspended. - If False and `unified_memory_level` is 0, CUDA graphs are deleted on `suspend()` - and re-captured on `resume()` to save memory. 
- """ - - # ================================= - # Model config - # ================================= - max_sequence_length: int = 2560 - """Max possible sequence length (prompt + output) that will occur.""" - - mamba_inference_state_config: Optional[MambaInferenceStateConfig] = None - """The Mamba inference state config if the model is a hybrid model.""" - - pg_collection: Optional[ProcessGroupCollection] = None - """A `ProcessGroupCollection` for distributed execution.""" - - use_flashinfer_fused_rope: Optional[bool] = False - """ - If True, use flashinfer's fused rope implementation. - If None, defaults to using flash-infer if available. - """ - - materialize_only_last_token_logits: bool = True - """ - Whether to only materialize logits for the last token. This should be set to False - if returning log probs. - """ - - # ================================= - # Engine config - # ================================= - enable_chunked_prefill: bool = False - """Whether to enable chunked prefill.""" - - # ================================= - # Logging config - # ================================= - track_paused_request_events: bool = False - """ - Whether to track paused request events. If True, `add_event_pause()` is called on - requests when they are paused during bookkeeping. - """ - - metrics_writer: Optional["WandbModule"] = None - """Wandb module for writing metrics.""" - - logging_step_interval: int = 0 - """ - The step interval at which to log inference metrics to wandb. - Defaults to 0, which means no logging. - """ - - request_metadata_types: Optional[List[Tuple[str, torch.dtype, bool]]] = None - """ - A list of the per-request metadata types to track. Each entry is a tuple - consisting of the string label, the target dtype, and whether to store the data on GPU. 
- """ diff --git a/megatron/core/inference/contexts/attention_context/mamba_metadata.py b/megatron/core/inference/contexts/attention_context/mamba_metadata.py index 13179483f59..6cf45aeb9e1 100644 --- a/megatron/core/inference/contexts/attention_context/mamba_metadata.py +++ b/megatron/core/inference/contexts/attention_context/mamba_metadata.py @@ -1,12 +1,36 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -from typing import Optional +from dataclasses import dataclass +from typing import List, Optional, Tuple import torch from megatron.core.inference.batch_dimensions_utils import InferenceBatchDimensions +@dataclass +class MambaInferenceStateConfig: + """ + Config for initializing Mamba model inference state tensors. + + Note that we maintain separate metadata for decode, regular prefill, and + chunked prefill requests because the Mamba kernels do not yet support mixing + these. Once the kernels have been updated we can simplify this code. + """ + + layer_type_list: List[str] + """ + A list of strings that indicates the layer type (Mamba / Attention / MLP) for each layer. + See `megatron/core/ssm/mamba_hybrid_layer_allocation.py` for the list of symbols. + """ + + mamba_conv_states_shape: Tuple[int] + """Mamba conv states shape per request.""" + + mamba_ssm_states_shape: Tuple[int] + """Mamba ssm states shape per request.""" + + class MambaMetadata: """Manages the metadata tensors required for Mamba layers during inference.""" diff --git a/megatron/core/inference/contexts/base_context.py b/megatron/core/inference/contexts/base_context.py index 4f03726fe3d..3dfec6de3ad 100644 --- a/megatron/core/inference/contexts/base_context.py +++ b/megatron/core/inference/contexts/base_context.py @@ -2,8 +2,6 @@ import abc -from megatron.core.inference.config import InferenceConfig - class BaseInferenceContext(abc.ABC): """Base class for inference contexts. 
@@ -12,11 +10,13 @@ class BaseInferenceContext(abc.ABC): Extend this class for any future contexts types. """ - def __init__(self, inference_config: InferenceConfig): + def __init__(self, materialize_only_last_token_logits: bool): """ Args: + materialize_only_last_token_logits (bool): + If True, only the last-token logits will be extracted during decode """ - self.config = inference_config + self.materialize_only_last_token_logits = materialize_only_last_token_logits @abc.abstractmethod def is_static_batching(self) -> bool: diff --git a/megatron/core/inference/contexts/dynamic_context.py b/megatron/core/inference/contexts/dynamic_context.py index 915180a5ca2..5dc2d503097 100644 --- a/megatron/core/inference/contexts/dynamic_context.py +++ b/megatron/core/inference/contexts/dynamic_context.py @@ -4,19 +4,22 @@ import math import warnings from contextlib import nullcontext -from typing import List, Optional, Sequence, Tuple +from typing import TYPE_CHECKING, List, Optional, Sequence, Tuple -import torch # type: ignore -import torch.nn.functional as F # type: ignore -from torch import Tensor # type: ignore +import torch +import torch.nn.functional as F +from packaging.version import Version as PkgVersion +from torch import Tensor from megatron.core import parallel_state from megatron.core.inference.batch_dimensions_utils import ( CUDAGraphBatchDimensionBuilder, InferenceBatchDimensions, ) -from megatron.core.inference.config import InferenceConfig from megatron.core.inference.inference_request import DynamicInferenceRequest +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.unified_memory import ( UnifiedMemoryUnsupportedError, @@ -25,13 +28,13 @@ from megatron.core.inference.utils import tensor_swap from megatron.core.models.common.embeddings.rope_utils import apply_rotary_pos_emb from 
megatron.core.package_info import __version__ as mcore_version +from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.ssm.mamba_hybrid_layer_allocation import get_layer_maps_from_layer_type_list -from megatron.core.transformer import MLATransformerConfig, TransformerConfig -from megatron.core.utils import deprecate_args +from megatron.core.transformer import TransformerConfig from megatron.core.utils import divide as core_divide -from megatron.core.utils import get_pg_size, internal_api +from megatron.core.utils import get_attr_wrapped_model, get_pg_size, internal_api -from .attention_context.mamba_metadata import MambaMetadata +from .attention_context.mamba_metadata import MambaInferenceStateConfig, MambaMetadata from .attention_context.mha_metadata import GraphedMHAMetadata, NonGraphedMHAMetadata from .base_context import BaseInferenceContext from .dynamic_block_allocator import BlockAllocator @@ -42,7 +45,14 @@ triton_append_key_value_cache = None try: - import flashinfer # type: ignore # pylint: disable=unused-import + from packaging.version import Version as PkgVersion + + HAVE_PACKAGING = True +except: + HAVE_PACKAGING = False + +try: + import flashinfer # pylint: disable=unused-import HAVE_FLASHINFER = True except ImportError: @@ -56,36 +66,16 @@ except ImportError: HAVE_TORCH_MEMORY_SAVER = False -DEPRECATED_ARGS = [ - "params_dtype", - "num_layers", - "kv_channels", - "num_attention_heads", - "max_sequence_length", - "buffer_size_gb", - "paused_buffer_size_gb", - "max_requests", - "max_tokens", - "block_size_tokens", - "tensor_model_parallel_size", - "pipeline_model_parallel_size", - "pg_collection", - "cache_mla_latent", - "kv_lora_rank", - "qk_pos_emb_head_dim", - "num_cuda_graphs", - "materialize_only_last_token_logits", - "mamba_inference_state_config", - "use_cuda_graphs_for_non_decode_steps", - "use_flashinfer_fused_rope", - "unified_memory_level", - "cuda_graph_max_tokens", - "cuda_graph_mixed_prefill_count", - 
"metrics_writer", - "request_metadata_types", - "persist_cuda_graphs", - "offload_kv_cache", -] +try: + import wandb # pylint: disable=unused-import + + HAVE_WANDB = True +except ImportError: + HAVE_WANDB = False + wandb = None + +if TYPE_CHECKING: + import wandb as WandbModule class ContextOverflowError(Exception): @@ -223,45 +213,130 @@ class DynamicInferenceContext(BaseInferenceContext): given step, any unassigned blocks equate to unused space. Args: - model_config (TransformerConfig): Model config. - inference_config (InferenceConfig): Inference config. + params_dtype (torch.dtype): Dtype used for KV cache. + num_layers (int): Number of layers on this pipeline parallel rank. + kv_channels (int): Hidden dimension per attention head. + num_attention_heads (int): Number of attention heads. + max_sequence_length (int): Max possible sequence length (prompt + output) + that will occur. + buffer_size_gb (float): Buffer size reserved on the GPU for the KV cache. + if `unified_memory_level` >= 1, then CPU memory is additionally + utilized, resulting in a total buffer size of `buffer_size_gb + + paused_buffer_size_gb`. + paused_buffer_size_gb (float | None): Portion of buffer reserved for + paused requests. Active requests are paused when there are not enough + active blocks available to continue generating a request. The total + buffer size (active + paused) depends on `unified_memory_level` (uvm): + - uvm 0: buffer_size_gb (paused buffer is inclusive) + - uvm 1: buffer_size_gb + paused_buffer_size_gb + max_requests (int): Max number of active requests to use for + decode-only forward passes. This value is primarily limited by the + combination of `buffer_size_gb` and `max_sequence_length`. + max_tokens (int): Max number of tokens to use for forward passes. This is + primarily limited by prefill activation memory usage. (Defaults to + 16384). + block_size_tokens (int): Size of KV cache block size. + tensor_model_parallel_size (Optional[int]): Tensor model parallel size. 
+ num_cuda_graphs (Optional[int]): Maximum number of cuda graphs to capture, + where the cuda graph batch sizes range from 1 to `max_requests` + (as computed below). Due to rounding, the actual number of cuda graphs + may not equal this argument. + materialize_only_last_token_logits (Optional[bool]): Whether to only + materialize logits for the last token. This should be set to False + if returning log probs. + mamba_inference_state_config (Optional[MambaInferenceStateConfig]): The Mamba + inference state config if the model is a hybrid model. + use_cuda_graphs_for_non_decode_steps (bool): If True, use cuda graphs for non-decode + engine steps. + unified_memory_level (Optional[int]): Set unified memory usage within the + dynamic inference context. The levels are: 0) no unified memory, 1) + allocate `memory_buffer` in unified memory. Eventually, additional + levels will be included to control other tensors within the context. + use_flashinfer_fused_rope (bool): If True, use flashinfer's fused rope implementation. + If None, defaults to using flash-infer if available. + metrics_writer (Optional['WandbModule']): Wandb module for writing metrics. + request_metadata_types (Optional[List[Tuple[str, torch.dtype, bool]]]): A list of the + per-request metadata types to track. Each entry is a tuple consisting of the string + label, the target dtype, and whether to store the data on GPU. """ DEFAULT_MAX_TOKENS = 16384 TOKEN_ROUNDER = 64 REQUEST_ROUNDER = 4 - @deprecate_args( - *DEPRECATED_ARGS, - message=( - "Argument `{name}` has been deprecated. 
" - "Only pass `model_config` and `inference_config`" - ), - ) - def __init__(self, model_config: TransformerConfig, inference_config: InferenceConfig): - super().__init__(inference_config=inference_config) - - self.cache_mla_latent = ( - isinstance(model_config, MLATransformerConfig) and model_config.cache_mla_latents - ) + def __init__( + self, + *, + params_dtype: torch.dtype, + num_layers: int, + kv_channels: int, + num_attention_heads: int, + max_sequence_length: int, + buffer_size_gb: float, + paused_buffer_size_gb: float | None = None, + max_requests: int = None, + max_tokens: int = DEFAULT_MAX_TOKENS, + block_size_tokens: int = 256, + tensor_model_parallel_size: Optional[int] = None, + pipeline_model_parallel_size: Optional[int] = None, + pg_collection: Optional[ProcessGroupCollection] = None, + cache_mla_latent: bool = False, + kv_lora_rank: Optional[int] = None, + qk_pos_emb_head_dim: Optional[int] = None, + num_cuda_graphs: Optional[int] = None, + materialize_only_last_token_logits: Optional[bool] = True, + mamba_inference_state_config: Optional[MambaInferenceStateConfig] = None, + use_cuda_graphs_for_non_decode_steps: bool = True, + use_flashinfer_fused_rope: bool = False, + unified_memory_level: Optional[int] = 0, + cuda_graph_max_tokens: Optional[int] = None, + cuda_graph_mixed_prefill_count: Optional[int] = 16, + metrics_writer: Optional['WandbModule'] = None, + request_metadata_types: Optional[List[Tuple[str, torch.dtype, bool]]] = None, + persist_cuda_graphs: Optional[bool] = False, + offload_kv_cache: Optional[bool] = False, + ): + super().__init__(materialize_only_last_token_logits=materialize_only_last_token_logits) + + self.cache_mla_latent = cache_mla_latent if self.cache_mla_latent: assert ( - inference_config.block_size_tokens == 64 + block_size_tokens == 64 ), "Flash MLA requires a block size of 64. 
Set --inference-dynamic-batching-block-size 64 to fix this assert" + # give deprecated args warning for cuda_graph_max_tokens + if cuda_graph_max_tokens is not None: + warnings.warn( + "`cuda_graph_max_tokens` is deprecated and will be removed in a future release. " + "The context now automatically sets the max tokens for cuda graphs based on " + "`max_requests`.", + DeprecationWarning, + ) + + self.metrics_writer = metrics_writer + # Per partition num heads and hidden size. - num_attention_heads = model_config.num_query_groups or model_config.num_attention_heads - projection_size = model_config.kv_channels * num_attention_heads - pg_collection = inference_config.pg_collection - if pg_collection is not None: - tp_size = get_pg_size(pg_collection.tp) - pp_size = get_pg_size(pg_collection.pp) + projection_size = kv_channels * num_attention_heads + if tensor_model_parallel_size is None: + tp_size = ( + get_pg_size(pg_collection.tp) + if pg_collection is not None + else parallel_state.get_tensor_model_parallel_world_size() + ) else: - tp_size = model_config.tensor_model_parallel_size - pp_size = model_config.pipeline_model_parallel_size + tp_size = tensor_model_parallel_size self.hidden_size_per_attention_head = core_divide(projection_size, num_attention_heads) self.num_attention_heads_per_partition = core_divide(num_attention_heads, tp_size) + if pipeline_model_parallel_size is None: + pp_size = ( + get_pg_size(pg_collection.pp) + if pg_collection is not None + else parallel_state.get_pipeline_model_parallel_world_size() + ) + else: + pp_size = pipeline_model_parallel_size + # Cache the PP group we should use for PP collectives inside the context. # If the model provides a pg_collection with a pp group, prefer it. # Otherwise: @@ -282,7 +357,6 @@ def __init__(self, model_config: TransformerConfig, inference_config: InferenceC self.expert_model_parallel_group = None # Mamba states. 
- mamba_inference_state_config = inference_config.mamba_inference_state_config self.is_hybrid_model = mamba_inference_state_config is not None if self.is_hybrid_model: mamba_conv_states_shape = mamba_inference_state_config.mamba_conv_states_shape @@ -307,7 +381,7 @@ def __init__(self, model_config: TransformerConfig, inference_config: InferenceC self.layer_map = attention_layer_map | mamba_layer_map else: # The layer map is the identity function for pure Transformer models. - self.num_attention_layers = model_config.num_layers // pp_size + self.num_attention_layers = num_layers self.num_mamba_layers = 0 (self.mamba_conv_states_shape, self.mamba_ssm_states_shape) = (None, None) self.layer_map = {i: i for i in range(self.num_attention_layers)} @@ -318,11 +392,11 @@ def __init__(self, model_config: TransformerConfig, inference_config: InferenceC ) # Block size tokens, bytes. - dtype_size_bytes = model_config.params_dtype.itemsize - self.block_size_tokens = inference_config.block_size_tokens + dtype_size_bytes = params_dtype.itemsize + self.block_size_tokens = block_size_tokens if self.cache_mla_latent: # one vector c_t (rank) + optional RoPE phase slice - self.kv_reduced_dim = model_config.kv_lora_rank + model_config.qk_pos_emb_head_dim + self.kv_reduced_dim = kv_lora_rank + qk_pos_emb_head_dim self.block_size_bytes = ( dtype_size_bytes * self.num_attention_layers @@ -348,9 +422,9 @@ def __init__(self, model_config: TransformerConfig, inference_config: InferenceC mamba_states_memory_per_request *= dtype_size_bytes # Unified memory. 
- self.unified_memory_level = inference_config.unified_memory_level - self.persist_cuda_graphs = inference_config.persist_cuda_graphs - if self.unified_memory_level > 0: + self.unified_memory_level = unified_memory_level + self.persist_cuda_graphs = persist_cuda_graphs + if unified_memory_level > 0: try: self.unified_memory_mempool = create_unified_mempool() except UnifiedMemoryUnsupportedError: @@ -361,11 +435,9 @@ def __init__(self, model_config: TransformerConfig, inference_config: InferenceC self.unified_memory_level = 0 # Initialize block allocator. - buffer_size_bytes = int(inference_config.buffer_size_gb * 1024**3) + buffer_size_bytes = int(buffer_size_gb * 1024**3) paused_buffer_size_bytes = ( - 0 - if inference_config.paused_buffer_size_gb is None - else int(inference_config.paused_buffer_size_gb * 1024**3) + 0 if paused_buffer_size_gb is None else int(paused_buffer_size_gb * 1024**3) ) # TODO: Add parameter to control fraction of memory assigned to KV cache # versus Mamba state. @@ -399,14 +471,13 @@ def __init__(self, model_config: TransformerConfig, inference_config: InferenceC ) # Track request metadata. - request_metadata_types = inference_config.request_metadata_types if request_metadata_types is None: request_metadata_types = DynamicInferenceRequest.get_metadata_types() self.request_metadata_types = request_metadata_types # Initialize context state. - self.params_dtype = model_config.params_dtype - self.max_sequence_length = inference_config.max_sequence_length + self.params_dtype = params_dtype + self.max_sequence_length = max_sequence_length # Request and token counts. self.total_request_count = 0 @@ -426,16 +497,16 @@ def __init__(self, model_config: TransformerConfig, inference_config: InferenceC self.max_kv_block_count = math.ceil(self.max_sequence_length / self.block_size_tokens) # Set max_requests, max_tokens. 
- if inference_config.max_requests is None: + if max_requests is None: # Maximize compute utilization by defaulting to 1 block per request. self.max_requests = self.block_allocator.total_count - 1 # -1 for dummy block self.max_requests = self.max_requests // tp_size * tp_size self.max_requests = self.max_requests // self.REQUEST_ROUNDER * self.REQUEST_ROUNDER else: # User can control request overflow via max_requests. - self.max_requests = inference_config.max_requests + self.max_requests = max_requests - self.max_tokens = inference_config.max_tokens or self.DEFAULT_MAX_TOKENS + self.max_tokens = max_tokens or self.DEFAULT_MAX_TOKENS assert self.max_tokens >= self.max_requests, ( f"max_tokens ({self.max_tokens}) must be >= " @@ -467,39 +538,37 @@ def __init__(self, model_config: TransformerConfig, inference_config: InferenceC ) # CUDA graph config list - self.use_cuda_graphs_for_non_decode_steps = ( - inference_config.use_cuda_graphs_for_non_decode_steps - ) self.cuda_graph_batch_dimensions_list, self.cuda_graph_token_counts = ( CUDAGraphBatchDimensionBuilder.generate_cuda_graph_batch_dimensions_list( tp_size=tp_size, - num_cuda_graphs=inference_config.num_cuda_graphs, + num_cuda_graphs=num_cuda_graphs, cuda_graph_max_tokens=self.max_requests, - cuda_graph_mixed_prefill_count=inference_config.cuda_graph_mixed_prefill_count, + cuda_graph_mixed_prefill_count=cuda_graph_mixed_prefill_count, max_requests=self.max_requests, max_tokens=self.max_tokens, max_sequence_length=self.max_sequence_length, - use_cuda_graphs_for_non_decode_steps=self.use_cuda_graphs_for_non_decode_steps, + use_cuda_graphs_for_non_decode_steps=use_cuda_graphs_for_non_decode_steps, ) ) # Whether to offload the KV cache. Determines where the KV cache is allocated within memory. 
- self.offload_kv_cache = inference_config.offload_kv_cache + self.offload_kv_cache = offload_kv_cache assert not ( self.offload_kv_cache and self.unified_memory_level ), "The KV cache should not be instantiated in unified memory when it is offloaded during training." self._using_cuda_graph_this_step = False + self.use_cuda_graphs_for_non_decode_steps = use_cuda_graphs_for_non_decode_steps # Deal with chunked prefill self.chunked_prefill_request_id = -1 self.has_explicit_chunked_prefill_req = False # FlashInfer. - if inference_config.use_flashinfer_fused_rope is True: + if use_flashinfer_fused_rope is True: assert HAVE_FLASHINFER, "flashinfer is not installed" - elif inference_config.use_flashinfer_fused_rope is None: - inference_config.use_flashinfer_fused_rope = HAVE_FLASHINFER - self.use_flashinfer_fused_rope = inference_config.use_flashinfer_fused_rope + elif use_flashinfer_fused_rope is None: + use_flashinfer_fused_rope = HAVE_FLASHINFER + self.use_flashinfer_fused_rope = use_flashinfer_fused_rope # Allocate GPU state. 
self.is_tensor_state_allocated = False @@ -687,7 +756,14 @@ def deallocate_all_tensors(self): @classmethod def round_up_tokens(cls, value, tp_size=None): - """Round up to nearest multiple of `TOKEN_ROUNDER` that is also divisible by tensor model parallel size.""" + """Round up to nearest multiple of `TOKEN_ROUNDER` (above) that is also divisible by tensor model parallel size.""" + if not HAVE_PACKAGING: + raise ImportError( + "`packaging` is required for this functionality, please install it with `pip install packaging`" + ) + if PkgVersion(mcore_version) < PkgVersion("0.13"): + return cls.round_up(value) + # Make sure divisible by TP size if tp_size is None: # Check if parallel state is initialized before trying to get TP size @@ -699,9 +775,72 @@ def round_up_tokens(cls, value, tp_size=None): return token_rounder * int(math.ceil(int(value) / token_rounder)) + @classmethod + def from_config( + cls, + inference_config: InferenceWrapperConfig, + model, + max_batch_size: int, + buffer_size_gb: float = 40, + num_cuda_graphs: int = None, + mamba_inference_state_config: Optional[MambaInferenceStateConfig] = None, + unified_memory_level: int = 0, + ): + """ + Instantiate a `DynamicInferenceContext` from a `TransformerConfig` and an `InferenceWrapperConfig`. + """ + # TODO: Add other necessary configs from inference_config + + # Max sequence length. + position_embedding_type = get_attr_wrapped_model(model, "position_embedding_type") + model_max_seq_len = get_attr_wrapped_model(model, "max_sequence_length") + inf_max_seq_len = inference_config.inference_max_seq_length + + if position_embedding_type == "learned_absolute": + # When using absolute position embeddings, it is critical that the + # context's `max_sequence_length` is less than or equal to the model's + # `max_sequence_length`. Otherwise, the context's `position_ids` will + # contain ids greater than the dimension of the position embedding + # tensor, which will result in an index error. 
+ if inf_max_seq_len: + max_sequence_length = min(model_max_seq_len, inf_max_seq_len) + else: + max_sequence_length = model_max_seq_len + assert max_batch_size <= model_max_seq_len + else: + max_sequence_length = ( + inference_config.inference_max_seq_length or model_config.max_sequence_length + ) + max_sequence_length = max(max_sequence_length, max_batch_size) + + # Context. + model_config = model.config + return cls( + params_dtype=inference_config.params_dtype, + num_layers=model_config.num_layers // model_config.pipeline_model_parallel_size, + kv_channels=model_config.kv_channels, + num_attention_heads=model_config.num_query_groups, + tensor_model_parallel_size=model_config.tensor_model_parallel_size, + pipeline_model_parallel_size=model_config.pipeline_model_parallel_size, + max_sequence_length=max_sequence_length, + buffer_size_gb=buffer_size_gb, + materialize_only_last_token_logits=False, + num_cuda_graphs=num_cuda_graphs, + use_flashinfer_fused_rope=None, + mamba_inference_state_config=mamba_inference_state_config, + unified_memory_level=unified_memory_level, + ) + @classmethod def round_up_requests(cls, value, tp_size=None): - """Round up to nearest multiple of `REQUEST_ROUNDER` that is also divisible by tensor model parallel size.""" + """Round up to nearest multiple of `REQUEST_ROUNDER` (above) that is also divisible by tensor model parallel size.""" + if not HAVE_PACKAGING: + raise ImportError( + "`packaging` is required for this functionality, please install it with `pip install packaging`" + ) + if PkgVersion(mcore_version) < PkgVersion("0.13"): + return cls.round_up(value) + # Make sure divisible by TP size if tp_size is None: # Check if parallel state is initialized before trying to get TP size @@ -713,6 +852,16 @@ def round_up_requests(cls, value, tp_size=None): return request_rounder * int(math.ceil(int(value) / request_rounder)) + @classmethod + def round_up(cls, value): + """Deprecated in favor of round_up_tokens and round_up_requests.""" + 
warnings.warn( + "`round_up` is deprecated in favor of `round_up_tokens` or `round_up_requests` " + "and will be removed in `megatron-core` 0.14." + ) + ROUNDER = getattr(cls, "ROUNDER", 64) + return ROUNDER * int(math.ceil(int(value) / ROUNDER)) + def is_static_batching(self) -> bool: """Is static batching? False.""" return False @@ -733,7 +882,6 @@ def has_unfinished_requests(self) -> bool: def cu_query_lengths(self) -> Tuple[Tensor, int]: """Cumulative query sequence lengths.""" - assert self.active_attn_metadata is not None return ( self.active_attn_metadata["mha_metadata"].state_data["cu_query_seq_lengths"], self.active_attn_metadata["mha_metadata"].state_data["max_seqlen_q"], @@ -741,7 +889,6 @@ def cu_query_lengths(self) -> Tuple[Tensor, int]: def cu_kv_lengths(self) -> Tuple[Tensor, Tensor, int]: """Cumulative key/value sequence lengths.""" - assert self.active_attn_metadata is not None return ( self.active_attn_metadata["mha_metadata"].state_data["cu_kv_seq_lengths"], self.active_attn_metadata["mha_metadata"].state_data["kv_seq_lengths"], @@ -811,20 +958,18 @@ def append_key_value_cache(self, layer_number: int, key: Tensor, value: Tensor) : self.padded_active_token_count ] - def key_value_cache(self, layer_number: int) -> Tuple[Tensor, Optional[Tensor], Tensor]: + def key_value_cache(self, layer_number: int) -> Tuple[Tensor, Tensor]: """Read from KV cache. Args: layer_number (int): Layer number. Return: - (Tuple[Tensor, Tensor, Tensor]) The key and value pointer tensors that point - to blocks within the block-level memory buffer as well as the block table. + (Tuple[Tensor, Tensor]) The key and value pointer tensors that point + to blocks within the block-level memory buffer. 
""" attention_layer_number = self.layer_map[layer_number - 1] - assert self.active_attn_metadata is not None - if self.cache_mla_latent: return ( self.memory_buffer[attention_layer_number], @@ -1241,9 +1386,9 @@ def initialize_attention_state( ] = 0 self.active_attn_metadata = ( - self.graph_attn_metadata # type: ignore[assignment] + self.graph_attn_metadata if self.using_cuda_graph_this_step() - else self.non_graph_attn_metadata # type: ignore[assignment] + else self.non_graph_attn_metadata ) # Update cu_query_seq_lengths, max_seqlen_q. @@ -1268,7 +1413,6 @@ def initialize_attention_state( has_explicit_chunked_prefill_req=False, ) - assert self.active_attn_metadata is not None self.active_attn_metadata["mha_metadata"].update( request_query_lengths=query_lengths_view, request_kv_length_offsets=request_kv_length_offsets_view, @@ -1401,7 +1545,7 @@ def last_token_logits(self, logits: Tensor) -> Tensor: return last_token_logits - def check_availability(self, req: DynamicInferenceRequest) -> Tuple[bool, bool, bool]: + def check_availability(self, req: DynamicInferenceRequest) -> (bool, bool, bool): """ Check if the request can be added to the context. """ @@ -1640,7 +1784,7 @@ def resume_paused_requests( active_request_count: int, newly_paused_request_ids: torch.Tensor, next_tokens: torch.Tensor, - ) -> tuple[int, torch.Tensor]: + ) -> tuple[int, int, torch.Tensor]: """Resume as many paused requests as we have space for in the active buffer. Args: @@ -1719,7 +1863,7 @@ def resume_paused_requests( def evict_overflow_paused_requests( self, active_request_count: int, next_tokens: torch.Tensor - ) -> Optional[tuple[torch.Tensor, torch.Tensor]]: + ) -> tuple[torch.Tensor, torch.Tensor]: """Evict requests that overflow the paused buffer. 
Args: diff --git a/megatron/core/inference/contexts/static_context.py b/megatron/core/inference/contexts/static_context.py index a15b33c414a..8c83d2f09b3 100644 --- a/megatron/core/inference/contexts/static_context.py +++ b/megatron/core/inference/contexts/static_context.py @@ -1,6 +1,8 @@ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -from megatron.core.inference.config import InferenceConfig +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) from .base_context import BaseInferenceContext @@ -17,8 +19,7 @@ class StaticInferenceContext(BaseInferenceContext): def __init__( self, max_batch_size: int, max_sequence_length: int, use_flashinfer_fused_rope: bool = None ): - config = InferenceConfig(materialize_only_last_token_logits=True) - super().__init__(inference_config=config) + super().__init__(materialize_only_last_token_logits=True) self.max_sequence_length = max_sequence_length self.max_batch_size = max_batch_size self.sequence_len_offset = 0 @@ -26,6 +27,13 @@ def __init__( self.key_value_memory_dict = {} self.decode_mode = False + @classmethod + def from_config(cls, config: InferenceWrapperConfig) -> "StaticInferenceContext": + """Initialize context from a config.""" + max_batch_size = config.inference_max_requests + max_sequence_length = config.inference_max_seq_length + return cls(max_batch_size, max_sequence_length) + def swap_key_value_dict(self, batch_idx): "swap between batches" if len(self.key_value_memory_dict) == 0: diff --git a/megatron/core/inference/engines/dynamic_engine.py b/megatron/core/inference/engines/dynamic_engine.py index 882db6b3a6a..0a95e8f4a53 100644 --- a/megatron/core/inference/engines/dynamic_engine.py +++ b/megatron/core/inference/engines/dynamic_engine.py @@ -42,7 +42,6 @@ from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.transformer.cuda_graphs import delete_cuda_graphs from megatron.core.utils import ( - 
deprecate_args, experimental_api, get_asyncio_loop, get_pg_rank, @@ -90,14 +89,6 @@ except ImportError: HAVE_PSUTIL = False -DEPRECATED_ARGS = [ - "enable_cuda_graph", - "random_seed", - "track_paused_request_events", - "enable_chunked_prefill", - "inference_logging_step_interval", - "pg_collection", -] from megatron.core.inference.contexts.dynamic_context import HAVE_TORCH_MEMORY_SAVER if HAVE_TORCH_MEMORY_SAVER: @@ -145,13 +136,24 @@ class DynamicInferenceEngine(AbstractEngine): outputs and detokenizer the output tokens. inference_context (DynamicInferenceContext): Context for managing in-flight batching and a dynamic block-level KV cache (similar to paged attention). + random_seed (Optional[int]): Use a random seed if you want deterministic + results. Defaults to None. + inference_logging_step_interval (int): The step interval at which to log + inference metrics to wandb. Defaults to 0, which means no logging. """ - @deprecate_args( - *DEPRECATED_ARGS, - message="Argument `{name}` has been deprecated. Only pass `controller` and `context`", - ) - def __init__(self, controller: TextGenerationController, context: DynamicInferenceContext): + def __init__( + self, + controller: TextGenerationController, + context: DynamicInferenceContext, + enable_cuda_graph: Optional[bool] = None, + random_seed: Optional[int] = None, + *, + track_paused_request_events: bool = False, + enable_chunked_prefill: bool = True, + inference_logging_step_interval: int = 0, + pg_collection: Optional[ProcessGroupCollection] = None, + ): assert isinstance( controller, TextGenerationController @@ -159,28 +161,40 @@ def __init__(self, controller: TextGenerationController, context: DynamicInferen assert isinstance( context, DynamicInferenceContext ), f"context must be a DynamicInferenceContext, got {type(context)}" + assert isinstance(random_seed, int), f"random_seed must be an int, got {type(random_seed)}" + + # Deprecate `enable_cuda_graph`. 
+ if enable_cuda_graph is not None: + warnings.warn( + "The `enable_cuda_graph` argument is deprecated and will be " + "removed in `megatron-core 0.15`. `enable_cuda_graph` is now " + "read directly from the transformer config object." + ) + self.enable_cuda_graph = enable_cuda_graph + else: + self.enable_cuda_graph = ( + controller.inference_wrapped_model.model.config.enable_cuda_graph + ) - model_config = controller.inference_wrapped_model.model.config - inference_config = context.config - - if inference_config.pg_collection is not None: - self.pg_collection = inference_config.pg_collection + if pg_collection is not None: + self.pg_collection = pg_collection else: self.pg_collection = ProcessGroupCollection.use_mpu_process_groups() # Initialization options. self.controller = controller self.context = context - self.track_paused_request_events = inference_config.track_paused_request_events - self.enable_chunked_prefill = inference_config.enable_chunked_prefill - self.metrics_writer = inference_config.metrics_writer - self.logging_step_interval = inference_config.logging_step_interval - self.unified_memory_level = inference_config.unified_memory_level - self.persist_cuda_graphs = inference_config.persist_cuda_graphs - self.materialize_only_last_token_logits = ( - inference_config.materialize_only_last_token_logits - ) - self.cuda_graph_impl = model_config.cuda_graph_impl + self.random_seed = random_seed + self.track_paused_request_events = track_paused_request_events + self.enable_chunked_prefill = enable_chunked_prefill + self.inference_logging_step_interval = inference_logging_step_interval + self.unified_memory_level = context.unified_memory_level + self.persist_cuda_graphs = context.persist_cuda_graphs + + if enable_cuda_graph is not None: + self.cuda_graph_impl = "local" if enable_cuda_graph else "none" + else: + self.cuda_graph_impl = controller.inference_wrapped_model.model.config.cuda_graph_impl # Initialize engine. 
self.reset() @@ -191,12 +205,12 @@ def __init__(self, controller: TextGenerationController, context: DynamicInferen ) # Configure wandb to use separate step counter for inference metrics (only once) - if self.logging_step_interval > 0 and self.metrics_writer is not None: + if self.inference_logging_step_interval > 0 and self.context.metrics_writer is not None: logging.info( f"\033[1;93m[INFERENCE]\033[0m " f"\033[1;95mLogging inference metrics to wandb (rank {self.rank})\033[0m" ) - if HAVE_WANDB and self.metrics_writer.__name__ == "wandb": + if HAVE_WANDB and self.context.metrics_writer.__name__ == "wandb": # Make all inference/* metrics use inference_step as their x-axis # This allows inference and training to have independent step counters context.metrics_writer.define_metric( @@ -274,6 +288,8 @@ def create_cuda_graphs(self, reset_context: bool = True): context = self.context controller = self.controller + config = controller.inference_wrapped_model.inference_wrapper_config + time_start = time.time() mem_stats_start = torch.cuda.memory_stats() @@ -717,7 +733,7 @@ def _add_request( request.sampling_params.return_log_probs and not request.sampling_params.skip_prompt_log_probs ): - assert not self.materialize_only_last_token_logits, ( + assert not self.context.materialize_only_last_token_logits, ( "Prompt log probs cannot be calculated if only last token logits are materialized. " "Set materialize_only_last_token_logits to False in DynamicInferenceContext " "or skip_prompt_log_probs to True in SamplingParams." 
@@ -906,7 +922,7 @@ def post_process_requests( # For chunked prefill with materialize_only_last_token_logits, discard intermediate log probs if ( request_id == self.context.chunked_prefill_request_id - and self.materialize_only_last_token_logits + and self.context.materialize_only_last_token_logits ): request.prompt_log_probs = [] request.generated_log_probs = [] @@ -1186,10 +1202,10 @@ async def async_forward(self) -> Tuple[Dict, Dict, float, int]: range_pop() if ( - self.logging_step_interval > 0 + self.inference_logging_step_interval > 0 and self.step_count > 0 - and self.step_count % self.logging_step_interval == 0 - and self.metrics_writer is not None + and self.step_count % self.inference_logging_step_interval == 0 + and self.context.metrics_writer is not None ): kvcache_util_stats = self.context.get_kvcache_utilization_stats() else: @@ -1322,13 +1338,18 @@ async def async_bookkeep( else: metrics[f'inference/{key}'] = value - if HAVE_WANDB and self.metrics_writer.__name__ == "wandb": - self.metrics_writer.log(metrics, commit=True) + if HAVE_WANDB and self.context.metrics_writer.__name__ == "wandb": + self.context.metrics_writer.log(metrics, commit=True) else: - raise ValueError(f"Unsupported metrics writer type: {type(self.metrics_writer)}") + raise ValueError( + f"Unsupported metrics writer type: {type(self.context.metrics_writer)}" + ) # Print context state. 
- if self.logging_step_interval > 0 and step_count % self.logging_step_interval == 0: + if ( + self.inference_logging_step_interval > 0 + and step_count % self.inference_logging_step_interval == 0 + ): mem = torch.cuda.memory_stats() step_type = "decode" if context_state["is_decode_only"] else "non-decode" output_str = ( diff --git a/megatron/core/inference/engines/static_engine.py b/megatron/core/inference/engines/static_engine.py index 5ae37d5967e..d4c61965d2b 100644 --- a/megatron/core/inference/engines/static_engine.py +++ b/megatron/core/inference/engines/static_engine.py @@ -8,8 +8,7 @@ import torch from megatron.core.inference.async_stream import AsyncStream -from megatron.core.inference.config import InferenceConfig, MambaInferenceStateConfig -from megatron.core.inference.contexts import DynamicInferenceContext, StaticInferenceContext +from megatron.core.inference.contexts.dynamic_context import DynamicInferenceContext from megatron.core.inference.engines.abstract_engine import AbstractEngine from megatron.core.inference.engines.dynamic_engine import DynamicInferenceEngine from megatron.core.inference.inference_request import InferenceRequest @@ -18,7 +17,7 @@ from megatron.core.inference.text_generation_controllers.text_generation_controller import ( TextGenerationController, ) -from megatron.core.utils import get_asyncio_loop +from megatron.core.utils import get_asyncio_loop, get_mamba_inference_state_config_from_model try: from tqdm import tqdm @@ -43,6 +42,8 @@ class StaticInferenceEngine(AbstractEngine): controller that will be used to define how to preprocess prompts, generate outputs and detokenizer the output tokens. max_batch_size (int, optional): The maximum number of requests to process at once. + Will be set from the InferenceWrapperConfig in `text_generation_controller` by + default. random_seed (int, optional): Use a random seed if you want deterministic results. Defaults to None. 
""" @@ -68,55 +69,53 @@ def __init__( DeprecationWarning, ) + inference_wrapper_config = ( + text_generation_controller.inference_wrapped_model.inference_wrapper_config + ) self.controller = text_generation_controller - self.inference_wrapped_model = self.controller.inference_wrapped_model - self.config = self.inference_wrapped_model.config self.random_seed = random_seed or 1234 - # Store original context in case we need to fall back to legacy static engine - original_context = self.inference_wrapped_model.inference_context - assert original_context is not None - assert isinstance(original_context, StaticInferenceContext) - + inference_max_batch_size = inference_wrapper_config.inference_max_requests if max_batch_size is None: - max_batch_size = original_context.max_batch_size - elif max_batch_size > original_context.max_batch_size: + max_batch_size = inference_max_batch_size + elif max_batch_size > inference_max_batch_size: warnings.warn( f"Engine `max_batch_size` ({max_batch_size}) > " - f"`context.max_batch_size` in `inference_wrapped_model.inference_context` " - f"({original_context.max_batch_size}); setting `max_batch_size` to " - f"{original_context.max_batch_size}", + f"`inference_max_requests` in `inference_wrapper_config` " + f"({inference_max_batch_size}); setting `max_batch_size` to " + f"{inference_max_batch_size}", UserWarning, ) - max_batch_size = original_context.max_batch_size + max_batch_size = inference_max_batch_size self.scheduler = Scheduler(max_batch_size=max_batch_size) - mamba_inference_state_config = MambaInferenceStateConfig.from_model( - self.inference_wrapped_model.model + # Store original context in case we need to fall back to legacy static engine + original_context = text_generation_controller.inference_wrapped_model.inference_context + + mamba_inference_state_config = get_mamba_inference_state_config_from_model( + text_generation_controller.inference_wrapped_model.model ) try: if not legacy: - dynamic_context = 
DynamicInferenceContext( - model_config=self.config, - inference_config=InferenceConfig( - max_sequence_length=original_context.max_sequence_length, - buffer_size_gb=buffer_size_gb, - mamba_inference_state_config=mamba_inference_state_config, - max_requests=max_batch_size, - num_cuda_graphs=1, - block_size_tokens=256, - unified_memory_level=0, - ), + dynamic_context = DynamicInferenceContext.from_config( + inference_config=inference_wrapper_config, + model=text_generation_controller.inference_wrapped_model.model, + max_batch_size=max_batch_size, + buffer_size_gb=buffer_size_gb, + num_cuda_graphs=1, + mamba_inference_state_config=mamba_inference_state_config, ) - self.controller.inference_wrapped_model.inference_context = dynamic_context self.controller.inference_wrapped_model.prep_model_for_inference() self.controller._init_dynamic_sampling_tensors() self.dynamic_engine = DynamicInferenceEngine( - controller=self.controller, context=dynamic_context + controller=self.controller, + random_seed=self.random_seed, + context=dynamic_context, + enable_cuda_graph=True, ) except Exception as e: # Get exception details for better debugging @@ -230,20 +229,13 @@ def generate_using_dynamic_engine( if prompts: if add_BOS: sampling_params.add_BOS = True - request_records = self.dynamic_engine.generate( - prompts=prompts, sampling_params=sampling_params - ) + return self.dynamic_engine.generate(prompts=prompts, sampling_params=sampling_params) elif inference_requests: prompts = [request.prompt for request in inference_requests] sampling_params = inference_requests[0].sampling_params if add_BOS: sampling_params.add_BOS = True - request_records = self.dynamic_engine.generate( - prompts=prompts, sampling_params=sampling_params - ) - - # Return the underlying `InferenceRequest` objects from the `DynamicInferenceRequestRecord`s. 
- return [record.merge() for record in request_records] + return self.dynamic_engine.generate(prompts=prompts, sampling_params=sampling_params) def generate_using_legacy_static_engine( self, diff --git a/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py b/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py index 6ef5ac3a2e5..6a17de685bf 100644 --- a/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +++ b/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py @@ -1,6 +1,8 @@ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. import abc +import math +import warnings from typing import Any, Dict, Iterable, Optional, Union import torch @@ -13,22 +15,27 @@ send_to_next_pipeline_rank, ) from megatron.core.inference.contexts import BaseInferenceContext +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) from megatron.core.models.gpt.gpt_model import GPTModel from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.utils import get_attr_wrapped_model, get_model_config +# pylint: disable=line-too-long class AbstractModelInferenceWrapper(abc.ABC): """Abstract inference wrapper Extend this to create a version for your model. - The wrapper prepares the model for inference, provides the required input data and - runs the forward pass. + The wrapper prepares the model for inference, provides the required input data and runs the forward pass. Args: model (Union[GPTModel, LegacyGPTModel]): The actual GPT model (MCore or MLM). + inference_wrapper_config (InferenceWrapperConfig): Has info like + hidden size, vocab size etc. inference_context (BaseInferenceContext): Context for managing KV cache and other inference params. pg_collection (ProcessGroupCollection): Process groups for model communication. 
@@ -37,18 +44,30 @@ class AbstractModelInferenceWrapper(abc.ABC): def __init__( self, model: Union['LegacyGPTModel', GPTModel], # type: ignore[name-defined] - inference_context: BaseInferenceContext, + inference_wrapper_config: InferenceWrapperConfig, + inference_context: Optional[BaseInferenceContext] = None, pg_collection: Optional[ProcessGroupCollection] = None, ): assert not isinstance( model, Iterable ), 'interleaving schedule is not supported for inference' self.model = model - self.config = get_model_config(self.model) + self.inference_wrapper_config = inference_wrapper_config self.pipeline_communication_dtype = ( - torch.float if self.config.fp32_residual_connection else self.config.params_dtype + torch.float + if self.inference_wrapper_config.fp32_residual_connection + else self.inference_wrapper_config.params_dtype ) - self.sequence_parallel = self.config.sequence_parallel + model_config = get_model_config(self.model) + self.sequence_parallel = model_config.sequence_parallel + + if inference_context is None: + warnings.warn( + "`inference_context` must be passed in as an argument starting in `megatron-core` 0.13." + ) + from megatron.core.inference.contexts import StaticInferenceContext + + inference_context = StaticInferenceContext.from_config(inference_wrapper_config) self.inference_context = inference_context @@ -59,18 +78,40 @@ def __init__( self.pp_group = pg_collection.pp self.tp_size = torch.distributed.get_world_size(self.tp_group) - if self.config.fp8 is not None: + if self.inference_wrapper_config.fp8 is not None: self.model = prepare_model_for_fp8_inference(self.model) - # TODO(ksanthanam): Add support for fp4 + @property + def inference_params(self): + """Getter for deprecated `inference_params`.""" + warnings.warn( + "`inference_params` renamed to `inference_context`, and will be removed in `megatron-core` 0.13." 
+ ) + return self.inference_context - def prep_model_for_inference(self): + @inference_params.setter + def inference_params(self, value): + """Setter for deprecated `inference_params`.""" + warnings.warn( + "`inference_params` renamed to `inference_context`, and will be removed in `megatron-core` 0.13." + ) + self.inference_context = value + + def prep_model_for_inference(self, prompts_tokens: Optional[torch.Tensor] = None): """A utility function for preparing model for inference The function gets called once before the auto regressive inference loop. It puts the model in eval mode. + Args: + prompts_tokens (torch.Tensor, optional): Deprecated, will be removed in `megatron-core` 0.13 """ + if prompts_tokens is not None: + warnings.warn( + "Passing `prompts_tokens` is deprecated and this argument will be ignored." + "This parameter will be removed in `megatron-core` 0.13." + ) + self.model.eval() # For TP only model both is_pp_first_stage and _is_pp_last_stage returns True @@ -96,9 +137,7 @@ def prep_inference_input(self, prompt_tokens) -> Dict[str, Any]: def get_batch_for_context_window(self, *args, **kwargs) -> Dict[str, Any]: """Returns the input data for inference - This function gets called iteratively in the inference loop. - It can be used to extract relevant input from the prompt tokens, attention mask etc. - required for each step in inference. + This function gets called iteratively in the inference loop . It can be used to extract relevant input from the prompt tokens, attention mask etc. required for each step in inference. """ raise NotImplementedError() @@ -144,16 +183,15 @@ def _get_batch_size_and_seq_len( self, tokens: torch.Tensor, recv_buffer_seq_len: Optional[int] = None ): """ - Returns the batch size and sequence length based on the tokens tensor and - recv_buffer_seq_len. + Returns the batch size and sequence length based on the tokens tensor and recv_buffer_seq_len. Args: tokens (torch.Tensor): The input tensor of shape (batch_size, seq_len). 
recv_buffer_seq_len (int, optional): An optional recv buffer sequence length. Returns: - tuple: A tuple (batch_size, seq_len), where batch_size is the first dimension of - tokens and seq_len is either the second dimension or recv_buffer_seq_len. + tuple: A tuple (batch_size, seq_len), where batch_size is the first dimension of tokens + and seq_len is either the second dimension or recv_buffer_seq_len. """ batch_size = tokens.shape[0] seq_len = recv_buffer_seq_len if recv_buffer_seq_len is not None else tokens.shape[1] @@ -166,7 +204,7 @@ def _allocate_recv_buffer(self, batch_size, seq_len): # sequence parallelism. Static batching does not support sequence parallelism # except for the MoE layers which is handled separately. seq_len = seq_len // self.tp_size - recv_size = (seq_len, batch_size, self.config.hidden_size) + recv_size = (seq_len, batch_size, self.inference_wrapper_config.hidden_size) return torch.empty( recv_size, dtype=self.pipeline_communication_dtype, device=torch.cuda.current_device() ) @@ -176,12 +214,10 @@ def forward_pass_without_pipeline_parallel( ) -> torch.Tensor: """Utility to carry out simple forward pass for TP or no model parallel models - Runs a very simple forward pass for model. Used in the case of models without any - parallelism or only tensor parallelism. + Runs a very simple forward pass for model. Used in the case of models without any parallelism or only tensor parallelism. 
Args: - inference_input (Dict[str, Any]): A dict containg the inputs for the gpt model - [tokens, position ids, attention mask] + inference_input (Dict[str, Any]): A dict containg the inputs for the gpt model [tokens, position ids, attention mask] Returns: torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size] @@ -192,18 +228,16 @@ def forward_pass_without_pipeline_parallel( return logits - def forward_pass_with_pipeline_parallel( + def forward_pass_with_pipeline_parallel_small_input_batch( self, inference_input: Dict[str, Any], recv_buffer_seq_len: Optional[int] = None ) -> torch.Tensor: - """Utility to carry out forward pass for PP models + """Utility to carry out forward pass for PP models with very small inputs - TODO: Add support for asynchronous microbatches + If a model is pipeline parallel, yet, the input global batch is very small, we compute a foward pass on the entire global batch, rather than splitting it up into micro batches and doing something more complex as in the forward_pass_with_pipeline_parallel_large_input_batch method Args: - inference_input (Dict[str, Any]): A dict containing the inputs for the gpt model - [tokens, position ids, attention mask] - recv_buffer_seq_len (int): An optional sequence length for the pipeline parallel - recv buffer. + inference_input (Dict[str, Any]): A dict containing the inputs for the gpt model [tokens, position ids, attention mask] + recv_buffer_seq_len (int): An optional sequence length for the pipeline parallel recv buffer. 
Returns: torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size] @@ -234,8 +268,98 @@ def forward_pass_with_pipeline_parallel( logits = output_tensor # Explicitly cast logits to expected dtype - logits = logits.to(self.config.params_dtype) + logits = logits.to(self.inference_wrapper_config.params_dtype) + + return logits + + def forward_pass_with_pipeline_parallel_large_input_batch( + self, inference_input: Dict[str, Any], recv_buffer_seq_len=None + ) -> torch.Tensor: + """Utility to carry out forward pass PP models. + + Runs the forward pass for models which are pipeline parallel. + This is more complex than forward_pass_with_pipeline_parallel_small_input_batch because + this splits the global batch into small micro batches and runs them through the model. + + Args: + inference_input (Dict[str, Any]): A dict containg the inputs for the gpt model [tokens, position ids, attention mask] + recv_buffer_seq_len (int): An optional sequence length for the pipeline parallel recv buffer. + + Returns: + torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size] + """ + tokens = inference_input["tokens"] + position_ids = inference_input["position_ids"] + attention_mask = inference_input["attention_mask"] + materialize_only_last_token_logits = ( + self.inference_context.materialize_only_last_token_logits + ) + + micro_batch_size = max( + 1, + self.inference_wrapper_config.inference_batch_times_seqlen_threshold // tokens.size(1), + ) + batch_size, seq_len = self._get_batch_size_and_seq_len(tokens, recv_buffer_seq_len) + # Round up to account for the last partial micro batch if present + num_micro_batches = math.ceil(batch_size / micro_batch_size) + + logits = None + # Preallocate memory for output logits. 
+ if is_pipeline_last_stage(self.pp_group): + logits_seq_len = 1 if materialize_only_last_token_logits else seq_len + logits = torch.empty( + (batch_size, logits_seq_len, self.inference_wrapper_config.padded_vocab_size), + dtype=self.pipeline_communication_dtype, + device=torch.cuda.current_device(), + ) + + recv_buffer = None + if not is_pipeline_first_stage(self.pp_group): + recv_buffer = self._allocate_recv_buffer(micro_batch_size, seq_len) + for micro_batch_index in range(num_micro_batches): + start = micro_batch_index * micro_batch_size + end = min(start + micro_batch_size, batch_size) + tokens2use = tokens[start:end, ...] + position_ids2use = position_ids[start:end, ...] + current_micro_batch_size = end - start + + # Need to change recv buffer shape for the last partial microbatch (if exists) + if current_micro_batch_size != micro_batch_size: + recv_buffer = self._allocate_recv_buffer(current_micro_batch_size, seq_len) + + if not is_pipeline_first_stage(self.pp_group): + recv_from_prev_pipeline_rank_(recv_buffer, self.pp_group) + + self.model.set_input_tensor(recv_buffer) + + output_tensor = self._forward( + { + "tokens": tokens2use, + "position_ids": position_ids2use, + "attention_mask": attention_mask, + "inference_context": self.inference_context, + } + ) + + if not is_pipeline_last_stage(self.pp_group): + send_to_next_pipeline_rank(output_tensor, self.pp_group) + + self.inference_context.batch_size_offset += current_micro_batch_size + + if is_pipeline_last_stage(self.pp_group): + assert logits is not None + logits[start:end, ...] 
= output_tensor + + # Explicitly cast logits to expected dtype + if is_pipeline_last_stage(self.pp_group): + assert logits is not None + logits = logits.to(self.inference_wrapper_config.params_dtype) + + # Once done with all micro batches, we reset batch size offset and seq len offset + self.inference_context.increment_sequence_len_offset(seq_len) + self.inference_context.reset_batch_size_offset() + # NOTE: Only returns the logits on the last pipeline stage return logits @torch.inference_mode() @@ -244,18 +368,14 @@ def run_one_forward_step( ) -> torch.Tensor: """The forward pass of the model for inference - Appropriate utility is called for the forward pass depending on the type of model - parallelism used + Appropriate utility is called for the forward pass depending on the type of model parallelism used Args: - inference_input (Dict[str, Any]): A dict containing the inputs for the gpt model - [tokens, position ids, attention mask] - recv_buffer_seq_len (int): An optional sequence length for the pipeline parallel - recv buffer. + inference_input (Dict[str, Any]): A dict containing the inputs for the gpt model [tokens, position ids, attention mask] + recv_buffer_seq_len (int): An optional sequence length for the pipeline parallel recv buffer. Returns: - torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size]. - The logits are returned only in the last pipeline stage for PP models. + torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size]. The logits are returned only in the last pipeline stage for PP models. 
""" # Check if we are in a PP model if not (is_pipeline_first_stage(self.pp_group) and is_pipeline_last_stage(self.pp_group)): @@ -263,6 +383,19 @@ def run_one_forward_step( current_batch_size, seq_len = self._get_batch_size_and_seq_len( tokens, recv_buffer_seq_len ) - return self.forward_pass_with_pipeline_parallel(inference_input, recv_buffer_seq_len) + # If input batch is large, we need to split into micro batches and run the forward pass + if ( + current_batch_size * seq_len + > self.inference_wrapper_config.inference_batch_times_seqlen_threshold + and self.inference_wrapper_config.inference_batch_times_seqlen_threshold != -1 + ): + return self.forward_pass_with_pipeline_parallel_large_input_batch( + inference_input, recv_buffer_seq_len + ) + else: + # If input batch is very small we can do a simple forward pass on the entire global batch + return self.forward_pass_with_pipeline_parallel_small_input_batch( + inference_input, recv_buffer_seq_len + ) else: return self.forward_pass_without_pipeline_parallel(inference_input) diff --git a/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py b/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py index 2a1f10daa1e..ba89fbc2f6c 100644 --- a/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +++ b/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py @@ -7,6 +7,9 @@ from megatron.core.inference.model_inference_wrappers.abstract_model_inference_wrapper import ( AbstractModelInferenceWrapper, ) +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) from megatron.core.inference.utils import get_attention_mask from megatron.core.models.gpt import GPTModel from megatron.core.process_groups_config import ProcessGroupCollection @@ -22,6 +25,8 @@ class GPTInferenceWrapper(AbstractModelInferenceWrapper): Args: model (GPTModel): The GPT model (MCore or legacy) + 
inference_wrapper_config (InferenceWrapperConfig): Has info like hidden size, vocab + size, etc. inference_context (BaseInferenceContext): Manages KV cache, and tracks sequence/token/batch offsets. pg_collection (ProcessGroupCollection): Process groups for model communication. @@ -31,13 +36,11 @@ class GPTInferenceWrapper(AbstractModelInferenceWrapper): def __init__( self, model: GPTModel, + inference_wrapper_config: InferenceWrapperConfig, inference_context: Optional[BaseInferenceContext] = None, pg_collection: Optional[ProcessGroupCollection] = None, - inference_wrapper_config: Optional[Any] = None, # Deprecated ): - if inference_wrapper_config is not None: - raise TypeError("Passing `inference_wrapper_config` is deprecated.") - super().__init__(model, inference_context, pg_collection) + super().__init__(model, inference_wrapper_config, inference_context, pg_collection) def prep_inference_input(self, prompts_tokens: torch.Tensor) -> Dict[str, Any]: """Prepares the inference input data. diff --git a/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py b/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py new file mode 100644 index 00000000000..5d89085add2 --- /dev/null +++ b/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py @@ -0,0 +1,66 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+from dataclasses import dataclass +from typing import Optional + +import torch + + +@dataclass +class InferenceWrapperConfig: + """Config for the model inference wrapper + + NOTE : All the arguments here are obtained from arguments.py file + """ + + hidden_size: int + """Receive happens between the layers during PP with size [seq_len, batch_size, hidden_size]""" + + params_dtype: torch.dtype + """Can be torch.float or torch.half if --fp16 is used, or torch.bfloat16 if --bf16 is used""" + + inference_batch_times_seqlen_threshold: int + """if (batch-size * sequence-length) is smaller than this threshold then we will not pipeline + the batch.""" + + padded_vocab_size: int + """The final padded vocab size (Padded to make it divisible by + --make-vocab-size-divisible-by value)""" + + inference_max_requests: int = 8 + """ Maximum number of requests for inference (prefill & decode). Necessary for CUDA graphs. """ + + inference_max_seq_length: int = 2560 + """ Maximum sequence length for inference (prefill & decode). Necessary for CUDA graphs. """ + + fp32_residual_connection: bool = False + """Move residual connections to fp32. Obtained from arguments.py""" + + nccl_all_reduce_for_prefill: bool = False + """When using symmetric all reduce kernels we keep the default all reduces for nccl. + This can be more effecient for large prefill sizes""" + + fp8: Optional[str] = None + """If set, enables the use of FP8 precision through Transformer Engine. There are 2 predefined + choices (1) 'e4m3' uniformly uses e4m3 for all FP8 tensors, (2) 'hybrid' uses e4m3 for all FP8 + activation and weight tensors and e5m2 for all FP8 output activation gradient tensors.""" + + moe_pad_experts_for_cuda_graph_inference: bool = False + """Some MoE routers have a D2H sync that will break cuda graphs. If this flag is set the router + will switch to dropping and padding during decode time which does not have a D2H sync. 
The + capacity factor is set to the max that an expert could see during inference so no tokens are + actually dropped. """ + + def add_attributes(self, attribute_value_pair: dict): + """Utility to add more attributes to inference params + + Use this method to pass in a custom dictionary to add more configs to the instance created. + Use as follows: + c = InferenceWrapperConfig + c.add_attributes({'precision':'fp32'}) + + Args: + attribute_value_pair (dict): A dictionary containing attributes as the key names and + corresponding values. + """ + for key, value in attribute_value_pair.items(): + setattr(self, key, value) diff --git a/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py b/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py index c773ab507a3..2ae1e2ade6f 100644 --- a/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +++ b/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py @@ -11,6 +11,9 @@ from megatron.core.inference.model_inference_wrappers.abstract_model_inference_wrapper import ( AbstractModelInferenceWrapper, ) +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) from megatron.core.models.T5 import T5Model from megatron.core.utils import get_attr_wrapped_model @@ -24,6 +27,7 @@ class T5InferenceWrapper(AbstractModelInferenceWrapper): Args: model (T5Model): The T5 model (MCore or legacy) + inference_wrapper_config (InferenceWrapperConfig): The command line arguments that were passed inference_context (BaseInferenceContext): Manages KV cache, and tracks sequence/token/batch offsets. 
use_local (bool): Whether the T5 model's transformer impl @@ -33,10 +37,11 @@ class T5InferenceWrapper(AbstractModelInferenceWrapper): def __init__( self, model: T5Model, + inference_wrapper_config: InferenceWrapperConfig, inference_context: Optional[BaseInferenceContext] = None, use_local: bool = False, ): - super().__init__(model, inference_context) + super().__init__(model, inference_wrapper_config, inference_context) self.use_local = use_local def prep_inference_input( diff --git a/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py b/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py new file mode 100644 index 00000000000..340cadb48a9 --- /dev/null +++ b/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from megatron.core.inference.text_generation_controllers.text_generation_controller import ( # noqa: F401 # pylint: disable=unused-import + TextGenerationController as SimpleTextGenerationController, +) diff --git a/megatron/core/inference/text_generation_controllers/text_generation_controller.py b/megatron/core/inference/text_generation_controllers/text_generation_controller.py index 617883414d4..a5233983ed0 100644 --- a/megatron/core/inference/text_generation_controllers/text_generation_controller.py +++ b/megatron/core/inference/text_generation_controllers/text_generation_controller.py @@ -11,22 +11,21 @@ import torch import torch.nn.functional as F from torch import Tensor +from torch.distributed import ProcessGroup -from megatron.core import parallel_state from megatron.core.inference.async_stream import AsyncStream from megatron.core.inference.communication_utils import ( broadcast_from_last_pipeline_stage, + is_pipeline_first_stage, is_pipeline_last_stage, ) from megatron.core.inference.contexts.dynamic_context import MaxSequenceLengthOverflowError -from 
megatron.core.inference.contexts.static_context import StaticInferenceContext from megatron.core.inference.inference_request import InferenceRequest, Status from megatron.core.inference.model_inference_wrappers.abstract_model_inference_wrapper import ( AbstractModelInferenceWrapper, ) from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.utils import get_attention_mask, set_decode_expert_padding -from megatron.core.models.multimodal.llava_model import LLaVAModel from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.moe.moe_layer import BaseMoELayer from megatron.core.transformer.utils import set_model_to_sequence_parallel @@ -53,32 +52,28 @@ class TextGenerationController: inference_wrapped_model (AbstractModelInferenceWrapper): A model that is wrapped using the specs given in the abstract_model_inference_wrapper.py tokenizer (_type_): Tokenizer used for tokenizing and detokenizing the prompts + pp_group (ProcessGroup): Process group for pipeline parallelism """ - def __init__(self, inference_wrapped_model: AbstractModelInferenceWrapper, tokenizer): + def __init__( + self, + inference_wrapped_model: AbstractModelInferenceWrapper, + tokenizer, + pp_group: ProcessGroup = None, + ): self.inference_wrapped_model = inference_wrapped_model - self.model_config = self.inference_wrapped_model.model.config - inference_config = self.inference_wrapped_model.inference_context.config self.tokenizer = tokenizer - pg_collection = inference_config.pg_collection - if pg_collection is not None: - self.pp_group = pg_collection.pp - else: - self.pp_group = parallel_state.get_pipeline_model_parallel_group() - - self.model_is_pipeline_parallel = self.model_config.pipeline_model_parallel_size > 1 + self.pp_group = pp_group - # Use padded vocab size because tokenizer vocab size might pad to nearest power of 2. 
- # TODO(ksanthanam): Consider deprecating this check if LLaVAModel is no longer used - unwrapped_model = unwrap_model(self.inference_wrapped_model.model) - if isinstance(unwrapped_model, LLaVAModel): - self.vocab_size = unwrapped_model.language_model.vocab_size - else: - self.vocab_size = unwrapped_model.vocab_size + # For models without pipeline parallelism, is_first_stage and is_last_stage returns True + self.model_is_pipeline_parallel = not ( + is_pipeline_first_stage(self.pp_group) and is_pipeline_last_stage(self.pp_group) + ) + model_config = get_model_config(self.inference_wrapped_model.model) self.sampling_rng = torch.Generator(device=torch.cuda.current_device()) - self.sampling_rng.manual_seed(self.model_config.inference_sampling_seed) + self.sampling_rng.manual_seed(model_config.inference_sampling_seed) if self.inference_wrapped_model.inference_context.is_dynamic_batching(): self._init_dynamic_sampling_tensors() @@ -103,7 +98,9 @@ def _init_dynamic_sampling_tensors(self): self._get_stop_word_finished_ids_callback = None device = torch.cuda.current_device() - logits_dtype = self.inference_wrapped_model.config.params_dtype + logits_dtype = self.inference_wrapped_model.inference_wrapper_config.params_dtype + # Use padded vocab size because tokenizer vocab size might pad to nearest power of 2. + vocab_size = self.inference_wrapped_model.inference_wrapper_config.padded_vocab_size self._sampling_backend = "torch" self._sampled_tokens_cuda = torch.empty(max_requests, dtype=torch.int64, device=device) @@ -508,6 +505,7 @@ def _dynamic_step_context_init( position_ids (Tensor): The active position IDs. 
""" context = self.inference_wrapped_model.inference_context + inference_wrapper_config = self.inference_wrapped_model.inference_wrapper_config active_request_slice = slice(context.paused_request_count, context.total_request_count) # Remove Float16Module wrapper if it exists @@ -519,11 +517,11 @@ def _dynamic_step_context_init( # If using symmetric kernels and we are using using nccl # for prefill turn off symmetric kernels - symmetric_ar_type = self.model_config.symmetric_ar_type - nccl_all_reduce_for_prefill = self.model_config.nccl_all_reduce_for_prefill + symmetric_ar_type = model_config.symmetric_ar_type + nccl_all_reduce_for_prefill = inference_wrapper_config.nccl_all_reduce_for_prefill # Turning on/off MoE padding for cuda-graphs moe_pad_experts_for_cuda_graph_inference = ( - self.model_config.moe_pad_experts_for_cuda_graph_inference + inference_wrapper_config.moe_pad_experts_for_cuda_graph_inference ) if moe_pad_experts_for_cuda_graph_inference: if context.using_cuda_graph_this_step(): @@ -571,6 +569,8 @@ def _dynamic_step_forward_logits(self, input_ids: Tensor, position_ids: Tensor) input_ids (Tensor): The input token IDs. position_ids (Tensor): The position IDs. 
""" + inference_wrapper_config = self.inference_wrapped_model.inference_wrapper_config + context = self.inference_wrapped_model.inference_context active_request_count = context.total_request_count - context.paused_request_count @@ -582,17 +582,18 @@ def _dynamic_step_forward_logits(self, input_ids: Tensor, position_ids: Tensor) if self.model_is_pipeline_parallel: logits_seq_len = ( active_request_count - if context.config.materialize_only_last_token_logits + if context.materialize_only_last_token_logits else input_ids.shape[1] ) - logits_shape = [1, logits_seq_len, self.vocab_size] + vocab_size = inference_wrapper_config.padded_vocab_size + logits_shape = [1, logits_seq_len, vocab_size] if is_pipeline_last_stage(self.pp_group): assert logits is not None and torch.Size(logits_shape) == logits.shape logits = broadcast_from_last_pipeline_stage( logits_shape, - dtype=self.model_config.params_dtype, + dtype=inference_wrapper_config.params_dtype, tensor=logits, pp_group=self.pp_group, ) @@ -638,7 +639,7 @@ def _dynamic_step_sample_logits(self, logits: Tensor): # Last token logits. context = self.inference_wrapped_model.inference_context - if context.config.materialize_only_last_token_logits: + if context.materialize_only_last_token_logits: # When materialize_only_last_token_logits is true, last_token_logits is # already called in the forward pass of GPT. 
last_token_logits = logits.squeeze(0) @@ -683,7 +684,7 @@ def _dynamic_step_calculate_log_probs(self, logits: Tensor) -> Optional[Tensor]: return context.calculate_log_probs( logits, self._sampled_tokens_cuda[:active_request_count], - only_last_token_logits=context.config.materialize_only_last_token_logits, + only_last_token_logits=context.materialize_only_last_token_logits, ) def _dynamic_step_calculate_top_n_logprobs( @@ -711,7 +712,7 @@ def _dynamic_step_calculate_top_n_logprobs( active_request_slice = slice(context.paused_request_count, context.total_request_count) # Handle decode-only mode (only last token) - if context.config.materialize_only_last_token_logits or context.is_decode_only(): + if context.materialize_only_last_token_logits or context.is_decode_only(): # In decode mode or when only last token logits are materialized, # logits already represent only the last tokens log_probs = log_probs_tensor[:active_request_count] @@ -1023,10 +1024,9 @@ def generate_all_output_tokens_static_batch( # Pad batch tokens if necessary batch_size = len(active_requests) max_sequence_length = max_prompt_length_in_batch + sampling_params.num_tokens_to_generate - context = self.inference_wrapped_model.inference_context - assert isinstance(context, StaticInferenceContext) - inference_max_batch_size = context.max_batch_size - inference_max_sequence_length = context.max_sequence_length + inference_wrapper_config = self.inference_wrapped_model.inference_wrapper_config + inference_max_batch_size = inference_wrapper_config.inference_max_requests + inference_max_sequence_length = inference_wrapper_config.inference_max_seq_length padded_batch_size = inference_max_batch_size if enable_cuda_graph else batch_size if padded_batch_size > inference_max_batch_size: raise ValueError( @@ -1066,6 +1066,10 @@ def generate_all_output_tokens_static_batch( batch_size, device=torch.cuda.current_device() ).cuda() + # Use padded vocab size because tokenizer vocab size might not include padding + # 
to nearest power of 2 + vocab_size = inference_wrapper_config.padded_vocab_size + # Check whether early termination is enabled no_early_termination = getattr(sampling_params, "no_early_termination", False) termination_id = -1 if no_early_termination else self.tokenizer.eod @@ -1126,14 +1130,14 @@ def generate_all_output_tokens_static_batch( # If using symmetric kernels and we are using using nccl # for prefill turn off symmetric kernels - symmetric_ar_type = self.model_config.symmetric_ar_type - nccl_all_reduce_for_prefill = self.model_config.nccl_all_reduce_for_prefill + symmetric_ar_type = model_config.symmetric_ar_type + nccl_all_reduce_for_prefill = inference_wrapper_config.nccl_all_reduce_for_prefill if symmetric_ar_type is not None and nccl_all_reduce_for_prefill: unwrapped_model.set_symmetric_ar(None) # Turning off MoE padding for prefill moe_pad_experts_for_cuda_graph_inference = ( - self.model_config.moe_pad_experts_for_cuda_graph_inference + inference_wrapper_config.moe_pad_experts_for_cuda_graph_inference ) if moe_pad_experts_for_cuda_graph_inference: set_decode_expert_padding(unwrapped_model, False) @@ -1187,7 +1191,7 @@ def generate_all_output_tokens_static_batch( or not (sampling_params.return_log_probs or sampling_params.top_n_logprobs > 0) ) inference_context = self.inference_wrapped_model.inference_context - inference_context.config.materialize_only_last_token_logits = ( + inference_context.materialize_only_last_token_logits = ( materialize_only_last_token_logits ) @@ -1208,14 +1212,14 @@ def generate_all_output_tokens_static_batch( if self.model_is_pipeline_parallel: context_length = context_end_position - context_start_position logits_seq_len = 1 if materialize_only_last_token_logits else context_length - logits_shape = [batch_size, logits_seq_len, self.vocab_size] + logits_shape = [batch_size, logits_seq_len, vocab_size] if is_pipeline_last_stage(self.pp_group): assert logits is not None and torch.Size(logits_shape) == logits.shape # 
TODO(ksanthanam): Evaluate whether it makes more sense to sample on 1 rank # and then broadcast the sampled tokens rather than broadcasting the raw logits. logits = broadcast_from_last_pipeline_stage( - [batch_size, logits_seq_len, self.vocab_size], - dtype=self.model_config.params_dtype, + [batch_size, logits_seq_len, vocab_size], + dtype=inference_wrapper_config.params_dtype, tensor=logits, pp_group=self.pp_group, ) @@ -1244,7 +1248,7 @@ def generate_all_output_tokens_static_batch( sampled_logits = self.sample_from_logits( last_token_logits, sampling_params, - self.vocab_size, + vocab_size, generation_started=generation_started, top_n_logprobs_dict=top_n_logprobs_dict, logits=logits_for_top_n_prompt_logprobs, diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index f44aed613e7..e287344c13d 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -661,7 +661,7 @@ def _postprocess( ) sequence_parallel_override = False - if in_inference_mode and inference_context.config.materialize_only_last_token_logits: + if in_inference_mode and inference_context.materialize_only_last_token_logits: if inference_context.is_static_batching(): hidden_states = hidden_states[-1:, :, :] else: @@ -691,7 +691,7 @@ def _postprocess( assert ( in_inference_mode and inference_context.is_dynamic_batching() - and inference_context.config.materialize_only_last_token_logits + and inference_context.materialize_only_last_token_logits ) self.output_layer.sequence_parallel = True diff --git a/megatron/core/models/mamba/mamba_model.py b/megatron/core/models/mamba/mamba_model.py index 6d43f5583df..8d45e1d0147 100644 --- a/megatron/core/models/mamba/mamba_model.py +++ b/megatron/core/models/mamba/mamba_model.py @@ -267,7 +267,7 @@ def forward( output_weight = self.shared_embedding_or_output_weight() sequence_parallel_override = False - if in_inference_mode and inference_context.config.materialize_only_last_token_logits: + 
if in_inference_mode and inference_context.materialize_only_last_token_logits: if inference_context.is_static_batching(): hidden_states = hidden_states[-1:, :, :] else: @@ -297,7 +297,7 @@ def forward( assert ( in_inference_mode and inference_context.is_dynamic_batching() - and inference_context.config.materialize_only_last_token_logits + and inference_context.materialize_only_last_token_logits ) self.output_layer.sequence_parallel = True diff --git a/megatron/core/ssm/mamba_layer.py b/megatron/core/ssm/mamba_layer.py index 0b4ef42457d..ac6e8b5bf40 100644 --- a/megatron/core/ssm/mamba_layer.py +++ b/megatron/core/ssm/mamba_layer.py @@ -193,9 +193,6 @@ def _should_call_local_cudagraph(self, *args, **kwargs): and kwargs.get('attention_mask') is None and kwargs.get('inference_context') is not None ): - context = kwargs['inference_context'] - using_cuda_graph = (context.is_static_batching() and context.is_decode_only()) or ( - not context.is_static_batching() and context.using_cuda_graph_this_step() - ) + using_cuda_graph = kwargs['inference_context'].using_cuda_graph_this_step() return using_cuda_graph return False diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index 48b04c35134..eaae585905e 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -696,12 +696,6 @@ class TransformerConfig(ModelParallelConfig): the expert capacity length, effective only after the moe_expert_capacity_factor is set. The default setting is False.""" - moe_pad_experts_for_cuda_graph_inference: bool = False - """moe_pad_experts_for_cuda_graph_inference (bool): If True, the router will switch to dropping - and padding during decode time which does not have a D2H sync. The capacity factor is set to the - max that an expert could see during inference so no tokens are actually dropped. 
The default - setting is False.""" - moe_token_drop_policy: Literal['probs', 'position'] = "probs" """The policy to drop tokens. Can be either "probs" or "position". If "probs", the tokens with the lowest probabilities will be dropped. If "position", tokens at the end of each batch will @@ -836,9 +830,6 @@ class TransformerConfig(ModelParallelConfig): which is no use of symmetric memory. """ - nccl_all_reduce_for_prefill: bool = False - """If True, use NCCL all-reduce kernels when symmetric all-reduce is enabled.""" - use_inference_optimized_layers: bool = False """If True, use inference optimized transformer layers during inference.""" diff --git a/megatron/core/utils.py b/megatron/core/utils.py index cb2f7d34128..d7b702f25ec 100644 --- a/megatron/core/utils.py +++ b/megatron/core/utils.py @@ -496,6 +496,17 @@ def divide(numerator, denominator): return numerator // denominator +def deprecate_inference_params(inference_context, inference_params): + """Print warning for deprecated `inference_params`.""" + if inference_context is None and inference_params is not None: + warnings.warn( + "`inference_params` renamed to `inference_context`, and will be " + "removed in `megatron-core` 0.13." + ) + return inference_params + return inference_context + + def get_tensor_model_parallel_group_if_none(tp_group, is_expert=False, check_initialized=True): """Issue a deprecation warning if tp_group is None and return the default tp group.""" # TODO(zijiey): remove this function later. 
@@ -2394,6 +2405,25 @@ async def wrapper(*args, **kwargs): return _decorate if func is None else _decorate(func) +def get_mamba_inference_state_config_from_model(model) -> Optional["MambaInferenceStateConfig"]: + """Returns Mamba inference state config from the model if it is a hybrid model.""" + from megatron.core.inference.contexts.attention_context.mamba_metadata import ( + MambaInferenceStateConfig, + ) + from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols + + decoder = get_attr_wrapped_model(model, "decoder") + layer_type_list = getattr(decoder, "layer_type_list", None) + if layer_type_list is not None and Symbols.MAMBA in layer_type_list: + (mamba_conv_states_shape, mamba_ssm_states_shape) = decoder.mamba_state_shapes_per_request() + return MambaInferenceStateConfig( + layer_type_list=layer_type_list, + mamba_conv_states_shape=mamba_conv_states_shape, + mamba_ssm_states_shape=mamba_ssm_states_shape, + ) + return None + + # ============================================================================ # Backward Compatibility Decorators # ============================================================================ @@ -2528,43 +2558,3 @@ class ExperimentalModel: """ func._experimental_api = True return func - - -def deprecate_args( - *deprecated_keys, message="Argument '{name}' has been deprecated and should not be used." -): - """ - Intercepts specific keyword arguments to raise a custom TypeError. - - Args: - *deprecated_keys: Strings representing the argument names to block. - message: Custom error message string. Use {name} as a placeholder. 
- """ - - def decorator(func): - @functools.wraps(func) - def wrapper(*args, **kwargs): - # Check if any deprecated key is present in kwargs - found_deprecated = set(deprecated_keys) & set(kwargs.keys()) - - if found_deprecated: - bad_key = list(found_deprecated)[0] - raise TypeError(message.format(name=bad_key)) - - # Send args to the real function - return func(*args, **kwargs) - - return wrapper - - return decorator - - -def deprecate_inference_params(inference_context, inference_params): - """Print warning for deprecated `inference_params`.""" - if inference_context is None and inference_params is not None: - warnings.warn( - "`inference_params` renamed to `inference_context`, and will be " - "removed in `megatron-core` 0.13." - ) - return inference_params - return inference_context diff --git a/megatron/inference/__init__.py b/megatron/inference/__init__.py deleted file mode 100644 index 26496bfed70..00000000000 --- a/megatron/inference/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/inference/utils.py b/megatron/inference/utils.py deleted file mode 100644 index 145af726c4f..00000000000 --- a/megatron/inference/utils.py +++ /dev/null @@ -1,320 +0,0 @@ -# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
- -import logging -from argparse import ArgumentParser -from functools import partial -from typing import Optional - -from gpt_builders import gpt_builder -from mamba_builders import mamba_builder -from megatron.core.inference.config import InferenceConfig, MambaInferenceStateConfig -from megatron.core.inference.contexts import DynamicInferenceContext -from megatron.core.inference.engines import DynamicInferenceEngine -from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( - GPTInferenceWrapper, -) -from megatron.core.inference.text_generation_controllers.text_generation_controller import ( - TextGenerationController, -) -from megatron.core.tokenizers.text.utils.build_tokenizer import build_tokenizer -from megatron.core.transformer.module import MegatronModule -from megatron.core.utils import get_attr_wrapped_model, log_single_rank -from megatron.training import get_args -from megatron.training import get_model as _get_model -from megatron.training import get_tokenizer, get_wandb_writer -from megatron.training.checkpointing import load_checkpoint -from model_provider import model_provider - -logger = logging.getLogger(__name__) - - -def get_model_for_inference() -> MegatronModule: - """Initialize model and load checkpoint for inference.""" - - args = get_args() - - if args.model_provider == "gpt": - model_builder = gpt_builder - elif args.model_provider == "mamba": - model_builder = mamba_builder - else: - raise ValueError(f"Invalid model provider {args.model_provider}") - - # Build model. - model = _get_model(partial(model_provider, model_builder), wrap_with_ddp=False) - - # Load checkpoint. - assert args.load is not None - args.exit_on_missing_checkpoint = True - load_checkpoint( - ddp_model=model, - optimizer=None, - opt_param_scheduler=None, - strict=not args.inference_ckpt_non_strict, - ) - - # No virtual PP. - assert len(model) == 1, "Above condition should have caught this" - model = model[0] - - # Eval mode. 
- model.eval() - - return model - - -def add_inference_args(parser: ArgumentParser) -> ArgumentParser: - """Add inference command line arguments to the parser.""" - - group = parser.add_argument_group(title='Inference') - - group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.') - group.add_argument("--top_k", type=int, default=1, help='Top k sampling.') - group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.') - group.add_argument( - "--return-log-probs", - action='store_true', - default=False, - help='Return the log probabilities of the final output tokens', - ) - group.add_argument( - "--prompts", - metavar='N', - type=str, - nargs='+', - help='Input prompts with each prompt within quotes and seperated by space', - ) - group.add_argument( - "--num-tokens-to-prompt", - type=int, - nargs="+", - default=[64, 1024], - help='Number of tokens to use for simulated prompts. This should be a ' - 'space-separated pair of integers, and the generated prompt lengths will ' - 'be uniformly sampled within this range.', - ) - group.add_argument( - "--num-tokens-to-generate", - type=int, - default=30, - help='Number of tokens to generate for each prompt', - ) - group.add_argument( - "--num-tokens-from-file", - action='store_true', - default=False, - help='Use per-prompt num_tokens_to_generate from prompt file', - ) - group.add_argument( - "--top-n-logprobs", - type=int, - default=0, - help=( - "Return the top n logprobs for the generated tokens and their " - "corresponding token as a dictionary" - ), - ) - group.add_argument( - "--incoming-requests-per-step", - type=int, - default=None, - help="Add a deterministic number of requests per step. This arg is " - "prioritized over `--incoming-requests-per-sec` below (which is non-" - "deterministic). 
Note that the number of requests added per step is " - "additionally limited by the inference context's `max_requests`, " - "`max_tokens`, and KV buffer size.", - ) - group.add_argument( - "--incoming-requests-per-sec", - type=float, - default=100.0, - help="Simulated number of requests per second. Set to -1 to add all requests together.", - ) - group.add_argument( - "--incoming-requests-duration", - type=float, - default=10.0, - help="Total amount of time to simulate that requests are " - "arriving. Multiply this value with " - "`--incoming-requests-per-sec` to get the approximate " - "total number of requests. Set to -1 to add all requests together.", - ) - group.add_argument( - "--model-provider", choices=["mamba", "gpt"], default="gpt", help="Model provider" - ) - group.add_argument( - "--skip-prompt-log-probs", action='store_true', default=False, help='Skip prompt log probs.' - ) - group.add_argument( - "--stop-words", - metavar='WORD', - type=str, - nargs='+', - default=None, - help='Stop words to terminate generation. Each word should be quoted and ' - 'separated by space. Example: --stop-words "\\n\\n" "END" "###"', - ) - group.add_argument( - "--output-path", type=str, default=None, help="Path to save generations as JSON" - ) - group.add_argument( - "--output-every-n-results", - type=int, - default=1, - help="To minimize the output file size of larger runs, only write the " - "results of every `n` requests.", - ) - group.add_argument( - "--prompt-file", - help='Jsonl file containing input prompts, where each item (i.e., line) ' - 'contains the field \'text\' where the value is the prompt. All other ' - 'fields within each item are ignored, and may be customized for each ' - 'application.', - ) - group.add_argument( - "--prompt-file-num-truncate", - type=int, - help='Number of samples to use from the loaded prompt file (see ' - '`--prompt-file` above). 
The first `--prompt-file-num-truncate` samples ' - 'will be used, in order.', - ) - group.add_argument( - "--use-flashinfer-fused-rope", - action='store_true', - default=False, - help='Use flashinfer fused rope implementation.', - ) - group.add_argument( - "--no-record-throughput", - action='store_false', - dest="record_throughput", - help="Disable throughput recording in --output-file", - ) - group.add_argument( - "--inference-ckpt-non-strict", - action="store_true", - help="Load checkpoint with `strict=False`.", - ) - group.add_argument( - "--termination-id", - type=int, - default=None, - help="Termination ID that overrides `tokenizer.eod`.", - ) - group.add_argument( - "--suspend-resume-interval", - type=int, - default=None, - help="Suspend and resume the dynamic engine every " - "`suspend_resume_interval` steps. This is used to tet the suspend/resume " - "system.", - ) - group.add_argument( - "--inference-repeat-n", - type=int, - default=1, - help="Repeat inference iterations N times for benchmarking.", - ) - group.add_argument( - "--throughput-check-only", - action='store_true', - default=False, - help="If true, only run throughput check without verifying outputs.", - ) - - return parser - - -def get_inference_config_from_model_and_args(model: MegatronModule, args): - """Returns a `InferenceConfig` constructed from the model and command line arguments.""" - - # Max sequence length. - position_embedding_type = get_attr_wrapped_model(model, "position_embedding_type") - model_max_seq_len = get_attr_wrapped_model(model, "max_sequence_length") - inf_max_seq_len = args.inference_max_seq_length - max_batch_size = args.inference_dynamic_batching_max_requests - - if position_embedding_type == "learned_absolute": - # When using absolute position embeddings, it is critical that the - # context's `max_sequence_length` is less than or equal to the model's - # `max_sequence_length`. 
Otherwise, the context's `position_ids` will - # contain ids greater than the dimension of the position embedding - # tensor, which will result in an index error. - if inf_max_seq_len: - max_sequence_length = min(model_max_seq_len, inf_max_seq_len) - else: - max_sequence_length = model_max_seq_len - assert max_batch_size is None or max_batch_size <= model_max_seq_len - else: - max_sequence_length = inf_max_seq_len - if args.inference_dynamic_batching_max_requests is not None: - max_sequence_length = max(max_sequence_length, max_batch_size) - - mamba_inference_state_config = MambaInferenceStateConfig.from_model(model) - pg_collection = get_attr_wrapped_model(model, "pg_collection") - - # Get inference logging configuration from args - log_inference_wandb = args.inference_wandb_logging - inference_logging_step_interval = args.inference_logging_step_interval - - # Get metrics writer if logging is enabled and on the logging rank - # Use the same rank convention as training (last rank logs) - metrics_writer = None - if ( - inference_logging_step_interval > 0 - and log_inference_wandb - and args.rank == (args.world_size - 1) - ): - metrics_writer = get_wandb_writer() - if metrics_writer is None: - log_single_rank( - logger, - logging.WARNING, - "WARNING: --rl-inference-logging-step-interval is set but no metrics writer " - "wandb module is available. 
Inference logging will be disabled.", - ) - - return InferenceConfig( - block_size_tokens=args.inference_dynamic_batching_block_size, - buffer_size_gb=args.inference_dynamic_batching_buffer_size_gb, - paused_buffer_size_gb=args.inference_dynamic_batching_paused_buffer_size_gb, - num_cuda_graphs=( - args.inference_dynamic_batching_num_cuda_graphs - if args.cuda_graph_impl == "local" - else None - ), - max_requests=args.inference_dynamic_batching_max_requests, - max_tokens=args.inference_dynamic_batching_max_tokens, - unified_memory_level=args.inference_dynamic_batching_unified_memory_level, - offload_kv_cache=args.rl_offload_kv_cache_during_training, - cuda_graph_mixed_prefill_count=args.inference_dynamic_batching_cuda_graph_mixed_prefill_count, # pylint: disable=line-too-long - use_cuda_graphs_for_non_decode_steps=not args.decode_only_cuda_graphs, - persist_cuda_graphs=args.rl_training_cuda_graphs, - max_sequence_length=max_sequence_length, - mamba_inference_state_config=mamba_inference_state_config, - pg_collection=pg_collection, - use_flashinfer_fused_rope=args.use_flashinfer_fused_rope, - materialize_only_last_token_logits=not args.return_log_probs, - track_paused_request_events=args.inference_dynamic_batching_track_paused_request_events, - enable_chunked_prefill=args.enable_chunked_prefill, - metrics_writer=metrics_writer, - logging_step_interval=args.inference_logging_step_interval, - ) - - -def get_dynamic_inference_engine(model: Optional[MegatronModule] = None) -> DynamicInferenceEngine: - """Builds a `DynamicInferenceEngine`.""" - args = get_args() - if model is None: - model = get_model_for_inference() - if args.legacy_tokenizer: - tokenizer = get_tokenizer() - else: - tokenizer = build_tokenizer(args) - - inference_config = get_inference_config_from_model_and_args(model, args) - context = DynamicInferenceContext(model.config, inference_config) - inference_wrapped_model = GPTInferenceWrapper(model, context) - controller = 
TextGenerationController(inference_wrapped_model, tokenizer) - engine = DynamicInferenceEngine(controller, context) - return engine diff --git a/megatron/rl/inference/megatron.py b/megatron/rl/inference/megatron.py index 602ff4f7450..4e9364b3ae9 100644 --- a/megatron/rl/inference/megatron.py +++ b/megatron/rl/inference/megatron.py @@ -7,6 +7,7 @@ import torch.distributed as dist from pydantic import PrivateAttr +from megatron.core import parallel_state from megatron.core.inference.contexts.dynamic_context import DynamicInferenceContext from megatron.core.inference.engines.abstract_engine import AbstractEngine from megatron.core.inference.engines.dynamic_engine import DynamicInferenceEngine @@ -15,13 +16,23 @@ from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, ) +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) from megatron.core.inference.sampling_params import SamplingParams -from megatron.core.inference.text_generation_controllers.text_generation_controller import ( - TextGenerationController, +from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import ( + SimpleTextGenerationController, ) from megatron.core.models.gpt.gpt_model import GPTModel +from megatron.core.pipeline_parallel.utils import is_pp_first_stage, is_pp_last_stage +from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols from megatron.core.transformer.module import MegatronModule -from megatron.core.utils import get_attr_wrapped_model, log_single_rank +from megatron.core.utils import ( + get_attr_wrapped_model, + get_mamba_inference_state_config_from_model, + get_pg_size, + log_single_rank, +) from megatron.training import get_wandb_writer from megatron.training.global_vars import get_args, get_tokenizer @@ -55,20 +66,134 @@ def get_static_inference_engine(args: Namespace, model: MegatronModule) -> Abstr """ tokenizer = 
get_tokenizer() - inference_wrapped_model = GPTInferenceWrapper(model) + inference_wrapper_config = InferenceWrapperConfig( + hidden_size=args.hidden_size, + inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold, + fp32_residual_connection=args.fp32_residual_connection, + params_dtype=args.params_dtype, + padded_vocab_size=args.padded_vocab_size, + inference_max_seq_length=args.inference_max_seq_length, + inference_max_requests=( + args.inference_max_batch_size if args.inference_max_batch_size is not None else 1 + ), + nccl_all_reduce_for_prefill=args.nccl_all_reduce_for_prefill, + ) + + inference_wrapped_model = GPTInferenceWrapper(model, inference_wrapper_config) pg_collection = get_attr_wrapped_model(model, "pg_collection") pp_group = pg_collection.pp - text_generation_controller = TextGenerationController( - inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer, pp_group=pp_group + text_generation_controller = SimpleTextGenerationController( + inference_wrapped_model=inference_wrapped_model, + tokenizer=tokenizer, + pp_group=pp_group, ) return MCoreEngine( text_generation_controller=text_generation_controller, max_batch_size=( - args.inference_max_requests if args.inference_max_requests is not None else 1 + args.inference_max_batch_size if args.inference_max_batch_size is not None else 1 ), ) +## This code is copied from tools/run_text_generation_server.py +def get_dynamic_inference_engine( + args: Namespace, + model: MegatronModule, + inference_logging_step_interval: int = 0, + metrics_writer = None +) -> AbstractEngine: + """Get the relevant backend for running inference. + + This function will automatically choose the TRTLLMBackend when possible, + and default to Mcore backend if the user does not specify any backends. + TRTLLMBackend is not implmented yet. + + Args: + args (Namespace): The user arguments parsed from command line + model (MegatronModule): The megatron model. 
+ inference_logging_step_interval (int): Step interval for logging inference metrics. + metrics_writer: Metrics writer (wandb module) for logging. + + Returns: + AbstractBackend: The chosen backend + """ + tokenizer = get_tokenizer() + + enable_cuda_graph = args.cuda_graph_impl == "local" + + mamba_inference_state_config = get_mamba_inference_state_config_from_model(model) + + # DynamicInferenceContext must use the inference model's TP / PP size, not the + # training TP / PP size from global args. The inference model may have a custom + # ProcessGroupCollection with a different TP / PP size. + pg_collection = get_attr_wrapped_model(model, "pg_collection") + tp_group = getattr(pg_collection, 'tp', None) if pg_collection is not None else None + if tp_group is not None: + inference_tp_size = get_pg_size(tp_group) + else: + inference_tp_size = args.tensor_model_parallel_size + pp_group = getattr(pg_collection, 'pp', None) if pg_collection is not None else None + if pp_group is not None: + inference_pp_size = get_pg_size(pp_group) + else: + inference_pp_size = args.pipeline_model_parallel_size + + # Inference context. + inference_context = DynamicInferenceContext( + params_dtype=args.params_dtype, + num_layers=args.num_layers // inference_pp_size, + kv_channels=args.kv_channels, + num_attention_heads=( + args.num_query_groups if args.group_query_attention else args.num_attention_heads + ), + max_sequence_length=args.inference_max_seq_length, + num_cuda_graphs=( + args.inference_dynamic_batching_num_cuda_graphs if enable_cuda_graph else None + ), + block_size_tokens=args.inference_dynamic_batching_block_size, + buffer_size_gb=args.inference_dynamic_batching_buffer_size_gb, + max_requests=args.inference_dynamic_batching_max_requests, + max_tokens=args.inference_dynamic_batching_max_tokens, + pg_collection=pg_collection, # TP/PP sizes are derived from the model's pg_collection. 
+ materialize_only_last_token_logits=True, + mamba_inference_state_config=mamba_inference_state_config, + cache_mla_latent=args.multi_latent_attention and args.cache_mla_latents, + kv_lora_rank=args.kv_lora_rank if args.multi_latent_attention else None, + qk_pos_emb_head_dim=args.qk_pos_emb_head_dim, + use_cuda_graphs_for_non_decode_steps=not args.decode_only_cuda_graphs, + use_flashinfer_fused_rope=None, + unified_memory_level=args.inference_dynamic_batching_unified_memory_level, + cuda_graph_max_tokens=args.inference_dynamic_batching_cuda_graph_max_tokens, + cuda_graph_mixed_prefill_count=args.inference_dynamic_batching_cuda_graph_mixed_prefill_count, + metrics_writer=metrics_writer, + persist_cuda_graphs=args.rl_training_cuda_graphs, + offload_kv_cache=args.rl_offload_kv_cache_during_training + ) + + inference_wrapped_model = GPTInferenceWrapper(model, args, inference_context, pg_collection=pg_collection) + + inference_wrapped_model.model_is_pipeline_parallel = not ( + is_pp_first_stage(pg_collection.pp) and is_pp_last_stage(pg_collection.pp) + ) + + pp_group = getattr(pg_collection, "pp", None) + text_generation_controller = SimpleTextGenerationController( + inference_wrapped_model=inference_wrapped_model, + tokenizer=tokenizer, + pp_group=pp_group, + ) + + return DynamicInferenceEngine( + controller=text_generation_controller, + context=inference_context, + random_seed=args.seed, + track_paused_request_events=args.inference_dynamic_batching_track_paused_request_events, + enable_chunked_prefill=not args.disable_chunked_prefill, + inference_logging_step_interval=inference_logging_step_interval, + pg_collection=pg_collection, + ) + + class MegatronLocal(InferenceServer, ReturnsTokens, ReturnsRaw): """Interface to use MCoreEngine directly as an inference engine.""" @@ -121,9 +246,6 @@ async def base_generate(self, request: InferenceRequest): @classmethod async def launch(cls, model: GPTModel, **kwargs): - # Import here to avoid circular imports - from 
megatron.inference.utils import get_dynamic_inference_engine - args = get_args() tokenizer = get_tokenizer() @@ -134,7 +256,30 @@ async def launch(cls, model: GPTModel, **kwargs): "WARNING: Tokenizer has no BOS token so prompt will not have BOS token", ) - inference_engine: DynamicInferenceEngine = get_dynamic_inference_engine(model=model) + # Get inference logging configuration from args + log_inference_wandb = args.inference_wandb_logging + inference_logging_step_interval = args.inference_logging_step_interval + + # Get metrics writer if logging is enabled and on the logging rank + # Use the same rank convention as training (last rank logs) + metrics_writer = None + if ( + inference_logging_step_interval > 0 + and log_inference_wandb + and args.rank == (args.world_size - 1) + ): + metrics_writer = get_wandb_writer() + if metrics_writer is None: + log_single_rank( + logger, + logging.WARNING, + "WARNING: --rl-inference-logging-step-interval is set but no metrics writer " + "wandb module is available. 
Inference logging will be disabled.", + ) + + inference_engine: DynamicInferenceEngine = get_dynamic_inference_engine( + args, model, inference_logging_step_interval, metrics_writer + ) dp_addr = await inference_engine.start_listening_to_data_parallel_coordinator( inference_coordinator_port=41521, launch_inference_coordinator=True, ) diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index f56bc6c5e2f..46f3c28b1da 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -1446,10 +1446,13 @@ def _add_inference_args(parser): dest='use_legacy_static_engine') group.add_argument('--inference-max-requests', type=int, default=8, help='Maximum number of requests for inference.', - dest='inference_max_requests') + dest='inference_max_batch_size') group.add_argument('--inference-max-seq-length', type=int, default=2560, help='Maximum sequence length expected for inference (prefill + decode).', dest='inference_max_seq_length') + group.add_argument('--inference-max-batch-size', type=int, default=None, + help='Maximum batch size for inference.', + dest='inference_max_batch_size') group.add_argument('--inference-dynamic-batching', action='store_true', default=False, help='Enable dynamic batching mode.') @@ -1505,10 +1508,15 @@ def _add_inference_args(parser): '1) allocate `memory_buffer` in unified memory. ' 'Eventually, additional levels will be included to ' 'control other tensors within the context.') - # TODO(ksanthanam): Clean this up in future PR - group.add_argument('--enable-chunked-prefill', dest='enable_chunked_prefill', + group.add_argument('--nccl-all-reduce-for-prefill', action='store_true', default=False, + help='When using symmeric all reduce kernels this will use regular nccl kernels for prefill. 
This can be more effecient when prefill is large as the nccl kernels can be more bandwith optimized') + # TODO(ksanthanam): Clean this up in future PR + group.add_argument('--enable-chunked-prefill', dest='disable_chunked_prefill', + action='store_false', default=True, help="Enable chunked prefill (disabled by default)") + group.add_argument('--disable-chunked-prefill', dest='disable_chunked_prefill', + action='store_true', help=argparse.SUPPRESS) group.add_argument('--inference-dynamic-batching-cuda-graph-max-tokens', type=int, default=16384, help='Maximum number of tokens to capture in a cuda graph.') @@ -2706,6 +2714,10 @@ def _add_moe_args(parser): group.add_argument('--moe-upcycling-granularity', type=int, default=1, help='This param sepecifics how many times smaller is the expert hidden size compared with the original dense FFN hidden size. ' 'For using granular upcycling strategy, please set this param as a positive integer. If this param is set to 1, it means using the default upcycling strategy.') + group.add_argument('--moe-pad-experts-for-cuda-graph-inference', action='store_true', + help="some MoE routers have a D2H sync that will break cuda graphs. If this flag is set the router will switch" \ + " to dropping and padding during decode time which does not have a D2H sync. 
The capacity factor is set to the" \ + " max that an expert could see during inference so no tokens are actually dropped.") return parser def _add_mla_args(parser): diff --git a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_logitsmatch/model_config.yaml index 1c78b466b1e..be00e4b3ce7 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_logitsmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_logitsmatch/model_config.yaml @@ -44,7 +44,6 @@ MODEL_ARGS: --flash-decode: true --dist-ckpt-strictness: log_unexpected --output-path: ${INFERENCE_OUTPUT_PATH} - --use-legacy-static-engine: true --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies." --incoming-requests-per-sec: -1 # all requests arrive up front. METRICS: diff --git a/tests/unit_tests/inference/contexts/test_dynamic_context.py b/tests/unit_tests/inference/contexts/test_dynamic_context.py index f3ef0910f58..05e0306bfd8 100644 --- a/tests/unit_tests/inference/contexts/test_dynamic_context.py +++ b/tests/unit_tests/inference/contexts/test_dynamic_context.py @@ -1,13 +1,14 @@ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
-import contextlib import math import pytest import torch from megatron.core import parallel_state -from megatron.core.inference.config import InferenceConfig, MambaInferenceStateConfig +from megatron.core.inference.contexts.attention_context.mamba_metadata import ( + MambaInferenceStateConfig, +) from megatron.core.inference.contexts.dynamic_context import ( DynamicInferenceContext, RequestOverflowError, @@ -17,21 +18,14 @@ from megatron.core.inference.sampling_params import SamplingParams from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed -from megatron.core.transformer.transformer_config import TransformerConfig from tests.unit_tests.test_utilities import Utils -@contextlib.contextmanager -def rounder_override(n): - original_token_rounder = DynamicInferenceContext.TOKEN_ROUNDER - original_request_rounder = DynamicInferenceContext.REQUEST_ROUNDER - try: - DynamicInferenceContext.TOKEN_ROUNDER = n - DynamicInferenceContext.REQUEST_ROUNDER = n - yield - finally: - DynamicInferenceContext.TOKEN_ROUNDER = original_token_rounder - DynamicInferenceContext.REQUEST_ROUNDER = original_request_rounder +def set_rounder(value): + """Utility function to set the DynamicInferenceContext rounder.""" + DynamicInferenceContext.ROUNDER = value # For backwards compatibility + DynamicInferenceContext.TOKEN_ROUNDER = value + DynamicInferenceContext.REQUEST_ROUNDER = value class TestDynamicContext: @@ -58,8 +52,11 @@ def _get_dynamic_context( max_tokens, is_hybrid_model=False, layer_type_list=None, + rounder=64, paused_buffer_size_gb=None, ): + set_rounder(rounder) + if is_hybrid_model: if layer_type_list is None: layer_type_list = [Symbols.MAMBA, Symbols.MLP, Symbols.ATTENTION, Symbols.MLP] @@ -72,27 +69,23 @@ def _get_dynamic_context( mamba_inference_state_config = None dynamic_context = DynamicInferenceContext( - model_config=TransformerConfig( - params_dtype=params_dtype, - 
num_layers=num_layers, - kv_channels=kv_channels, - num_attention_heads=num_attention_heads, - ), - inference_config=InferenceConfig( - max_sequence_length=max_sequence_length, - num_cuda_graphs=None, - use_cuda_graphs_for_non_decode_steps=True, - buffer_size_gb=buffer_size_gb, - paused_buffer_size_gb=( - 0.2 * buffer_size_gb if paused_buffer_size_gb is None else paused_buffer_size_gb - ), - block_size_tokens=block_size_tokens, - max_tokens=max_tokens, - mamba_inference_state_config=mamba_inference_state_config, - use_flashinfer_fused_rope=None, # default to using flash-infer if available - # this is for compatibility with the LTS environment - unified_memory_level=0, # unit tests currently broken with UVM + params_dtype=params_dtype, + num_layers=num_layers // self.pp_size, + kv_channels=kv_channels, + num_attention_heads=num_attention_heads, + max_sequence_length=max_sequence_length, + num_cuda_graphs=None, + use_cuda_graphs_for_non_decode_steps=True, + buffer_size_gb=buffer_size_gb, + paused_buffer_size_gb=( + 0.2 * buffer_size_gb if paused_buffer_size_gb is None else paused_buffer_size_gb ), + block_size_tokens=block_size_tokens, + max_tokens=max_tokens, + mamba_inference_state_config=mamba_inference_state_config, + use_flashinfer_fused_rope=None, # default to using flash-infer if available + # this is for compatibility with the LTS environment + unified_memory_level=0, # unit tests currently broken with UVM ) return dynamic_context @@ -100,7 +93,6 @@ def teardown_method(self, method): Utils.destroy_model_parallel() @pytest.mark.internal - @rounder_override(64) @pytest.mark.parametrize("is_hybrid_model", [False, True]) def test_initialize_dynamic_context(self, is_hybrid_model: bool): self._setup_model_parallel_group(1, 1) @@ -115,6 +107,7 @@ def test_initialize_dynamic_context(self, is_hybrid_model: bool): block_size_tokens=128, max_tokens=None, is_hybrid_model=is_hybrid_model, + rounder=64, ) if not is_hybrid_model: @@ -152,7 +145,6 @@ def 
test_is_static_batching(self): assert not dynamic_context.is_static_batching() @pytest.mark.internal - @rounder_override(64) @pytest.mark.parametrize("is_hybrid_model", [False, True]) def test_is_memory_available(self, is_hybrid_model): self._setup_model_parallel_group(1, 1) @@ -176,7 +168,6 @@ def test_is_memory_available(self, is_hybrid_model): assert not dynamic_context.block_allocator.is_memory_available(1) @pytest.mark.internal - @rounder_override(1) @pytest.mark.parametrize("is_hybrid_model", [False, True]) def test_request_overflow(self, is_hybrid_model: bool): self._setup_model_parallel_group(1, 1) @@ -190,6 +181,7 @@ def test_request_overflow(self, is_hybrid_model: bool): buffer_size_gb=0.01, block_size_tokens=32, max_tokens=None, + rounder=1, is_hybrid_model=is_hybrid_model, ) dynamic_context.max_requests //= 2 @@ -206,7 +198,6 @@ def test_request_overflow(self, is_hybrid_model: bool): ) # Adding more than allowed requests @pytest.mark.internal - @rounder_override(1) @pytest.mark.parametrize("is_hybrid_model", [False, True]) def test_token_overflow_error(self, is_hybrid_model: bool): self._setup_model_parallel_group(1, 1) @@ -220,6 +211,7 @@ def test_token_overflow_error(self, is_hybrid_model: bool): buffer_size_gb=0.1, block_size_tokens=128, max_tokens=200, # setting low, but >= context.max_requests. 
+ rounder=1, is_hybrid_model=is_hybrid_model, ) @@ -235,7 +227,6 @@ def test_token_overflow_error(self, is_hybrid_model: bool): ) # Exceeding max token count @pytest.mark.internal - @rounder_override(64) @pytest.mark.parametrize("is_hybrid_model", [False, True]) def test_reset(self, is_hybrid_model: bool): self._setup_model_parallel_group(1, 1) @@ -310,7 +301,6 @@ def test_reset(self, is_hybrid_model: bool): assert torch.all(dynamic_context.mamba_metadata.request_to_mamba_state_idx == -1) @pytest.mark.internal - @rounder_override(64) @pytest.mark.parametrize("is_hybrid_model", [False, True]) def test_allocate_and_release_memory_blocks(self, is_hybrid_model): self._setup_model_parallel_group(1, 1) @@ -359,7 +349,6 @@ def test_allocate_and_release_memory_blocks(self, is_hybrid_model): ) @pytest.mark.internal - @rounder_override(64) @pytest.mark.parametrize("is_hybrid_model", [False, True]) def test_add_request(self, is_hybrid_model: bool): self._setup_model_parallel_group(1, 1) @@ -433,7 +422,6 @@ def test_add_request(self, is_hybrid_model: bool): ) @pytest.mark.internal - @rounder_override(64) def test_add_dummy_requests_parallel_populates_state(self): self._setup_model_parallel_group(1, 1) @@ -532,7 +520,6 @@ def test_add_dummy_requests_parallel_populates_state(self): ) @pytest.mark.internal - @rounder_override(64) def test_add_dummy_requests_parallel_hybrid_allocates_mamba(self): self._setup_model_parallel_group(1, 1) @@ -563,7 +550,6 @@ def test_add_dummy_requests_parallel_hybrid_allocates_mamba(self): assert torch.all(dynamic_context.mamba_ssm_states[:, mamba_idx] == 0) @pytest.mark.internal - @rounder_override(64) def test_add_dummy_requests_parallel_decode_does_not_count_as_prefill(self): self._setup_model_parallel_group(1, 1) @@ -589,7 +575,6 @@ def test_add_dummy_requests_parallel_decode_does_not_count_as_prefill(self): assert dynamic_context.num_prefill_requests == 0 @pytest.mark.internal - @rounder_override(64) @pytest.mark.parametrize("is_hybrid_model", 
[False, True]) def test_update_request(self, is_hybrid_model: bool): self._setup_model_parallel_group(1, 1) @@ -789,7 +774,6 @@ def test_update_request(self, is_hybrid_model: bool): ) @pytest.mark.internal - @rounder_override(64) @pytest.mark.parametrize("is_hybrid_model", [False, True]) def test_release_memory_blocks_for_finished_requests(self, is_hybrid_model): """Test that memory blocks are correctly released for finished requests.""" @@ -862,7 +846,6 @@ def test_release_memory_blocks_for_finished_requests(self, is_hybrid_model): assert mamba_idx[4] == -1 @pytest.mark.internal - @rounder_override(64) @pytest.mark.parametrize("is_hybrid_model", [False, True]) def test_finished_requests_with_multiple_blocks(self, is_hybrid_model): """Test that all memory blocks are correctly released for finished requests that use multiple blocks.""" @@ -930,7 +913,6 @@ def test_finished_requests_with_multiple_blocks(self, is_hybrid_model): assert dynamic_context.block_allocator.total_avail == initial_available_blocks + 6 @pytest.mark.internal - @rounder_override(64) @pytest.mark.parametrize("is_hybrid_model", [False, True]) def test_mamba_states_cache(self, is_hybrid_model: bool): self._setup_model_parallel_group(1, 1) @@ -1006,7 +988,6 @@ def test_mamba_states_cache(self, is_hybrid_model: bool): assert torch.all(ssm_state_layer3 == 40.0) @pytest.mark.internal - @rounder_override(64) def test_calculate_and_store_log_probs(self): self._setup_model_parallel_group(1, 1) dynamic_context = self._get_dynamic_context( @@ -1224,7 +1205,6 @@ def test_calculate_and_store_log_probs(self): current_global_token_offset += expected_len @pytest.mark.internal - @rounder_override(64) def test_pipeline_parallel_uneven_layers(self): """ Test that DynamicInferenceContext synchronizes the total block count across @@ -1235,39 +1215,23 @@ def test_pipeline_parallel_uneven_layers(self): rank = parallel_state.get_pipeline_model_parallel_rank() - mamba_conv_states_shape = (544, 4) - mamba_ssm_states_shape 
= (8, 64, 16) - if rank == 0: - mamba_inference_state_config = MambaInferenceStateConfig( - [Symbols.MAMBA] + [Symbols.ATTENTION] * 4, - mamba_conv_states_shape, - mamba_ssm_states_shape, - ) + local_num_layers = 12 else: - mamba_inference_state_config = MambaInferenceStateConfig( - [Symbols.MAMBA] * 4 + [Symbols.ATTENTION], - mamba_conv_states_shape, - mamba_ssm_states_shape, - ) + local_num_layers = 4 context = DynamicInferenceContext( - model_config=TransformerConfig( - params_dtype=torch.float32, - num_layers=10, - kv_channels=64, - num_attention_heads=8, - pipeline_model_parallel_size=pp_size, - tensor_model_parallel_size=1, - pipeline_dtype=torch.float32, - ), - inference_config=InferenceConfig( - max_sequence_length=128, - buffer_size_gb=0.1, - block_size_tokens=16, - max_tokens=1024, - unified_memory_level=0, - ), + params_dtype=torch.float32, + num_layers=local_num_layers, + kv_channels=64, + num_attention_heads=8, + max_sequence_length=128, + buffer_size_gb=0.1, + block_size_tokens=16, + max_tokens=1024, + pipeline_model_parallel_size=pp_size, + tensor_model_parallel_size=1, + unified_memory_level=0, ) # Collect the total block counts on each rank diff --git a/tests/unit_tests/inference/engines/test_dynamic_engine.py b/tests/unit_tests/inference/engines/test_dynamic_engine.py index 2e935cab4bd..d5803b3638e 100644 --- a/tests/unit_tests/inference/engines/test_dynamic_engine.py +++ b/tests/unit_tests/inference/engines/test_dynamic_engine.py @@ -13,7 +13,9 @@ from transformer_engine.pytorch.fp8 import check_fp8_support from megatron.core import parallel_state -from megatron.core.inference.config import InferenceConfig, MambaInferenceStateConfig +from megatron.core.inference.contexts.attention_context.mamba_metadata import ( + MambaInferenceStateConfig, +) from megatron.core.inference.contexts.dynamic_context import ( ActiveRequestCountOverflowError, BlockOverflowError, @@ -26,6 +28,9 @@ from 
megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, ) +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.text_generation_controllers.text_generation_controller import ( TextGenerationController, @@ -43,7 +48,11 @@ from megatron.core.transformer.cuda_graphs import CudaGraphManager, _CudagraphGlobalRecord from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.transformer_config import TransformerConfig -from megatron.core.utils import is_fa_min_version, is_te_min_version +from megatron.core.utils import ( + get_mamba_inference_state_config_from_model, + is_fa_min_version, + is_te_min_version, +) from tests.unit_tests.test_utilities import Utils @@ -214,22 +223,26 @@ def _build_inference_context( # Inference context. context = DynamicInferenceContext( - model_config=transformer_config, - inference_config=InferenceConfig( - max_sequence_length=test_config.max_sequence_length, - num_cuda_graphs=test_config.num_cuda_graphs, - use_cuda_graphs_for_non_decode_steps=True, - buffer_size_gb=test_config.context_buffer_size_gb, - paused_buffer_size_gb=test_config.context_paused_buffer_size_gb, - block_size_tokens=test_config.context_block_size_tokens, - max_requests=test_config.context_max_requests, - max_tokens=test_config.context_max_tokens, - mamba_inference_state_config=mamba_inference_state_config, - materialize_only_last_token_logits=test_config.materialize_only_last_token_logits, - use_flashinfer_fused_rope=None, # default to using flash-infer if available - # this is for compatibility with the LTS environment - unified_memory_level=0, # unit tests currently broken with UVM - ), + params_dtype=transformer_config.params_dtype, + num_layers=transformer_config.num_layers + // transformer_config.pipeline_model_parallel_size, + 
kv_channels=transformer_config.kv_channels, + num_attention_heads=transformer_config.num_query_groups, + max_sequence_length=test_config.max_sequence_length, + num_cuda_graphs=test_config.num_cuda_graphs, + use_cuda_graphs_for_non_decode_steps=True, + buffer_size_gb=test_config.context_buffer_size_gb, + paused_buffer_size_gb=test_config.context_paused_buffer_size_gb, + block_size_tokens=test_config.context_block_size_tokens, + max_requests=test_config.context_max_requests, + max_tokens=test_config.context_max_tokens, + tensor_model_parallel_size=transformer_config.tensor_model_parallel_size, + pipeline_model_parallel_size=transformer_config.pipeline_model_parallel_size, + mamba_inference_state_config=mamba_inference_state_config, + materialize_only_last_token_logits=test_config.materialize_only_last_token_logits, + use_flashinfer_fused_rope=None, # default to using flash-infer if available + # this is for compatibility with the LTS environment + unified_memory_level=0, # unit tests currently broken with UVM ) return context @@ -369,7 +382,17 @@ def _build_test_env(cls, test_config): model.eval() - mamba_inference_state_config = MambaInferenceStateConfig.from_model(model) + mamba_inference_state_config = get_mamba_inference_state_config_from_model(model) + + # Inference config. + inference_config = InferenceWrapperConfig( + hidden_size=transformer_config.hidden_size, + inference_batch_times_seqlen_threshold=400, + fp32_residual_connection=False, + params_dtype=transformer_config.params_dtype, + fp8=transformer_config.fp8, + padded_vocab_size=test_config.vocab_size, + ) # Inference context. inference_context = cls._build_inference_context( @@ -380,7 +403,7 @@ def _build_test_env(cls, test_config): ) # Inference model wrapper. 
- inference_wrapped_model = GPTInferenceWrapper(model, inference_context) + inference_wrapped_model = GPTInferenceWrapper(model, inference_config, inference_context) # Note: the following is taken from AbstractModelInferenceWrapper.prep_model_for_inference(). inference_wrapped_model.model_is_pipeline_parallel = not ( @@ -401,7 +424,13 @@ def _build_test_env(cls, test_config): CudaGraphManager.global_mempool = None # Inference engine. - engine = DynamicInferenceEngine(text_generation_controller, inference_context) + engine = DynamicInferenceEngine( + text_generation_controller, + inference_context, + random_seed=test_config.random_seed, + enable_cuda_graph=transformer_config.cuda_graph_impl == "local", + enable_chunked_prefill=test_config.enable_chunked_prefill, + ) # Test env. env = DynamicEngineTestEnv(config=test_config, requests=requests, engine=engine) diff --git a/tests/unit_tests/inference/engines/test_static_engine.py b/tests/unit_tests/inference/engines/test_static_engine.py index 483a21d13bd..03b3712e39a 100644 --- a/tests/unit_tests/inference/engines/test_static_engine.py +++ b/tests/unit_tests/inference/engines/test_static_engine.py @@ -20,6 +20,9 @@ from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, ) +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.text_generation_controllers.text_generation_controller import ( TextGenerationController, @@ -82,11 +85,20 @@ def setup_engine( ).cuda() gpt_model.to(inference_config_params_dtype) - inference_context = StaticInferenceContext( - max_batch_size=self.batch_size, max_sequence_length=self.sequence_length + inference_wrapper_config = InferenceWrapperConfig( + hidden_size=self.hidden_size, + inference_batch_times_seqlen_threshold=400, + inference_max_requests=self.batch_size, + 
fp32_residual_connection=False, + params_dtype=inference_config_params_dtype, + padded_vocab_size=self.vocab_size, ) - inference_wrapped_model = GPTInferenceWrapper(gpt_model, inference_context) + inference_context = StaticInferenceContext.from_config(inference_wrapper_config) + + inference_wrapped_model = GPTInferenceWrapper( + gpt_model, inference_wrapper_config, inference_context + ) self.mock_tokenizer = mock.Mock() # Set required tokenizer attributes before engine creation self.mock_tokenizer.vocab_size = self.vocab_size @@ -188,6 +200,8 @@ def test_generate_dynamic(self, batch_size: int, num_trials: int, empty_prompt: assert len(results) == batch_size for result in results: + if isinstance(result, DynamicInferenceRequestRecord): + result = result.merge() assert isinstance(result, InferenceRequest), ( "expected ; found <%s>." % type(result).__name__ ) diff --git a/tests/unit_tests/inference/model_inference_wrappers/gpt/test_gpt_inference_wrapper.py b/tests/unit_tests/inference/model_inference_wrappers/gpt/test_gpt_inference_wrapper.py index d7ddaa1e680..07afebe1067 100644 --- a/tests/unit_tests/inference/model_inference_wrappers/gpt/test_gpt_inference_wrapper.py +++ b/tests/unit_tests/inference/model_inference_wrappers/gpt/test_gpt_inference_wrapper.py @@ -10,6 +10,9 @@ from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, ) +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) from megatron.core.models.gpt.gpt_layer_specs import ( get_gpt_layer_local_spec, get_gpt_layer_with_transformer_engine_spec, @@ -50,15 +53,27 @@ def setup_model(self, tensor_parallel_size, pipeline_parallel_size): post_process=parallel_state.is_pipeline_last_stage(), ).cuda() - inference_context = StaticInferenceContext(self.batch_size, self.sequence_length) + inference_wrapper_config = InferenceWrapperConfig( + hidden_size=hidden_size, + 
inference_batch_times_seqlen_threshold=20, + inference_max_requests=self.batch_size, + fp32_residual_connection=False, + params_dtype=torch.float, + padded_vocab_size=self.vocab_size, + ) - self.inference_wrapped_model = GPTInferenceWrapper(gpt_model, inference_context) + inference_context = StaticInferenceContext.from_config(inference_wrapper_config) + + self.inference_wrapped_model = GPTInferenceWrapper( + gpt_model, inference_wrapper_config, inference_context + ) def teardown_method(self, method): Utils.destroy_model_parallel() + # This will call the inference_wrapped_model.forward_pass_with_pipeline_parallel_small_input_batch() @pytest.mark.parametrize("materialize_only_last_token_logits", [True, False]) - def test_inference_pipeline_parallel(self, materialize_only_last_token_logits): + def test_inference_pipeline_parallel_small_size(self, materialize_only_last_token_logits): self.setup_model(tensor_parallel_size=2, pipeline_parallel_size=2) batch_prompt_tokens = ( @@ -67,7 +82,7 @@ def test_inference_pipeline_parallel(self, materialize_only_last_token_logits): .cuda() ) self.inference_wrapped_model.prep_model_for_inference() - self.inference_wrapped_model.inference_context.config.materialize_only_last_token_logits = ( + self.inference_wrapped_model.inference_context.materialize_only_last_token_logits = ( materialize_only_last_token_logits ) @@ -92,6 +107,42 @@ def test_inference_pipeline_parallel(self, materialize_only_last_token_logits): self.vocab_size, ), f"Shape mismatch . 
Expected {(self.batch_size, logits_seq_len, self.vocab_size)}, but got {logits.shape}" + # This will call the inference_wrapped_model.forward_pass_with_pipeline_parallel_large_input_batch() + @pytest.mark.parametrize("materialize_only_last_token_logits", [True, False]) + def test_inference_pipeline_parallel_large_size(self, materialize_only_last_token_logits): + self.setup_model(tensor_parallel_size=2, pipeline_parallel_size=2) + + batch_prompt_tokens = ( + torch.randint(low=0, high=self.vocab_size, size=(self.batch_size, self.sequence_length)) + .int() + .cuda() + ) + self.inference_wrapped_model.prep_model_for_inference() + self.inference_wrapped_model.inference_context.materialize_only_last_token_logits = ( + materialize_only_last_token_logits + ) + + inference_input = self.inference_wrapped_model.prep_inference_input( + prompts_tokens=batch_prompt_tokens + ) + + inference_input_for_context_window = ( + self.inference_wrapped_model.get_batch_for_context_window(inference_input, 0, 10) + ) + + logits_seq_len = 1 if materialize_only_last_token_logits else 10 + + logits = self.inference_wrapped_model.run_one_forward_step( + inference_input_for_context_window + ) + + if parallel_state.is_pipeline_last_stage(): + assert logits.shape == ( + self.batch_size, + logits_seq_len, + self.vocab_size, + ), f"Shape mismatch . 
Expected {(self.batch_size, logits_seq_len, self.vocab_size)}, but got {logits.shape}" + @pytest.mark.parametrize("materialize_only_last_token_logits", [True, False]) def test_inference_only_tensor_parallel(self, materialize_only_last_token_logits): self.setup_model(tensor_parallel_size=4, pipeline_parallel_size=1) @@ -102,7 +153,7 @@ def test_inference_only_tensor_parallel(self, materialize_only_last_token_logits .cuda() ) self.inference_wrapped_model.prep_model_for_inference() - self.inference_wrapped_model.inference_context.config.materialize_only_last_token_logits = ( + self.inference_wrapped_model.inference_context.materialize_only_last_token_logits = ( materialize_only_last_token_logits ) diff --git a/tests/unit_tests/inference/model_inference_wrappers/t5/test_t5_inference_wrapper.py b/tests/unit_tests/inference/model_inference_wrappers/t5/test_t5_inference_wrapper.py index eb06f6ed78b..36d5187b5eb 100644 --- a/tests/unit_tests/inference/model_inference_wrappers/t5/test_t5_inference_wrapper.py +++ b/tests/unit_tests/inference/model_inference_wrappers/t5/test_t5_inference_wrapper.py @@ -1,5 +1,3 @@ -# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
- from argparse import Namespace from copy import deepcopy from unittest import mock @@ -9,6 +7,9 @@ from megatron.core import parallel_state from megatron.core.inference.contexts import StaticInferenceContext +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) from megatron.core.inference.model_inference_wrappers.t5.t5_inference_wrapper import ( T5InferenceWrapper, ) @@ -76,9 +77,19 @@ def setup_model(self, tensor_parallel_size, pipeline_parallel_size): add_decoder=True, ).cuda() - inference_context = StaticInferenceContext(max_batch_size=8, max_sequence_length=2560) + inference_wrapper_config = InferenceWrapperConfig( + hidden_size=hidden_size, + inference_batch_times_seqlen_threshold=-1, + fp32_residual_connection=False, + params_dtype=torch.float, + padded_vocab_size=self.vocab_size, + ) + + inference_context = StaticInferenceContext.from_config(inference_wrapper_config) - self.inference_wrapped_model = T5InferenceWrapper(t5_model, inference_context) + self.inference_wrapped_model = T5InferenceWrapper( + t5_model, inference_wrapper_config, inference_context + ) def teardown_method(self, method): Utils.destroy_model_parallel() diff --git a/tests/unit_tests/inference/model_inference_wrappers/test_model_inference_wrapper_config.py b/tests/unit_tests/inference/model_inference_wrappers/test_model_inference_wrapper_config.py new file mode 100644 index 00000000000..794634760d0 --- /dev/null +++ b/tests/unit_tests/inference/model_inference_wrappers/test_model_inference_wrapper_config.py @@ -0,0 +1,21 @@ +import torch + +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) + + +class TestModelInferenceWrapperConfig: + + def test_inference_config(self): + inference_config = InferenceWrapperConfig( + hidden_size=10, + inference_batch_times_seqlen_threshold=10, + padded_vocab_size=10, + params_dtype=torch.float, + fp32_residual_connection=False, 
+ ) + inference_config.add_attributes({"abc": 45}) + assert ( + inference_config.abc == 45 + ), f"min tokens not set correctly. it is {inference_config.min_tokens}" diff --git a/tests/unit_tests/inference/test_inference_config.py b/tests/unit_tests/inference/test_inference_config.py deleted file mode 100644 index 6d58328dade..00000000000 --- a/tests/unit_tests/inference/test_inference_config.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - -import dataclasses - -from megatron.core.inference.config import InferenceConfig -from megatron.core.transformer.transformer_config import TransformerConfig - - -class TestInferenceConfig: - def test_mutual_exclusivity_with_transformer_config(self): - """ - Ensure mutual exclusivity between fields in `InferenceConfig` and - `TransformerConfig`. - """ - dynamic_inference_config_fields = set(dataclasses.fields(InferenceConfig)) - transformer_config_fields = set(dataclasses.fields(TransformerConfig)) - assert len(dynamic_inference_config_fields.intersection(transformer_config_fields)) == 0 diff --git a/tests/unit_tests/inference/test_wandb_logging.py b/tests/unit_tests/inference/test_wandb_logging.py index 1417926f13b..cab464af503 100644 --- a/tests/unit_tests/inference/test_wandb_logging.py +++ b/tests/unit_tests/inference/test_wandb_logging.py @@ -7,7 +7,6 @@ import pytest import torch -from megatron.core.inference.config import InferenceConfig from megatron.core.inference.contexts.dynamic_context import DynamicInferenceContext from megatron.core.inference.engines import DynamicInferenceEngine from megatron.core.inference.inference_request import DynamicInferenceRequest @@ -16,7 +15,6 @@ TextGenerationController, ) from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed -from megatron.core.transformer.transformer_config import TransformerConfig from tests.unit_tests.test_utilities import Utils @@ -52,26 +50,20 @@ def _get_dynamic_context( 
max_sequence_length=512, buffer_size_gb=0.03, block_size_tokens=128, - logging_step_interval=0, metrics_writer=None, ): """Helper to create a DynamicInferenceContext.""" return DynamicInferenceContext( - model_config=TransformerConfig( - params_dtype=params_dtype, - num_layers=num_layers, - kv_channels=kv_channels, - num_attention_heads=num_attention_heads, - ), - inference_config=InferenceConfig( - max_sequence_length=max_sequence_length, - num_cuda_graphs=None, - buffer_size_gb=buffer_size_gb, - block_size_tokens=block_size_tokens, - unified_memory_level=0, # unit tests currently broken with UVM - logging_step_interval=logging_step_interval, - metrics_writer=metrics_writer, - ), + params_dtype=params_dtype, + num_layers=num_layers, + kv_channels=kv_channels, + num_attention_heads=num_attention_heads, + max_sequence_length=max_sequence_length, + num_cuda_graphs=None, + buffer_size_gb=buffer_size_gb, + block_size_tokens=block_size_tokens, + metrics_writer=metrics_writer, + unified_memory_level=0, # unit tests currently broken with UVM ) @pytest.mark.internal @@ -203,14 +195,12 @@ def test_kvcache_utilization_stats_types(self): @pytest.mark.internal @patch('megatron.core.inference.engines.dynamic_engine.HAVE_WANDB', True) def test_engine_logging_step_interval_zero(self): - """Test that no logging occurs when logging_step_interval is 0.""" + """Test that no logging occurs when inference_logging_step_interval is 0.""" mock_wandb = Mock() mock_wandb.__name__ = "wandb" mock_wandb.log = Mock() - dynamic_context = self._get_dynamic_context( - logging_step_interval=0, metrics_writer=mock_wandb - ) + dynamic_context = self._get_dynamic_context(metrics_writer=mock_wandb) # Create mock controller with proper spec to pass isinstance checks mock_controller = create_autospec(TextGenerationController, instance=True) @@ -220,7 +210,12 @@ def test_engine_logging_step_interval_zero(self): mock_controller.inference_wrapped_model.model.config = Mock() 
mock_controller.inference_wrapped_model.model.config.cuda_graph_impl = "none" - engine = DynamicInferenceEngine(controller=mock_controller, context=dynamic_context) + engine = DynamicInferenceEngine( + controller=mock_controller, + context=dynamic_context, + random_seed=123, + inference_logging_step_interval=0, # Disabled + ) # Verify log was never called mock_wandb.log.assert_not_called() @@ -230,16 +225,15 @@ def test_paused_requests_in_stats(self): """Test that paused requests are correctly reflected in stats.""" set_rounder(1) dynamic_context = DynamicInferenceContext( - model_config=TransformerConfig( - params_dtype=torch.float32, num_layers=2, kv_channels=64, num_attention_heads=8 - ), - inference_config=InferenceConfig( - max_sequence_length=128, - num_cuda_graphs=None, - buffer_size_gb=0.01, # Small buffer to force pausing - block_size_tokens=32, - unified_memory_level=0, # unit tests currently broken with UVM - ), + params_dtype=torch.float32, + num_layers=2, + kv_channels=64, + num_attention_heads=8, + max_sequence_length=128, + num_cuda_graphs=None, + buffer_size_gb=0.01, # Small buffer to force pausing + block_size_tokens=32, + unified_memory_level=0, # unit tests currently broken with UVM ) # Add multiple requests to potentially trigger pausing @@ -263,7 +257,7 @@ def test_paused_requests_in_stats(self): @pytest.mark.internal def test_metrics_writer_none_handling(self): """Test that engine handles None metrics_writer gracefully.""" - dynamic_context = self._get_dynamic_context(logging_step_interval=10, metrics_writer=None) + dynamic_context = self._get_dynamic_context(metrics_writer=None) # Create mock controller with proper spec to pass isinstance checks mock_controller = create_autospec(TextGenerationController, instance=True) @@ -274,8 +268,13 @@ def test_metrics_writer_none_handling(self): mock_controller.inference_wrapped_model.model.config.cuda_graph_impl = "none" # Should not raise error even with logging interval set - engine = 
DynamicInferenceEngine(controller=mock_controller, context=dynamic_context) + engine = DynamicInferenceEngine( + controller=mock_controller, + context=dynamic_context, + random_seed=123, + inference_logging_step_interval=10, + ) # Verify engine was created successfully - assert engine.logging_step_interval == 10 - assert engine.metrics_writer is None + assert engine.inference_logging_step_interval == 10 + assert engine.context.metrics_writer is None diff --git a/tests/unit_tests/inference/text_generation_controllers/test_encoder_decoder_text_generation_controller.py b/tests/unit_tests/inference/text_generation_controllers/test_encoder_decoder_text_generation_controller.py index 5bd39ec1324..93a208710fc 100644 --- a/tests/unit_tests/inference/text_generation_controllers/test_encoder_decoder_text_generation_controller.py +++ b/tests/unit_tests/inference/text_generation_controllers/test_encoder_decoder_text_generation_controller.py @@ -1,5 +1,3 @@ -# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
- import random import string import time @@ -14,6 +12,9 @@ from megatron.core.inference.contexts import StaticInferenceContext from megatron.core.inference.inference_request import InferenceRequest, Status +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) from megatron.core.inference.model_inference_wrappers.t5.t5_inference_wrapper import ( T5InferenceWrapper, ) @@ -84,9 +85,19 @@ def setup_method(self, method): add_decoder=True, ).cuda() - inference_context = StaticInferenceContext(max_batch_size=8, max_sequence_length=2560) + inference_wrapper_config = InferenceWrapperConfig( + hidden_size=hidden_size, + inference_batch_times_seqlen_threshold=-1, + fp32_residual_connection=False, + params_dtype=torch.float, + padded_vocab_size=self.vocab_size, + ) + + inference_context = StaticInferenceContext.from_config(inference_wrapper_config) - inference_wrapped_model = T5InferenceWrapper(t5_model, inference_context) + inference_wrapped_model = T5InferenceWrapper( + t5_model, inference_wrapper_config, inference_context + ) self.mock_tokenizer = mock.Mock() diff --git a/tests/unit_tests/inference/text_generation_controllers/test_text_generation_controller.py b/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py similarity index 96% rename from tests/unit_tests/inference/text_generation_controllers/test_text_generation_controller.py rename to tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py index bdf95c2d9bf..0885401e7a0 100644 --- a/tests/unit_tests/inference/text_generation_controllers/test_text_generation_controller.py +++ b/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py @@ -14,7 +14,6 @@ from transformer_engine.pytorch.fp8 import check_fp8_support from megatron.core import parallel_state -from megatron.core.inference.config import InferenceConfig from 
megatron.core.inference.contexts import DynamicInferenceContext, StaticInferenceContext from megatron.core.inference.contexts.dynamic_context import MaxSequenceLengthOverflowError from megatron.core.inference.inference_request import ( @@ -25,6 +24,9 @@ from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, ) +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.text_generation_controllers.text_generation_controller import ( TextGenerationController, @@ -98,24 +100,37 @@ def setup_model( if dtype == torch.bfloat16: gpt_model = Float16Module(gpt_model.config, gpt_model) + inference_wrapper_config = InferenceWrapperConfig( + hidden_size=self.hidden_size, + inference_batch_times_seqlen_threshold=-1, + inference_max_seq_length=2048, + inference_max_requests=16 if fp8 else self.batch_size, + fp32_residual_connection=False, + params_dtype=dtype, + padded_vocab_size=self.vocab_size, + ) + if static: - inference_context = StaticInferenceContext( - max_batch_size=16 if fp8 else self.batch_size, max_sequence_length=2048 - ) + inference_context = StaticInferenceContext.from_config(inference_wrapper_config) else: inference_context = DynamicInferenceContext( - model_config=transformer_config, - inference_config=InferenceConfig( - max_sequence_length=2048, - buffer_size_gb=0.2, - materialize_only_last_token_logits=False, - use_flashinfer_fused_rope=None, # default to using flash-infer if available - # this is for compatibility with the LTS environment - unified_memory_level=0, # unit tests currently broken with UVM - ), + params_dtype=dtype, + num_layers=transformer_config.num_layers // pipeline_model_parallel_size, + kv_channels=transformer_config.kv_channels, + num_attention_heads=transformer_config.num_attention_heads, + 
tensor_model_parallel_size=transformer_config.tensor_model_parallel_size, + pipeline_model_parallel_size=transformer_config.pipeline_model_parallel_size, + max_sequence_length=2048, + buffer_size_gb=0.2, + materialize_only_last_token_logits=False, + use_flashinfer_fused_rope=None, # default to using flash-infer if available + # this is for compatibility with the LTS environment + unified_memory_level=0, # unit tests currently broken with UVM ) - inference_wrapped_model = GPTInferenceWrapper(gpt_model, inference_context) + inference_wrapped_model = GPTInferenceWrapper( + gpt_model, inference_wrapper_config, inference_context + ) inference_wrapped_model.model_is_pipeline_parallel = not ( parallel_state.is_pipeline_first_stage() and parallel_state.is_pipeline_last_stage() diff --git a/tests/unit_tests/inference/text_generation_controllers/test_vlm_text_generation_controller.py b/tests/unit_tests/inference/text_generation_controllers/test_vlm_text_generation_controller.py index 50db5cc0afc..31bf415ba56 100644 --- a/tests/unit_tests/inference/text_generation_controllers/test_vlm_text_generation_controller.py +++ b/tests/unit_tests/inference/text_generation_controllers/test_vlm_text_generation_controller.py @@ -1,5 +1,3 @@ -# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
- import copy import os import random @@ -15,6 +13,9 @@ from megatron.core.inference.contexts import StaticInferenceContext from megatron.core.inference.inference_request import InferenceRequest, Status, VLMInferenceRequest +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) from megatron.core.inference.model_inference_wrappers.multimodal.vlm_inference_wrapper import ( VLMInferenceWrapper, ) @@ -91,9 +92,19 @@ def setup_method(self, method): self.image_token_index = self.model.image_token_index self.model = Float16Module(self.model.config, self.model) - inference_context = StaticInferenceContext(max_batch_size=8, max_sequence_length=2560) + inference_wrapper_config = InferenceWrapperConfig( + hidden_size=self.language_hidden_size, + inference_batch_times_seqlen_threshold=-1, + fp32_residual_connection=False, + params_dtype=torch.float, + padded_vocab_size=self.language_vocab_size, + ) + + inference_context = StaticInferenceContext.from_config(inference_wrapper_config) - inference_wrapped_model = VLMInferenceWrapper(self.model, inference_context) + inference_wrapped_model = VLMInferenceWrapper( + self.model, inference_wrapper_config, inference_context + ) self.mock_tokenizer = mock.Mock() diff --git a/tests/unit_tests/models/test_gpt_model.py b/tests/unit_tests/models/test_gpt_model.py index 87aba9c6ed9..cf3bd40ee4b 100644 --- a/tests/unit_tests/models/test_gpt_model.py +++ b/tests/unit_tests/models/test_gpt_model.py @@ -12,7 +12,6 @@ from megatron.core import parallel_state from megatron.core.hyper_comm_grid import HyperCommGrid -from megatron.core.inference.config import InferenceConfig from megatron.core.inference.contexts.dynamic_context import DynamicInferenceContext from megatron.core.inference.inference_request import DynamicInferenceRequest from megatron.core.inference.sampling_params import SamplingParams @@ -393,18 +392,14 @@ def test_dynamic_inference_padding_with_fp8(self): config = 
self.gpt_model.config inference_context = DynamicInferenceContext( - model_config=TransformerConfig( - params_dtype=config.params_dtype, - num_layers=config.num_layers, - kv_channels=config.hidden_size // config.num_attention_heads, - num_attention_heads=config.num_attention_heads, - ), - inference_config=InferenceConfig( - max_sequence_length=self.gpt_model.module.max_sequence_length, - buffer_size_gb=1.0, - block_size_tokens=256, - materialize_only_last_token_logits=False, - ), + params_dtype=config.params_dtype, + num_layers=config.num_layers, + kv_channels=config.hidden_size // config.num_attention_heads, + num_attention_heads=config.num_attention_heads, + max_sequence_length=self.gpt_model.module.max_sequence_length, + buffer_size_gb=1.0, + block_size_tokens=256, + materialize_only_last_token_logits=False, ) # Add a request with 10 tokens. Since 10 is not a multiple of 64, diff --git a/tests/unit_tests/models/test_gpt_model_batch_invariant.py b/tests/unit_tests/models/test_gpt_model_batch_invariant.py index 9ab7e445c0d..ead9125e5ec 100644 --- a/tests/unit_tests/models/test_gpt_model_batch_invariant.py +++ b/tests/unit_tests/models/test_gpt_model_batch_invariant.py @@ -5,15 +5,17 @@ import torch import torch.distributed as dist -from megatron.core.inference.config import InferenceConfig from megatron.core.inference.contexts.dynamic_context import DynamicInferenceContext from megatron.core.inference.engines.dynamic_engine import DynamicInferenceEngine from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, ) +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) from megatron.core.inference.sampling_params import SamplingParams -from megatron.core.inference.text_generation_controllers.text_generation_controller import ( - TextGenerationController, +from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import ( 
+ SimpleTextGenerationController, ) from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec from megatron.core.models.gpt.gpt_model import GPTModel @@ -89,8 +91,6 @@ def _build_flash_attn_bik_model(seq_len: int, vocab_size: int, hidden_size: int normalization="RMSNorm", params_dtype=torch.bfloat16, attention_backend=AttnBackend.flash, - fp32_residual_connection=False, - nccl_all_reduce_for_prefill=False, ) cfg.fp16 = False cfg.bf16 = True @@ -184,21 +184,32 @@ def test_dynamic_engine_matches_batched_forward_rl(self): inference_model = Float16Module(base_model.config, base_model).cuda().eval() ctx = DynamicInferenceContext( - model_config=base_model.config, - inference_config=InferenceConfig( - max_sequence_length=seq_len, - buffer_size_gb=0.125, - block_size_tokens=16, - num_cuda_graphs=None, - materialize_only_last_token_logits=False, - use_cuda_graphs_for_non_decode_steps=False, - unified_memory_level=0, - ), + params_dtype=torch.bfloat16, + num_layers=base_model.config.num_layers, + kv_channels=base_model.config.kv_channels, + num_attention_heads=base_model.config.num_attention_heads, + max_sequence_length=seq_len, + buffer_size_gb=0.125, + block_size_tokens=16, + num_cuda_graphs=None, + materialize_only_last_token_logits=False, + use_cuda_graphs_for_non_decode_steps=False, + unified_memory_level=0, ) - wrapper = GPTInferenceWrapper(inference_model, ctx) + wrapper_cfg = InferenceWrapperConfig( + hidden_size=base_model.config.hidden_size, + inference_batch_times_seqlen_threshold=-1, + fp32_residual_connection=False, + params_dtype=torch.bfloat16, + padded_vocab_size=vocab_size, + inference_max_seq_length=seq_len, + inference_max_requests=8, + nccl_all_reduce_for_prefill=False, + ) + wrapper = GPTInferenceWrapper(inference_model, wrapper_cfg, ctx) tokenizer = DummyTokenizer(vocab_size=vocab_size, bos=None, eod=vocab_size - 1, pad=0) - controller = TextGenerationController(wrapper, tokenizer) + controller = 
SimpleTextGenerationController(wrapper, tokenizer) engine = DynamicInferenceEngine( controller=controller, context=ctx, enable_cuda_graph=False, random_seed=123 ) @@ -262,21 +273,32 @@ def test_dynamic_engine_is_batch_invariant(self): def _run_engine_with_order(order): ctx = DynamicInferenceContext( - model_config=based_model.config, - inference_config=InferenceConfig( - max_sequence_length=seq_len, - buffer_size_gb=0.125, - block_size_tokens=16, - num_cuda_graphs=None, - materialize_only_last_token_logits=False, - use_cuda_graphs_for_non_decode_steps=False, - unified_memory_level=0, - ), + params_dtype=torch.bfloat16, + num_layers=base_model.config.num_layers, + kv_channels=base_model.config.kv_channels, + num_attention_heads=base_model.config.num_attention_heads, + max_sequence_length=seq_len, + buffer_size_gb=0.125, + block_size_tokens=16, + num_cuda_graphs=None, + materialize_only_last_token_logits=False, + use_cuda_graphs_for_non_decode_steps=False, + unified_memory_level=0, ) - wrapper = GPTInferenceWrapper(inference_model, ctx) + wrapper_cfg = InferenceWrapperConfig( + hidden_size=base_model.config.hidden_size, + inference_batch_times_seqlen_threshold=-1, + fp32_residual_connection=False, + params_dtype=torch.bfloat16, + padded_vocab_size=vocab_size, + inference_max_seq_length=seq_len, + inference_max_requests=8, + nccl_all_reduce_for_prefill=False, + ) + wrapper = GPTInferenceWrapper(inference_model, wrapper_cfg, ctx) tokenizer = DummyTokenizer(vocab_size=vocab_size, bos=None, eod=vocab_size - 1, pad=0) - controller = TextGenerationController(wrapper, tokenizer) + controller = SimpleTextGenerationController(wrapper, tokenizer) engine = DynamicInferenceEngine( controller=controller, context=ctx, enable_cuda_graph=False, random_seed=123 ) diff --git a/tests/unit_tests/models/test_mamba_model.py b/tests/unit_tests/models/test_mamba_model.py index 29e3630d7bb..9eb7b2dea9a 100644 --- a/tests/unit_tests/models/test_mamba_model.py +++ 
b/tests/unit_tests/models/test_mamba_model.py @@ -10,7 +10,6 @@ from megatron.core import parallel_state from megatron.core.hyper_comm_grid import HyperCommGrid -from megatron.core.inference.config import InferenceConfig, MambaInferenceStateConfig from megatron.core.inference.contexts import BaseInferenceContext, StaticInferenceContext from megatron.core.inference.contexts.dynamic_context import DynamicInferenceContext from megatron.core.inference.inference_request import DynamicInferenceRequest @@ -22,7 +21,12 @@ from megatron.core.transformer import TransformerConfig from megatron.core.transformer.enums import AttnBackend from megatron.core.transformer.module import Float16Module -from megatron.core.utils import divide, is_fa_min_version, is_torch_min_version +from megatron.core.utils import ( + divide, + get_mamba_inference_state_config_from_model, + is_fa_min_version, + is_torch_min_version, +) from tests.unit_tests.test_utilities import Utils @@ -340,17 +344,20 @@ def test_dynamic_inference_padding_with_fp8(self): self.model.eval() config = self.model.config - mamba_inference_state_config = MambaInferenceStateConfig.from_model(self.model.module) + mamba_inference_state_config = get_mamba_inference_state_config_from_model( + self.model.module + ) inference_context = DynamicInferenceContext( - model_config=self.model.config, - inference_config=InferenceConfig( - max_sequence_length=self.model.module.max_sequence_length, - buffer_size_gb=1.0, - block_size_tokens=256, - materialize_only_last_token_logits=False, - mamba_inference_state_config=mamba_inference_state_config, - ), + params_dtype=config.params_dtype, + num_layers=config.num_layers, + kv_channels=config.hidden_size // config.num_attention_heads, + num_attention_heads=config.num_attention_heads, + max_sequence_length=self.model.module.max_sequence_length, + buffer_size_gb=1.0, + block_size_tokens=256, + materialize_only_last_token_logits=False, + mamba_inference_state_config=mamba_inference_state_config, 
) # Add a request with 10 tokens. Since 10 is not a multiple of 64 (TOKEN_ROUNDER), diff --git a/tests/unit_tests/models/test_mamba_moe_model.py b/tests/unit_tests/models/test_mamba_moe_model.py index a5590a0ffad..3c7ae93a17c 100644 --- a/tests/unit_tests/models/test_mamba_moe_model.py +++ b/tests/unit_tests/models/test_mamba_moe_model.py @@ -166,7 +166,6 @@ "moe_layer_freq": 1, "moe_layer_recompute": False, "moe_pad_expert_input_to_capacity": False, - "moe_pad_experts_for_cuda_graph_inference": False, "moe_per_layer_logging": False, "moe_permute_fusion": False, "moe_router_bias_update_rate": 0.001, @@ -198,7 +197,6 @@ "mtp_num_layers": None, "mtp_standalone": False, "multi_latent_attention": False, - "nccl_all_reduce_for_prefill": False, "no_rope_freq": None, "no_sync_func": None, "normalization": "RMSNorm", diff --git a/tools/run_dynamic_text_generation_server.py b/tools/run_dynamic_text_generation_server.py index 74f1e69679e..615073b8fd0 100644 --- a/tools/run_dynamic_text_generation_server.py +++ b/tools/run_dynamic_text_generation_server.py @@ -5,19 +5,25 @@ import torch +from examples.inference.gpt.gpt_dynamic_inference import ( + add_dynamic_inference_args, + get_inference_context, + get_inference_controller, + get_model, +) from megatron.core.inference.engines import DynamicInferenceEngine from megatron.core.inference.text_generation_server.dynamic_text_gen_server import run_flask_server -from megatron.core.utils import trace_async_exceptions -from megatron.inference.utils import add_inference_args, get_dynamic_inference_engine +from megatron.core.tokenizers.text.utils.build_tokenizer import build_tokenizer +from megatron.core.utils import get_mamba_inference_state_config_from_model, trace_async_exceptions from megatron.post_training.arguments import add_modelopt_args -from megatron.training import get_args +from megatron.training import get_args, get_tokenizer from megatron.training.initialize import initialize_megatron def 
add_text_generation_server_args(parser: argparse.ArgumentParser): """Adds the required command line arguments for running the text generation server.""" parser = add_modelopt_args(parser) - parser = add_inference_args(parser) + parser = add_dynamic_inference_args(parser) parser.add_argument("--port", type=int, default=5000, help="Port for Flask server to run on") return parser @@ -68,12 +74,36 @@ async def run_text_generation_server( args_defaults={'no_load_rng': True, 'no_load_optim': True}, ) + args = get_args() + model = get_model() + + if args.legacy_tokenizer: + tokenizer = get_tokenizer() + else: + tokenizer = build_tokenizer(args) + + mamba_inference_state_config = get_mamba_inference_state_config_from_model(model) + # Enable return_log_probs to allow prompt logprobs computation for echo=True requests # This sets materialize_only_last_token_logits=False in the inference context, # which is required for lm-eval compatibility (loglikelihood evaluation tasks) - args = get_args() args.return_log_probs = True - engine = get_dynamic_inference_engine() + context = get_inference_context( + None, + None, + calculate_max_sequence_length_from_requests=False, + mamba_inference_state_config=mamba_inference_state_config, + ) + + controller = get_inference_controller(model, context) + + engine = DynamicInferenceEngine( + controller, + context, + enable_cuda_graph=args.cuda_graph_impl == "local", + random_seed=args.seed, + enable_chunked_prefill=not args.disable_chunked_prefill, + ) asyncio.run(run_text_generation_server(engine, args.inference_coordinator_port, args.port)) diff --git a/tools/run_inference_performance_test.py b/tools/run_inference_performance_test.py index 430bb7ebb9a..32d61444530 100644 --- a/tools/run_inference_performance_test.py +++ b/tools/run_inference_performance_test.py @@ -10,31 +10,33 @@ from gpt_builders import gpt_builder from mamba_builders import mamba_builder -from megatron.core.inference.contexts import StaticInferenceContext +from 
megatron.core.inference.contexts import DynamicInferenceContext from megatron.core.inference.engines import DynamicInferenceEngine, StaticInferenceEngine from megatron.core.inference.engines.abstract_engine import AbstractEngine -from megatron.core.inference.inference_request import ( - DynamicInferenceRequestRecord, - InferenceRequest, -) +from megatron.core.inference.inference_request import InferenceRequest from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, ) +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.text_generation_controllers.text_generation_controller import ( TextGenerationController, ) from megatron.core.transformer.module import MegatronModule -from megatron.inference.utils import add_inference_args, get_dynamic_inference_engine +from megatron.core.utils import get_mamba_inference_state_config_from_model from model_provider import model_provider sys.path.append( os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)) ) +import asyncio from functools import partial -from typing import List +from typing import List, Union +from examples.inference.gpt.utils import add_common_inference_args from megatron.core import mpu from megatron.training import get_args, get_model, get_tokenizer from megatron.training.checkpointing import load_checkpoint @@ -45,7 +47,7 @@ def add_inference_benchmarking_args(parser): """Inference benchmarking arguments.""" - parser = add_inference_args(parser) + parser = add_common_inference_args(parser) group = parser.add_argument_group(title='inference_benchmarking') @@ -58,6 +60,7 @@ def add_inference_benchmarking_args(parser): group.add_argument( "--benchmark-profile", action="store_true", default=False, help="If set, profile" ) + group.add_argument('--stream', 
action="store_true", default=False, help="If set, stream tokens") return parser @@ -71,13 +74,24 @@ def get_inference_engine(args: argparse.Namespace, model: MegatronModule) -> Abs Returns: AbstractBackend: The chosen backend """ + tokenizer = get_tokenizer() + + inference_wrapper_config = InferenceWrapperConfig( + hidden_size=args.hidden_size, + inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold, + fp32_residual_connection=args.fp32_residual_connection, + params_dtype=args.params_dtype, + padded_vocab_size=args.padded_vocab_size, + inference_max_requests=args.inference_max_batch_size, + inference_max_seq_length=args.inference_max_seq_length, + nccl_all_reduce_for_prefill=args.nccl_all_reduce_for_prefill, + moe_pad_experts_for_cuda_graph_inference=args.moe_pad_experts_for_cuda_graph_inference, + ) + + mamba_inference_state_config = get_mamba_inference_state_config_from_model(model) if args.engine_type == "static": - tokenizer = get_tokenizer() - context = StaticInferenceContext( - args.inference_max_requests, args.inference_max_sequence_length - ) - inference_wrapped_model = GPTInferenceWrapper(model, context) + inference_wrapped_model = GPTInferenceWrapper(model, inference_wrapper_config) inference_wrapped_model.model_is_pipeline_parallel = not ( mpu.is_pipeline_first_stage() and mpu.is_pipeline_last_stage() ) @@ -86,7 +100,98 @@ def get_inference_engine(args: argparse.Namespace, model: MegatronModule) -> Abs ) return StaticInferenceEngine(text_generation_controller=text_generation_controller) elif args.engine_type == "dynamic": - return get_dynamic_inference_engine(model=model) + context = DynamicInferenceContext( + params_dtype=args.params_dtype, + num_layers=args.num_layers, + kv_channels=args.kv_channels, + num_attention_heads=( + args.num_query_groups if args.group_query_attention else args.num_attention_heads + ), + max_sequence_length=args.inference_max_seq_length, + num_cuda_graphs=( + 
args.inference_dynamic_batching_num_cuda_graphs + if args.cuda_graph_impl == "local" + else None + ), + buffer_size_gb=args.inference_dynamic_batching_buffer_size_gb, + buffer_guaranteed_fraction=args.inference_dynamic_batching_buffer_guaranteed_fraction, + buffer_overflow_factor=args.inference_dynamic_batching_buffer_overflow_factor, + max_requests_override=args.inference_dynamic_batching_max_requests_override, + max_tokens_override=args.inference_dynamic_batching_max_tokens_override, + block_size_tokens=args.inference_dynamic_batching_block_size, + tensor_model_parallel_size=args.tensor_model_parallel_size, + pipeline_model_parallel_size=args.pipeline_model_parallel_size, + materialize_only_last_token_logits=not args.return_log_probs, + mamba_inference_state_config=mamba_inference_state_config, + cache_mla_latent=args.multi_latent_attention and args.cache_mla_latents, + kv_lora_rank=args.kv_lora_rank if args.multi_latent_attention else None, + qk_pos_emb_head_dim=args.qk_pos_emb_head_dim, + use_cuda_graphs_for_non_decode_steps=not args.decode_only_cuda_graphs, + use_flashinfer_fused_rope=args.use_flashinfer_fused_rope, + unified_memory_level=args.inference_dynamic_batching_unified_memory_level, + ) + inference_wrapped_model = GPTInferenceWrapper( + model, inference_wrapper_config, inference_context=context + ) + inference_wrapped_model.model_is_pipeline_parallel = not ( + mpu.is_pipeline_first_stage() and mpu.is_pipeline_last_stage() + ) + text_generation_controller = TextGenerationController( + inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer + ) + return DynamicInferenceEngine( + text_generation_controller, + context, + termination_id=-1, + enable_cuda_graph=args.cuda_graph_impl == "local", + random_seed=args.seed, + ) + + +async def generate( + inference_engine: Union[StaticInferenceEngine, DynamicInferenceEngine], + sampling_params: SamplingParams, + prompts: List[str], + inference_requests: List[InferenceRequest] = None, +) -> 
List[InferenceRequest]: + async def collect_stream(prompt, request_id, stream_generator): + async for output in stream_generator: + pass + + if inference_requests is None: + assert prompts is not None + inference_requests = [None for _ in range(len(prompts))] + elif prompts is None: + assert inference_requests is not None + tokenizer = get_tokenizer() + prompts = [tokenizer.detokenize(request.prompt_tokens) for request in inference_requests] + + request_ids: List[int] = [ + inference_engine.add_request( + prompt=prompt, + inference_request=inference_request, + inference_parameters=sampling_params, + streaming=True, + ) + for prompt, inference_request in zip(prompts, inference_requests) + ] + stream_generators = [ + inference_engine.get_stream_generator(request_id) for request_id in request_ids + ] + + tasks = [ + asyncio.create_task(collect_stream(prompt, request_id, stream_generator)) + for (prompt, request_id, stream_generator) in zip(prompts, request_ids, stream_generators) + ] + + await inference_engine.run_engine_async() + await asyncio.gather(*tasks) + + results: List[InferenceRequest] = [ + inference_engine.scheduler.completed_request_pool[request_id] for request_id in request_ids + ] + + return results def get_random_prompt_tokens(tokenizer, num_input_tokens) -> List[int]: @@ -127,12 +232,14 @@ def generate_dynamic( request_id = REQUEST_ID REQUEST_ID += 1 prompt_tokens = request.prompt_tokens - inference_engine.add_request(request_id, prompt_tokens, request.inference_parameters) + inference_engine.add_request( + request_id, prompt_tokens, request.inference_parameters, + ) start_time = time.perf_counter() all_finished_requests = [] while inference_engine.has_unfinished_requests(): - result = inference_engine.step() + result = inference_engine.step(verbose=False) finished_requests = result["finished_requests"] for request in finished_requests: req_id = request.request_id @@ -150,6 +257,8 @@ def generate_dynamic( def main(): """Main program.""" + # Note: The 
default args passed here can be overwritten by using appropriate params (check arguments.py file) + # Micro batch size is not needed to be set by user. (It is calculated based on inference-batch-times-seqlen-threshold argument) initialize_megatron( extra_args_provider=add_inference_benchmarking_args, args_defaults={ @@ -189,14 +298,13 @@ def main(): return_log_probs=args.return_log_probs, top_n_logprobs=args.top_n_logprobs, num_tokens_to_generate=args.num_tokens_to_generate, - termination_id=-1, ) sampling_params.add_attributes({"no_early_termination": True}) requests = [] if args.num_input_tokens is not None: assert args.prompts is None - batch_size = args.inference_max_requests + batch_size = args.inference_max_batch_size for i in range(batch_size): prompt_tokens = get_random_prompt_tokens(tokenizer, args.num_input_tokens) requests.append( @@ -219,27 +327,33 @@ def main(): ) ) - # TODO(ksanthanam): Use a command line argument for warmup iterations - for i in range(3): - print(f"Running warmup iteration {i+1}...") - warmup_sampling_params = SamplingParams(num_tokens_to_generate=10, termination_id=-1) + if args.cuda_graph_impl == "local": + print(f"Running warmup for CUDA graphs...") + warmup_sampling_params = SamplingParams(num_tokens_to_generate=10) + warmup_sampling_params.add_attributes({"no_early_termination": True}) inference_engine.generate(prompts=["warmup"], sampling_params=warmup_sampling_params) if args.benchmark_profile: torch.cuda.cudart().cudaProfilerStart() start_time = time.perf_counter() - if args.engine_type == "static": - results: List[InferenceRequest] = inference_engine.generate( - prompts=args.prompts, inference_requests=requests, sampling_params=sampling_params + if args.stream: + if args.engine_type == "dynamic": + raise NotImplementedError("Streaming not supported with DynamicInferenceEngine") + results: List[InferenceRequest] = asyncio.run( + generate( + inference_engine, sampling_params, prompts=args.prompts, inference_requests=requests + 
) ) else: - prompts = [request.prompt_tokens for request in requests] - records: List[DynamicInferenceRequestRecord] = inference_engine.generate( - prompts=prompts, sampling_params=sampling_params - ) - results: List[InferenceRequest] = [record.merge() for record in records] - + if args.engine_type == "static": + results: List[InferenceRequest] = inference_engine.generate( + prompts=args.prompts, inference_requests=requests, sampling_params=sampling_params + ) + elif args.engine_type == "dynamic": + results: List[InferenceRequest] = generate_dynamic( + args, requests, inference_engine, + ) end_time = time.perf_counter() latency = end_time - start_time @@ -264,10 +378,6 @@ def main(): result_dict['generated_output'] = tokenizer.detokenize(result.generated_tokens) print(result_dict) - total_output_tokens = args.num_tokens_to_generate * args.inference_max_requests - throughput = total_output_tokens / latency - print(f"Throughput: {throughput} output tokens / second") - if __name__ == "__main__": main() diff --git a/tools/run_text_generation_server.py b/tools/run_text_generation_server.py index 89c1cfa5b86..350173dc16f 100644 --- a/tools/run_text_generation_server.py +++ b/tools/run_text_generation_server.py @@ -22,6 +22,9 @@ from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, ) +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.text_generation_controllers.text_generation_controller import ( TextGenerationController, @@ -60,15 +63,27 @@ def get_inference_engine(args: Namespace, model: MegatronModule) -> AbstractEngi tokenizer = get_tokenizer() - inference_context = StaticInferenceContext(args.inference_max_requests, args.inference_max_sequence_length) + inference_wrapper_config = InferenceWrapperConfig( + hidden_size=args.hidden_size, + 
inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold, + fp32_residual_connection=args.fp32_residual_connection, + params_dtype=args.params_dtype, + padded_vocab_size=args.padded_vocab_size, + inference_max_seq_length=args.inference_max_seq_length, + inference_max_requests=args.inference_max_batch_size, + nccl_all_reduce_for_prefill=args.nccl_all_reduce_for_prefill, + moe_pad_experts_for_cuda_graph_inference = args.moe_pad_experts_for_cuda_graph_inference + ) + inference_context = StaticInferenceContext.from_config(inference_wrapper_config) inference_wrapped_model = GPTInferenceWrapper( - model, inference_context + model, inference_wrapper_config, inference_context ) text_generation_controller = TextGenerationController( inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer ) return StaticInferenceEngine( text_generation_controller=text_generation_controller, + max_batch_size=args.inference_max_batch_size, ) @@ -151,6 +166,14 @@ def main(model_type: str = "gpt"): model = model[0] model.eval() + if args.max_batch_size is not None: + assert args.inference_max_batch_size is not None + args.inference_max_batch_size = max(args.inference_max_batch_size, args.max_batch_size) + warnings.warn( + "`--max-batch-size` has been deprecated in favor of `--inference-max-requests`, " + f"setting maximum batch size to {args.inference_max_batch_size}" + ) + inference_engine = get_inference_engine(args, model) if args.cuda_graph_impl == "local": diff --git a/train_rl.py b/train_rl.py index 4b5cec5fcc8..cfc010b3c04 100644 --- a/train_rl.py +++ b/train_rl.py @@ -370,8 +370,6 @@ def __getitem__(self, idx): if __name__ == "__main__": - from megatron.inference.utils import add_inference_args - # Temporary for transition to core datasets train_valid_test_datasets_provider.is_distributed = True @@ -403,5 +401,4 @@ def _model_builder( ModelType.encoder_or_decoder, forward_step, args_defaults={}, - extra_args_provider=add_inference_args, ) From 
69a5c63241026a61de27905ebc8d3ab803d5d51c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 31 Jan 2026 15:34:56 +0100 Subject: [PATCH 017/231] ci: Fix DSv3 (#3188) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .../golden_values_dev_dgx_h100.json | 11492 ---------------- .../golden_values_dev_dgx_h100.json | 11492 ---------------- .../model_config.yaml} | 0 .../model_config.yaml} | 0 4 files changed, 22984 deletions(-) delete mode 100644 tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etc1cp1_gb_200_release_sm/golden_values_dev_dgx_h100.json delete mode 100644 tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etc1cp1_release_sm/golden_values_dev_dgx_h100.json rename tests/functional_tests/test_cases/mixtral/{deepseekv3_proxy_flex_tp1pp4emp16etc1cp1_gb_200_release_sm/model_config.yml => deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_gb_200_release_sm/model_config.yaml} (100%) rename tests/functional_tests/test_cases/mixtral/{deepseekv3_proxy_flex_tp1pp4emp16etc1cp1_release_sm/model_config.yml => deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_release_sm/model_config.yaml} (100%) diff --git a/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etc1cp1_gb_200_release_sm/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etc1cp1_gb_200_release_sm/golden_values_dev_dgx_h100.json deleted file mode 100644 index f486950e5a2..00000000000 --- a/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etc1cp1_gb_200_release_sm/golden_values_dev_dgx_h100.json +++ /dev/null @@ -1,11492 +0,0 @@ -{ - "lm loss": { - "start_step": 1, - "end_step": 9535, - "step_interval": 5, - "values": { - "1": 13.89756, - "5": 13.89155, - "10": 13.85814, - "15": 13.84947, - "20": 13.74128, - "25": 13.71269, - "30": 13.39136, - "35": 13.32418, - "40": 13.23329, - 
"45": 13.12045, - "50": 12.53632, - "55": 12.35058, - "60": 12.17187, - "65": 12.01029, - "70": 11.83519, - "75": 11.55823, - "80": 11.30557, - "85": 11.11711, - "90": 10.96045, - "95": 10.79835, - "100": 10.58719, - "105": 10.45871, - "110": 10.23985, - "115": 10.03197, - "120": 9.88087, - "125": 9.74001, - "130": 9.64895, - "135": 9.58316, - "140": 9.34895, - "145": 9.3363, - "150": 9.17736, - "155": 9.11162, - "160": 9.02957, - "165": 8.91504, - "170": 8.86399, - "175": 8.82531, - "180": 8.68067, - "185": 8.72019, - "190": 8.59287, - "195": 8.59803, - "200": 8.48665, - "205": 8.39681, - "210": 8.35424, - "215": 8.40636, - "220": 8.27837, - "225": 8.29496, - "230": 8.27773, - "235": 8.20463, - "240": 8.15385, - "245": 8.1344, - "250": 8.06891, - "255": 8.08354, - "260": 7.97761, - "265": 7.96264, - "270": 7.91745, - "275": 7.9055, - "280": 7.89502, - "285": 7.91233, - "290": 7.858, - "295": 7.84326, - "300": 7.73922, - "305": 7.73479, - "310": 7.6998, - "315": 7.6959, - "320": 7.68835, - "325": 7.60857, - "330": 7.59888, - "335": 7.57833, - "340": 7.62257, - "345": 7.51187, - "350": 7.5063, - "355": 7.43406, - "360": 7.53414, - "365": 7.45759, - "370": 7.49186, - "375": 7.43607, - "380": 7.41292, - "385": 7.41117, - "390": 7.42986, - "395": 7.36781, - "400": 7.30747, - "405": 7.31834, - "410": 7.30943, - "415": 7.29421, - "420": 7.2965, - "425": 7.26158, - "430": 7.20979, - "435": 7.22197, - "440": 7.18512, - "445": 7.1687, - "450": 7.12181, - "455": 7.14062, - "460": 7.11041, - "465": 7.10497, - "470": 7.07645, - "475": 7.09742, - "480": 6.97587, - "485": 7.03312, - "490": 6.99478, - "495": 6.9692, - "500": 6.91435, - "505": 6.94713, - "510": 6.92309, - "515": 6.88853, - "520": 6.88024, - "525": 6.87529, - "530": 6.88311, - "535": 6.8642, - "540": 6.78769, - "545": 6.8252, - "550": 6.84568, - "555": 6.86869, - "560": 6.81372, - "565": 6.74969, - "570": 6.76579, - "575": 6.77872, - "580": 6.69766, - "585": 6.71359, - "590": 6.65449, - "595": 6.64792, - "600": 
6.67016, - "605": 6.65924, - "610": 6.63641, - "615": 6.68438, - "620": 6.60355, - "625": 6.57203, - "630": 6.56964, - "635": 6.60732, - "640": 6.59738, - "645": 6.5815, - "650": 6.62582, - "655": 6.62475, - "660": 6.53171, - "665": 6.52224, - "670": 6.47146, - "675": 6.57058, - "680": 6.53989, - "685": 6.49695, - "690": 6.47037, - "695": 6.43685, - "700": 6.43121, - "705": 6.4313, - "710": 6.46058, - "715": 6.46842, - "720": 6.35254, - "725": 6.40344, - "730": 6.39123, - "735": 6.41174, - "740": 6.34886, - "745": 6.31567, - "750": 6.37227, - "755": 6.29068, - "760": 6.30783, - "765": 6.32016, - "770": 6.31539, - "775": 6.3051, - "780": 6.27484, - "785": 6.28635, - "790": 6.25066, - "795": 6.24498, - "800": 6.22595, - "805": 6.30241, - "810": 6.16125, - "815": 6.18921, - "820": 6.19984, - "825": 6.20878, - "830": 6.21184, - "835": 6.16547, - "840": 6.13918, - "845": 6.18907, - "850": 6.14544, - "855": 6.14245, - "860": 6.12573, - "865": 6.14471, - "870": 6.103, - "875": 6.14755, - "880": 6.09503, - "885": 6.08625, - "890": 6.14906, - "895": 6.03612, - "900": 6.06033, - "905": 6.07119, - "910": 6.04765, - "915": 6.02795, - "920": 6.01922, - "925": 6.00762, - "930": 6.04202, - "935": 6.03448, - "940": 5.96552, - "945": 6.00691, - "950": 6.02802, - "955": 5.9757, - "960": 5.9732, - "965": 5.8947, - "970": 5.93848, - "975": 5.94046, - "980": 5.91694, - "985": 5.91057, - "990": 5.96163, - "995": 5.87028, - "1000": 5.89819, - "1005": 5.85552, - "1010": 5.89001, - "1015": 5.91011, - "1020": 5.82121, - "1025": 5.81525, - "1030": 5.82852, - "1035": 5.91121, - "1040": 5.83477, - "1045": 5.80641, - "1050": 5.84029, - "1055": 5.82471, - "1060": 5.77657, - "1065": 5.75965, - "1070": 5.80228, - "1075": 5.78852, - "1080": 5.77993, - "1085": 5.79347, - "1090": 5.7642, - "1095": 5.77727, - "1100": 5.73679, - "1105": 5.71252, - "1110": 5.76864, - "1115": 5.69994, - "1120": 5.64073, - "1125": 5.65212, - "1130": 5.71653, - "1135": 5.67194, - "1140": 5.66144, - "1145": 5.65572, - 
"1150": 5.68319, - "1155": 5.64543, - "1160": 5.63371, - "1165": 5.67226, - "1170": 5.65589, - "1175": 5.62136, - "1180": 5.63006, - "1185": 5.6181, - "1190": 5.60413, - "1195": 5.59825, - "1200": 5.54202, - "1205": 5.65572, - "1210": 5.51312, - "1215": 5.55359, - "1220": 5.63431, - "1225": 5.51403, - "1230": 5.56754, - "1235": 5.521, - "1240": 5.55808, - "1245": 5.52886, - "1250": 5.51046, - "1255": 5.50279, - "1260": 5.50208, - "1265": 5.47964, - "1270": 5.44537, - "1275": 5.52448, - "1280": 5.45447, - "1285": 5.4682, - "1290": 5.43648, - "1295": 5.46181, - "1300": 5.46016, - "1305": 5.43278, - "1310": 5.38271, - "1315": 5.44073, - "1320": 5.42393, - "1325": 5.3568, - "1330": 5.41966, - "1335": 5.39498, - "1340": 5.44678, - "1345": 5.4046, - "1350": 5.3745, - "1355": 5.36722, - "1360": 5.37555, - "1365": 5.38819, - "1370": 5.31687, - "1375": 5.3257, - "1380": 5.37435, - "1385": 5.33822, - "1390": 5.32907, - "1395": 5.35996, - "1400": 5.34708, - "1405": 5.32768, - "1410": 5.30321, - "1415": 5.26874, - "1420": 5.31115, - "1425": 5.3045, - "1430": 5.33954, - "1435": 5.24914, - "1440": 5.27894, - "1445": 5.31118, - "1450": 5.28087, - "1455": 5.30455, - "1460": 5.26455, - "1465": 5.26355, - "1470": 5.29615, - "1475": 5.27116, - "1480": 5.26692, - "1485": 5.21939, - "1490": 5.21283, - "1495": 5.23155, - "1500": 5.23275, - "1505": 5.20436, - "1510": 5.22447, - "1515": 5.15502, - "1520": 5.1852, - "1525": 5.15413, - "1530": 5.17452, - "1535": 5.16098, - "1540": 5.16276, - "1545": 5.19593, - "1550": 5.1989, - "1555": 5.18478, - "1560": 5.1253, - "1565": 5.15973, - "1570": 5.17281, - "1575": 5.1468, - "1580": 5.16002, - "1585": 5.14495, - "1590": 5.12815, - "1595": 5.09691, - "1600": 5.17173, - "1605": 5.09626, - "1610": 5.10506, - "1615": 5.09978, - "1620": 5.1145, - "1625": 5.10983, - "1630": 5.08211, - "1635": 5.12902, - "1640": 5.09565, - "1645": 5.08916, - "1650": 5.08067, - "1655": 5.06625, - "1660": 5.05546, - "1665": 5.04609, - "1670": 5.06711, - "1675": 5.06871, - 
"1680": 5.00775, - "1685": 5.01672, - "1690": 4.99799, - "1695": 5.00065, - "1700": 5.03983, - "1705": 5.01824, - "1710": 5.00629, - "1715": 4.97587, - "1720": 4.97437, - "1725": 4.9984, - "1730": 4.95014, - "1735": 5.02541, - "1740": 4.95266, - "1745": 4.97461, - "1750": 4.95639, - "1755": 4.97133, - "1760": 4.98489, - "1765": 4.93728, - "1770": 4.93343, - "1775": 4.9432, - "1780": 4.96314, - "1785": 4.91574, - "1790": 4.93944, - "1795": 4.93848, - "1800": 4.88725, - "1805": 4.87771, - "1810": 4.8976, - "1815": 4.89801, - "1820": 4.8872, - "1825": 4.89371, - "1830": 4.8786, - "1835": 4.87542, - "1840": 4.87209, - "1845": 4.85811, - "1850": 4.83484, - "1855": 4.89133, - "1860": 4.84322, - "1865": 4.85108, - "1870": 4.82648, - "1875": 4.83877, - "1880": 4.89485, - "1885": 4.84392, - "1890": 4.8281, - "1895": 4.77339, - "1900": 4.81423, - "1905": 4.81232, - "1910": 4.82991, - "1915": 4.79768, - "1920": 4.78308, - "1925": 4.79277, - "1930": 4.76544, - "1935": 4.7941, - "1940": 4.75875, - "1945": 4.80214, - "1950": 4.83843, - "1955": 4.77731, - "1960": 4.76768, - "1965": 4.72596, - "1970": 4.73388, - "1975": 4.7973, - "1980": 4.73036, - "1985": 4.74162, - "1990": 4.78353, - "1995": 4.74959, - "2000": 4.76948, - "2005": 4.80113, - "2010": 4.70951, - "2015": 4.69715, - "2020": 4.71284, - "2025": 4.75821, - "2030": 4.68831, - "2035": 4.71528, - "2040": 4.67772, - "2045": 4.76255, - "2050": 4.74404, - "2055": 4.7077, - "2060": 4.70614, - "2065": 4.66526, - "2070": 4.67653, - "2075": 4.69507, - "2080": 4.66174, - "2085": 4.69911, - "2090": 4.61739, - "2095": 4.64746, - "2100": 4.61666, - "2105": 4.64633, - "2110": 4.64123, - "2115": 4.65336, - "2120": 4.64559, - "2125": 4.61059, - "2130": 4.61466, - "2135": 4.62745, - "2140": 4.6232, - "2145": 4.58124, - "2150": 4.60983, - "2155": 4.57956, - "2160": 4.60382, - "2165": 4.58415, - "2170": 4.61387, - "2175": 4.60275, - "2180": 4.59531, - "2185": 4.60788, - "2190": 4.58246, - "2195": 4.55672, - "2200": 4.55346, - "2205": 
4.56383, - "2210": 4.6146, - "2215": 4.64276, - "2220": 4.59912, - "2225": 4.57263, - "2230": 4.56854, - "2235": 4.61797, - "2240": 4.51401, - "2245": 4.5176, - "2250": 4.52905, - "2255": 4.54117, - "2260": 4.48536, - "2265": 4.56489, - "2270": 4.49655, - "2275": 4.55547, - "2280": 4.51075, - "2285": 4.53333, - "2290": 4.52269, - "2295": 4.52707, - "2300": 4.53228, - "2305": 4.49287, - "2310": 4.53148, - "2315": 4.46329, - "2320": 4.51121, - "2325": 4.49336, - "2330": 4.49351, - "2335": 4.47787, - "2340": 4.48626, - "2345": 4.52525, - "2350": 4.4674, - "2355": 4.47173, - "2360": 4.44099, - "2365": 4.44682, - "2370": 4.44716, - "2375": 4.44199, - "2380": 4.39487, - "2385": 4.43475, - "2390": 4.43071, - "2395": 4.46719, - "2400": 4.42074, - "2405": 4.40081, - "2410": 4.44955, - "2415": 4.42055, - "2420": 4.4293, - "2425": 4.39783, - "2430": 4.42084, - "2435": 4.40291, - "2440": 4.39501, - "2445": 4.40808, - "2450": 4.38239, - "2455": 4.4178, - "2460": 4.36606, - "2465": 4.41327, - "2470": 4.40023, - "2475": 4.41776, - "2480": 4.34092, - "2485": 4.37423, - "2490": 4.37838, - "2495": 4.35662, - "2500": 4.36528, - "2505": 4.37219, - "2510": 4.41251, - "2515": 4.40356, - "2520": 4.34516, - "2525": 4.36214, - "2530": 4.36786, - "2535": 4.36686, - "2540": 4.36548, - "2545": 4.37687, - "2550": 4.30337, - "2555": 4.37244, - "2560": 4.35158, - "2565": 4.30393, - "2570": 4.33393, - "2575": 4.30697, - "2580": 4.30582, - "2585": 4.29358, - "2590": 4.31272, - "2595": 4.28154, - "2600": 4.29867, - "2605": 4.31115, - "2610": 4.32106, - "2615": 4.27768, - "2620": 4.26935, - "2625": 4.30437, - "2630": 4.22434, - "2635": 4.30369, - "2640": 4.30012, - "2645": 4.2581, - "2650": 4.28639, - "2655": 4.26647, - "2660": 4.21474, - "2665": 4.30436, - "2670": 4.26382, - "2675": 4.2306, - "2680": 4.25227, - "2685": 4.25736, - "2690": 4.22986, - "2695": 4.28379, - "2700": 4.19098, - "2705": 4.23853, - "2710": 4.25092, - "2715": 4.23481, - "2720": 4.24356, - "2725": 4.2225, - "2730": 4.22941, - 
"2735": 4.22363, - "2740": 4.20346, - "2745": 4.18765, - "2750": 4.21101, - "2755": 4.22237, - "2760": 4.22902, - "2765": 4.18298, - "2770": 4.23755, - "2775": 4.17706, - "2780": 4.21186, - "2785": 4.19469, - "2790": 4.21736, - "2795": 4.18988, - "2800": 4.1159, - "2805": 4.16613, - "2810": 4.17076, - "2815": 4.15389, - "2820": 4.1969, - "2825": 4.19241, - "2830": 4.16864, - "2835": 4.17046, - "2840": 4.16148, - "2845": 4.14967, - "2850": 4.16619, - "2855": 4.11805, - "2860": 4.14572, - "2865": 4.17023, - "2870": 4.14096, - "2875": 4.1596, - "2880": 4.08582, - "2885": 4.14242, - "2890": 4.11503, - "2895": 4.15452, - "2900": 4.09735, - "2905": 4.11101, - "2910": 4.10798, - "2915": 4.14914, - "2920": 4.12546, - "2925": 4.10099, - "2930": 4.08522, - "2935": 4.07896, - "2940": 4.09225, - "2945": 4.06113, - "2950": 4.03479, - "2955": 4.03763, - "2960": 4.04955, - "2965": 4.0643, - "2970": 4.08593, - "2975": 4.0941, - "2980": 4.03102, - "2985": 4.07394, - "2990": 4.08923, - "2995": 4.03231, - "3000": 4.0436, - "3005": 4.02568, - "3010": 4.06747, - "3015": 4.02305, - "3020": 4.03992, - "3025": 4.02491, - "3030": 4.0567, - "3035": 4.04059, - "3040": 4.0544, - "3045": 4.04677, - "3050": 4.017, - "3055": 4.00507, - "3060": 3.9904, - "3065": 4.02281, - "3070": 4.03826, - "3075": 3.97211, - "3080": 4.0011, - "3085": 4.00548, - "3090": 4.00887, - "3095": 4.02745, - "3100": 4.01465, - "3105": 3.99035, - "3110": 3.99124, - "3115": 3.92509, - "3120": 4.00505, - "3125": 3.94183, - "3130": 3.96987, - "3135": 3.96132, - "3140": 3.95209, - "3145": 3.93524, - "3150": 3.96949, - "3155": 3.96213, - "3160": 3.96255, - "3165": 3.96146, - "3170": 3.96456, - "3175": 3.93165, - "3180": 3.93784, - "3185": 3.90234, - "3190": 3.92455, - "3195": 3.9116, - "3200": 3.89013, - "3205": 3.92029, - "3210": 3.89711, - "3215": 3.90569, - "3220": 3.89706, - "3225": 3.91097, - "3230": 3.89895, - "3235": 3.91122, - "3240": 3.88912, - "3245": 3.88902, - "3250": 3.84407, - "3255": 3.89259, - "3260": 3.88283, 
- "3265": 3.92603, - "3270": 3.9052, - "3275": 3.85915, - "3280": 3.88232, - "3285": 3.86652, - "3290": 3.86681, - "3295": 3.83806, - "3300": 3.85349, - "3305": 3.86048, - "3310": 3.85872, - "3315": 3.89673, - "3320": 3.85179, - "3325": 3.84353, - "3330": 3.82539, - "3335": 3.86213, - "3340": 3.81824, - "3345": 3.83129, - "3350": 3.85901, - "3355": 3.8452, - "3360": 3.83241, - "3365": 3.83682, - "3370": 3.82265, - "3375": 3.85232, - "3380": 3.79563, - "3385": 3.81353, - "3390": 3.79143, - "3395": 3.86888, - "3400": 3.83997, - "3405": 3.86197, - "3410": 3.77529, - "3415": 3.72916, - "3420": 3.80048, - "3425": 3.81237, - "3430": 3.84497, - "3435": 3.80796, - "3440": 3.8267, - "3445": 3.7742, - "3450": 3.78787, - "3455": 3.80217, - "3460": 3.78265, - "3465": 3.75891, - "3470": 3.77341, - "3475": 3.77638, - "3480": 3.77988, - "3485": 3.80588, - "3490": 3.76958, - "3495": 3.80315, - "3500": 3.77047, - "3505": 3.77239, - "3510": 3.75092, - "3515": 3.80896, - "3520": 3.79879, - "3525": 3.76372, - "3530": 3.75322, - "3535": 3.76209, - "3540": 3.81796, - "3545": 3.72915, - "3550": 3.79201, - "3555": 3.72604, - "3560": 3.78622, - "3565": 3.7451, - "3570": 3.74254, - "3575": 3.71868, - "3580": 3.77066, - "3585": 3.76174, - "3590": 3.68853, - "3595": 3.76509, - "3600": 3.71336, - "3605": 3.71948, - "3610": 3.70916, - "3615": 3.74868, - "3620": 3.7837, - "3625": 3.71964, - "3630": 3.76519, - "3635": 3.68617, - "3640": 3.7093, - "3645": 3.74263, - "3650": 3.69638, - "3655": 3.72074, - "3660": 3.72832, - "3665": 3.74694, - "3670": 3.71178, - "3675": 3.71065, - "3680": 3.72416, - "3685": 3.67473, - "3690": 3.6936, - "3695": 3.68528, - "3700": 3.70814, - "3705": 3.67651, - "3710": 3.68493, - "3715": 3.6842, - "3720": 3.66563, - "3725": 3.64716, - "3730": 3.64883, - "3735": 3.68782, - "3740": 3.6732, - "3745": 3.66354, - "3750": 3.6757, - "3755": 3.66351, - "3760": 3.67285, - "3765": 3.66004, - "3770": 3.6516, - "3775": 3.63831, - "3780": 3.62453, - "3785": 3.6765, - "3790": 
3.60163, - "3795": 3.64291, - "3800": 3.63275, - "3805": 3.62032, - "3810": 3.59475, - "3815": 3.63585, - "3820": 3.64099, - "3825": 3.6535, - "3830": 3.63864, - "3835": 3.59938, - "3840": 3.67685, - "3845": 3.65895, - "3850": 3.60064, - "3855": 3.60428, - "3860": 3.65711, - "3865": 3.60867, - "3870": 3.6721, - "3875": 3.58596, - "3880": 3.58212, - "3885": 3.60502, - "3890": 3.60969, - "3895": 3.5558, - "3900": 3.61685, - "3905": 3.59135, - "3910": 3.5772, - "3915": 3.5862, - "3920": 3.57131, - "3925": 3.56751, - "3930": 3.58005, - "3935": 3.5821, - "3940": 3.57511, - "3945": 3.56965, - "3950": 3.61887, - "3955": 3.57531, - "3960": 3.60735, - "3965": 3.58853, - "3970": 3.56735, - "3975": 3.56709, - "3980": 3.5304, - "3985": 3.60527, - "3990": 3.58124, - "3995": 3.60753, - "4000": 3.55811, - "4005": 3.54162, - "4010": 3.58376, - "4015": 3.58398, - "4020": 3.58355, - "4025": 3.57409, - "4030": 3.62855, - "4035": 3.57033, - "4040": 3.5882, - "4045": 3.60161, - "4050": 3.57522, - "4055": 3.57403, - "4060": 3.5888, - "4065": 3.58382, - "4070": 3.51488, - "4075": 3.55887, - "4080": 3.53108, - "4085": 3.54596, - "4090": 3.54584, - "4095": 3.53161, - "4100": 3.55106, - "4105": 3.53794, - "4110": 3.51736, - "4115": 3.56348, - "4120": 3.49648, - "4125": 3.49769, - "4130": 3.55149, - "4135": 3.54373, - "4140": 3.49112, - "4145": 3.51351, - "4150": 3.55497, - "4155": 3.48797, - "4160": 3.54539, - "4165": 3.56451, - "4170": 3.50424, - "4175": 3.50239, - "4180": 3.4998, - "4185": 3.5138, - "4190": 3.5011, - "4195": 3.50044, - "4200": 3.49424, - "4205": 3.53032, - "4210": 3.51921, - "4215": 3.52292, - "4220": 3.53088, - "4225": 3.50168, - "4230": 3.49756, - "4235": 3.52008, - "4240": 3.49249, - "4245": 3.49542, - "4250": 3.48848, - "4255": 3.50707, - "4260": 3.4676, - "4265": 3.48819, - "4270": 3.50473, - "4275": 3.53933, - "4280": 3.48997, - "4285": 3.50947, - "4290": 3.48405, - "4295": 3.48692, - "4300": 3.52631, - "4305": 3.48704, - "4310": 3.51358, - "4315": 3.50638, - 
"4320": 3.50379, - "4325": 3.51699, - "4330": 3.45992, - "4335": 3.49232, - "4340": 3.50354, - "4345": 3.43189, - "4350": 3.44845, - "4355": 3.52327, - "4360": 3.48083, - "4365": 3.47079, - "4370": 3.47624, - "4375": 3.44129, - "4380": 3.44296, - "4385": 3.42527, - "4390": 3.49048, - "4395": 3.47699, - "4400": 3.47442, - "4405": 3.41723, - "4410": 3.48335, - "4415": 3.44899, - "4420": 3.44113, - "4425": 3.47273, - "4430": 3.44742, - "4435": 3.49082, - "4440": 3.48522, - "4445": 3.43744, - "4450": 3.3974, - "4455": 3.4624, - "4460": 3.43415, - "4465": 3.45284, - "4470": 3.42199, - "4475": 3.45352, - "4480": 3.44375, - "4485": 3.43643, - "4490": 3.43453, - "4495": 3.38677, - "4500": 3.45384, - "4505": 3.43515, - "4510": 3.44292, - "4515": 3.40605, - "4520": 3.43888, - "4525": 3.40731, - "4530": 3.44131, - "4535": 3.3963, - "4540": 3.42067, - "4545": 3.43217, - "4550": 3.47418, - "4555": 3.39854, - "4560": 3.42732, - "4565": 3.37837, - "4570": 3.41702, - "4575": 3.41117, - "4580": 3.45362, - "4585": 3.42636, - "4590": 3.42388, - "4595": 3.39853, - "4600": 3.39686, - "4605": 3.42144, - "4610": 3.41286, - "4615": 3.45309, - "4620": 3.39526, - "4625": 3.42534, - "4630": 3.4127, - "4635": 3.39195, - "4640": 3.4264, - "4645": 3.41975, - "4650": 3.43542, - "4655": 3.40687, - "4660": 3.39737, - "4665": 3.41231, - "4670": 3.446, - "4675": 3.40423, - "4680": 3.42886, - "4685": 3.42464, - "4690": 3.39897, - "4695": 3.38, - "4700": 3.3729, - "4705": 3.35029, - "4710": 3.40571, - "4715": 3.39222, - "4720": 3.38774, - "4725": 3.35968, - "4730": 3.39519, - "4735": 3.32069, - "4740": 3.36458, - "4745": 3.40698, - "4750": 3.36053, - "4755": 3.39053, - "4760": 3.41421, - "4765": 3.36022, - "4770": 3.36502, - "4775": 3.36135, - "4780": 3.37362, - "4785": 3.374, - "4790": 3.41163, - "4795": 3.39334, - "4800": 3.34583, - "4805": 3.41139, - "4810": 3.35086, - "4815": 3.38903, - "4820": 3.34814, - "4825": 3.40406, - "4830": 3.38314, - "4835": 3.3693, - "4840": 3.38086, - "4845": 3.32726, - 
"4850": 3.39372, - "4855": 3.39679, - "4860": 3.32727, - "4865": 3.36392, - "4870": 3.34896, - "4875": 3.39123, - "4880": 3.39974, - "4885": 3.35153, - "4890": 3.36191, - "4895": 3.35318, - "4900": 3.32971, - "4905": 3.33008, - "4910": 3.32861, - "4915": 3.37524, - "4920": 3.35807, - "4925": 3.31242, - "4930": 3.34376, - "4935": 3.3273, - "4940": 3.28784, - "4945": 3.36034, - "4950": 3.29629, - "4955": 3.40365, - "4960": 3.3479, - "4965": 3.34204, - "4970": 3.33369, - "4975": 3.34388, - "4980": 3.36573, - "4985": 3.35352, - "4990": 3.33542, - "4995": 3.3795, - "5000": 3.30893, - "5005": 3.35715, - "5010": 3.36146, - "5015": 3.30923, - "5020": 3.28653, - "5025": 3.31605, - "5030": 3.32648, - "5035": 3.32963, - "5040": 3.30481, - "5045": 3.34994, - "5050": 3.30693, - "5055": 3.32632, - "5060": 3.28843, - "5065": 3.33396, - "5070": 3.33431, - "5075": 3.34337, - "5080": 3.31868, - "5085": 3.34518, - "5090": 3.32323, - "5095": 3.29022, - "5100": 3.32026, - "5105": 3.32744, - "5110": 3.3329, - "5115": 3.3038, - "5120": 3.34196, - "5125": 3.3184, - "5130": 3.31738, - "5135": 3.30105, - "5140": 3.3111, - "5145": 3.31125, - "5150": 3.32063, - "5155": 3.31567, - "5160": 3.31039, - "5165": 3.34534, - "5170": 3.23105, - "5175": 3.31877, - "5180": 3.28445, - "5185": 3.30691, - "5190": 3.32611, - "5195": 3.30561, - "5200": 3.31019, - "5205": 3.34654, - "5210": 3.28506, - "5215": 3.2874, - "5220": 3.28219, - "5225": 3.28677, - "5230": 3.32011, - "5235": 3.27975, - "5240": 3.27349, - "5245": 3.29646, - "5250": 3.3023, - "5255": 3.28615, - "5260": 3.31039, - "5265": 3.27007, - "5270": 3.25412, - "5275": 3.25534, - "5280": 3.28407, - "5285": 3.30874, - "5290": 3.2589, - "5295": 3.27448, - "5300": 3.27858, - "5305": 3.26656, - "5310": 3.32809, - "5315": 3.25873, - "5320": 3.30633, - "5325": 3.3111, - "5330": 3.27899, - "5335": 3.28833, - "5340": 3.23016, - "5345": 3.28336, - "5350": 3.28737, - "5355": 3.28737, - "5360": 3.23407, - "5365": 3.25011, - "5370": 3.28855, - "5375": 
3.26985, - "5380": 3.24418, - "5385": 3.28394, - "5390": 3.28221, - "5395": 3.20448, - "5400": 3.30114, - "5405": 3.21525, - "5410": 3.29188, - "5415": 3.22284, - "5420": 3.25707, - "5425": 3.23689, - "5430": 3.24779, - "5435": 3.2811, - "5440": 3.21236, - "5445": 3.24176, - "5450": 3.24576, - "5455": 3.22991, - "5460": 3.25196, - "5465": 3.29692, - "5470": 3.27194, - "5475": 3.20136, - "5480": 3.28214, - "5485": 3.24325, - "5490": 3.26633, - "5495": 3.27183, - "5500": 3.22718, - "5505": 3.23914, - "5510": 3.28342, - "5515": 3.27035, - "5520": 3.23742, - "5525": 3.28473, - "5530": 3.22923, - "5535": 3.26258, - "5540": 3.25366, - "5545": 3.26198, - "5550": 3.24962, - "5555": 3.22875, - "5560": 3.22306, - "5565": 3.26845, - "5570": 3.22989, - "5575": 3.26435, - "5580": 3.23553, - "5585": 3.18594, - "5590": 3.24664, - "5595": 3.2105, - "5600": 3.25488, - "5605": 3.17461, - "5610": 3.2604, - "5615": 3.25606, - "5620": 3.2609, - "5625": 3.25214, - "5630": 3.24091, - "5635": 3.21924, - "5640": 3.24377, - "5645": 3.20743, - "5650": 3.2076, - "5655": 3.20542, - "5660": 3.20971, - "5665": 3.21069, - "5670": 3.20056, - "5675": 3.22863, - "5680": 3.19922, - "5685": 3.20573, - "5690": 3.2077, - "5695": 3.24414, - "5700": 3.19628, - "5705": 3.18515, - "5710": 3.17855, - "5715": 3.28582, - "5720": 3.2496, - "5725": 3.2002, - "5730": 3.24085, - "5735": 3.22905, - "5740": 3.22477, - "5745": 3.20281, - "5750": 3.23329, - "5755": 3.23832, - "5760": 3.22288, - "5765": 3.22651, - "5770": 3.25303, - "5775": 3.19712, - "5780": 3.21565, - "5785": 3.21756, - "5790": 3.22715, - "5795": 3.22463, - "5800": 3.16888, - "5805": 3.18332, - "5810": 3.22432, - "5815": 3.20302, - "5820": 3.16241, - "5825": 3.20754, - "5830": 3.1647, - "5835": 3.17395, - "5840": 3.20628, - "5845": 3.217, - "5850": 3.21594, - "5855": 3.15148, - "5860": 3.17119, - "5865": 3.20009, - "5870": 3.16136, - "5875": 3.20014, - "5880": 3.19456, - "5885": 3.19488, - "5890": 3.21776, - "5895": 3.23301, - "5900": 3.1895, - 
"5905": 3.21986, - "5910": 3.20185, - "5915": 3.17464, - "5920": 3.1915, - "5925": 3.15681, - "5930": 3.19135, - "5935": 3.19128, - "5940": 3.2051, - "5945": 3.21968, - "5950": 3.20213, - "5955": 3.16275, - "5960": 3.22598, - "5965": 3.17666, - "5970": 3.21828, - "5975": 3.18539, - "5980": 3.25556, - "5985": 3.14035, - "5990": 3.2373, - "5995": 3.15341, - "6000": 3.17562, - "6005": 3.15642, - "6010": 3.15958, - "6015": 3.16383, - "6020": 3.17057, - "6025": 3.20846, - "6030": 3.14683, - "6035": 3.20108, - "6040": 3.18034, - "6045": 3.19784, - "6050": 3.19841, - "6055": 3.17123, - "6060": 3.18513, - "6065": 3.20946, - "6070": 3.16514, - "6075": 3.13204, - "6080": 3.19182, - "6085": 3.15022, - "6090": 3.18799, - "6095": 3.18454, - "6100": 3.13968, - "6105": 3.18911, - "6110": 3.13194, - "6115": 3.18032, - "6120": 3.17268, - "6125": 3.17817, - "6130": 3.16826, - "6135": 3.16641, - "6140": 3.16491, - "6145": 3.14203, - "6150": 3.17849, - "6155": 3.14973, - "6160": 3.12836, - "6165": 3.15943, - "6170": 3.14366, - "6175": 3.14619, - "6180": 3.14564, - "6185": 3.18694, - "6190": 3.15491, - "6195": 3.12582, - "6200": 3.15218, - "6205": 3.14598, - "6210": 3.10092, - "6215": 3.15518, - "6220": 3.1544, - "6225": 3.17142, - "6230": 3.10668, - "6235": 3.14063, - "6240": 3.08394, - "6245": 3.18223, - "6250": 3.14309, - "6255": 3.15773, - "6260": 3.14125, - "6265": 3.15597, - "6270": 3.10065, - "6275": 3.12382, - "6280": 3.13503, - "6285": 3.11829, - "6290": 3.14415, - "6295": 3.15298, - "6300": 3.15403, - "6305": 3.21086, - "6310": 3.11266, - "6315": 3.10982, - "6320": 3.16047, - "6325": 3.10246, - "6330": 3.16954, - "6335": 3.15391, - "6340": 3.10904, - "6345": 3.16578, - "6350": 3.11808, - "6355": 3.11742, - "6360": 3.1108, - "6365": 3.14775, - "6370": 3.16278, - "6375": 3.1337, - "6380": 3.15125, - "6385": 3.17081, - "6390": 3.12597, - "6395": 3.10466, - "6400": 3.10591, - "6405": 3.18617, - "6410": 3.17298, - "6415": 3.12537, - "6420": 3.17096, - "6425": 3.17458, - "6430": 
3.16659, - "6435": 3.12451, - "6440": 3.13606, - "6445": 3.15196, - "6450": 3.09161, - "6455": 3.08666, - "6460": 3.13082, - "6465": 3.16786, - "6470": 3.13951, - "6475": 3.13285, - "6480": 3.15191, - "6485": 3.11206, - "6490": 3.0797, - "6495": 3.16564, - "6500": 3.14177, - "6505": 3.08566, - "6510": 3.14483, - "6515": 3.16369, - "6520": 3.09044, - "6525": 3.14867, - "6530": 3.10896, - "6535": 3.12403, - "6540": 3.18005, - "6545": 3.11404, - "6550": 3.11103, - "6555": 3.10947, - "6560": 3.0737, - "6565": 3.07934, - "6570": 3.10438, - "6575": 3.05844, - "6580": 3.17411, - "6585": 3.10694, - "6590": 3.0877, - "6595": 3.10332, - "6600": 3.1032, - "6605": 3.08625, - "6610": 3.08405, - "6615": 3.1316, - "6620": 3.076, - "6625": 3.09705, - "6630": 3.09309, - "6635": 3.12933, - "6640": 3.08864, - "6645": 3.10948, - "6650": 3.1378, - "6655": 3.07416, - "6660": 3.11313, - "6665": 3.12487, - "6670": 3.08048, - "6675": 3.10457, - "6680": 3.10673, - "6685": 3.14077, - "6690": 3.11651, - "6695": 3.12176, - "6700": 3.1127, - "6705": 3.09107, - "6710": 3.10728, - "6715": 3.05842, - "6720": 3.13504, - "6725": 3.12621, - "6730": 3.1099, - "6735": 3.10898, - "6740": 3.11731, - "6745": 3.0901, - "6750": 3.10983, - "6755": 3.06749, - "6760": 3.06624, - "6765": 3.08509, - "6770": 3.07057, - "6775": 3.10523, - "6780": 3.07455, - "6785": 3.07959, - "6790": 3.10472, - "6795": 3.07166, - "6800": 3.09692, - "6805": 3.08719, - "6810": 3.10858, - "6815": 3.04354, - "6820": 3.07401, - "6825": 3.10257, - "6830": 3.08637, - "6835": 3.06002, - "6840": 3.0654, - "6845": 3.11054, - "6850": 3.08009, - "6855": 3.11065, - "6860": 3.06305, - "6865": 3.10876, - "6870": 3.07538, - "6875": 3.07578, - "6880": 3.08642, - "6885": 3.05135, - "6890": 3.0749, - "6895": 3.05299, - "6900": 3.05973, - "6905": 3.07506, - "6910": 3.09159, - "6915": 3.11333, - "6920": 3.06615, - "6925": 3.08379, - "6930": 3.06742, - "6935": 3.02485, - "6940": 3.06623, - "6945": 3.05639, - "6950": 3.07964, - "6955": 3.05853, - 
"6960": 3.05554, - "6965": 3.09907, - "6970": 3.03589, - "6975": 3.1075, - "6980": 3.06776, - "6985": 3.06784, - "6990": 3.11146, - "6995": 3.09126, - "7000": 3.02783, - "7005": 3.09757, - "7010": 3.0779, - "7015": 3.07385, - "7020": 3.10018, - "7025": 3.08417, - "7030": 3.08746, - "7035": 3.04096, - "7040": 3.01984, - "7045": 3.07968, - "7050": 3.09817, - "7055": 3.03816, - "7060": 3.09848, - "7065": 3.11109, - "7070": 3.05748, - "7075": 3.06319, - "7080": 3.11208, - "7085": 3.03557, - "7090": 3.05692, - "7095": 3.04652, - "7100": 3.07149, - "7105": 3.02035, - "7110": 3.0623, - "7115": 3.03547, - "7120": 3.07999, - "7125": 3.03377, - "7130": 3.04883, - "7135": 3.05627, - "7140": 3.06014, - "7145": 3.0691, - "7150": 3.02375, - "7155": 3.08612, - "7160": 3.0047, - "7165": 3.0418, - "7170": 3.07701, - "7175": 3.03661, - "7180": 3.07042, - "7185": 3.09125, - "7190": 3.05302, - "7195": 3.06058, - "7200": 3.06039, - "7205": 3.04153, - "7210": 3.08703, - "7215": 3.06723, - "7220": 3.08798, - "7225": 3.06993, - "7230": 3.07403, - "7235": 3.05435, - "7240": 3.05017, - "7245": 3.07131, - "7250": 3.01274, - "7255": 3.03229, - "7260": 3.06928, - "7265": 3.00261, - "7270": 3.04138, - "7275": 3.04223, - "7280": 3.04181, - "7285": 3.05407, - "7290": 3.07344, - "7295": 3.06537, - "7300": 3.02809, - "7305": 3.02877, - "7310": 3.04926, - "7315": 3.07646, - "7320": 3.05669, - "7325": 3.06149, - "7330": 3.02592, - "7335": 3.02733, - "7340": 3.06004, - "7345": 3.0091, - "7350": 3.06031, - "7355": 3.04495, - "7360": 3.03923, - "7365": 3.03845, - "7370": 3.03136, - "7375": 2.9999, - "7380": 3.06202, - "7385": 3.07693, - "7390": 3.06411, - "7395": 3.02221, - "7400": 3.07516, - "7405": 3.04382, - "7410": 3.06023, - "7415": 3.05228, - "7420": 3.03261, - "7425": 3.08586, - "7430": 3.0272, - "7435": 3.01757, - "7440": 3.0377, - "7445": 3.01394, - "7450": 2.99482, - "7455": 3.04735, - "7460": 3.04105, - "7465": 3.04977, - "7470": 3.05673, - "7475": 3.06741, - "7480": 3.02749, - "7485": 
2.98653, - "7490": 2.98973, - "7495": 2.99863, - "7500": 3.02945, - "7505": 3.0059, - "7510": 2.97871, - "7515": 3.02404, - "7520": 3.01697, - "7525": 2.98295, - "7530": 3.02636, - "7535": 3.04423, - "7540": 3.02494, - "7545": 3.0588, - "7550": 3.06534, - "7555": 3.00732, - "7560": 3.01283, - "7565": 3.00874, - "7570": 3.03442, - "7575": 2.97962, - "7580": 3.03034, - "7585": 3.01793, - "7590": 3.01504, - "7595": 3.07403, - "7600": 3.03015, - "7605": 3.02144, - "7610": 3.00533, - "7615": 2.99602, - "7620": 2.99265, - "7625": 3.03762, - "7630": 3.02026, - "7635": 3.01854, - "7640": 3.01712, - "7645": 3.04845, - "7650": 3.04439, - "7655": 3.08975, - "7660": 2.96325, - "7665": 3.02969, - "7670": 3.01245, - "7675": 3.00305, - "7680": 2.9998, - "7685": 3.07016, - "7690": 3.01368, - "7695": 2.99671, - "7700": 3.05056, - "7705": 3.01282, - "7710": 3.05828, - "7715": 2.99725, - "7720": 3.08276, - "7725": 2.98411, - "7730": 2.99881, - "7735": 3.02714, - "7740": 3.00979, - "7745": 3.00319, - "7750": 3.01, - "7755": 3.01954, - "7760": 2.98571, - "7765": 3.00397, - "7770": 3.02732, - "7775": 2.98978, - "7780": 2.97862, - "7785": 3.01472, - "7790": 2.99842, - "7795": 3.02413, - "7800": 3.00827, - "7805": 3.01176, - "7810": 3.03082, - "7815": 3.00244, - "7820": 3.0019, - "7825": 3.03231, - "7830": 3.03143, - "7835": 2.96605, - "7840": 3.04336, - "7845": 2.97937, - "7850": 2.93977, - "7855": 2.98529, - "7860": 2.98344, - "7865": 3.02956, - "7870": 2.9691, - "7875": 2.98838, - "7880": 3.00349, - "7885": 2.9968, - "7890": 3.03811, - "7895": 3.02857, - "7900": 3.03097, - "7905": 2.99876, - "7910": 3.0088, - "7915": 3.02527, - "7920": 3.01259, - "7925": 2.99646, - "7930": 3.02866, - "7935": 2.98913, - "7940": 3.03573, - "7945": 3.0501, - "7950": 2.96381, - "7955": 2.98711, - "7960": 2.96943, - "7965": 2.94566, - "7970": 2.9655, - "7975": 2.99544, - "7980": 3.00887, - "7985": 2.97698, - "7990": 2.97506, - "7995": 2.96124, - "8000": 3.02098, - "8005": 2.9801, - "8010": 2.97649, - 
"8015": 2.96466, - "8020": 2.97779, - "8025": 2.95601, - "8030": 2.97562, - "8035": 2.97196, - "8040": 2.95703, - "8045": 3.01604, - "8050": 3.01297, - "8055": 2.97453, - "8060": 3.00494, - "8065": 2.98862, - "8070": 2.96753, - "8075": 2.97734, - "8080": 3.01019, - "8085": 2.96754, - "8090": 2.98003, - "8095": 3.00216, - "8100": 2.95105, - "8105": 2.99247, - "8110": 2.98157, - "8115": 2.95999, - "8120": 2.97249, - "8125": 2.99946, - "8130": 2.97003, - "8135": 2.98766, - "8140": 2.96736, - "8145": 2.95939, - "8150": 2.98009, - "8155": 2.95146, - "8160": 2.997, - "8165": 2.9913, - "8170": 2.95554, - "8175": 2.95554, - "8180": 3.01376, - "8185": 2.98624, - "8190": 3.02032, - "8195": 2.99613, - "8200": 2.96412, - "8205": 2.97566, - "8210": 2.9781, - "8215": 2.99017, - "8220": 2.971, - "8225": 2.96329, - "8230": 2.99505, - "8235": 3.00306, - "8240": 2.97419, - "8245": 2.9738, - "8250": 3.00958, - "8255": 2.96716, - "8260": 2.97331, - "8265": 2.95555, - "8270": 2.97514, - "8275": 2.96718, - "8280": 2.94092, - "8285": 2.97838, - "8290": 2.96734, - "8295": 2.95246, - "8300": 2.96504, - "8305": 2.97504, - "8310": 2.97996, - "8315": 2.95732, - "8320": 2.97776, - "8325": 2.929, - "8330": 2.89908, - "8335": 2.96646, - "8340": 2.99201, - "8345": 2.94463, - "8350": 2.95886, - "8355": 2.98631, - "8360": 2.96643, - "8365": 2.98326, - "8370": 2.99094, - "8375": 2.93854, - "8380": 2.94099, - "8385": 2.97126, - "8390": 2.9453, - "8395": 2.97523, - "8400": 2.95927, - "8405": 2.97418, - "8410": 3.03057, - "8415": 2.93533, - "8420": 2.91801, - "8425": 2.97564, - "8430": 2.97808, - "8435": 2.93124, - "8440": 3.01239, - "8445": 2.99121, - "8450": 2.96616, - "8455": 2.97106, - "8460": 2.97975, - "8465": 2.92562, - "8470": 2.94697, - "8475": 2.99054, - "8480": 2.93097, - "8485": 2.93977, - "8490": 2.948, - "8495": 2.93336, - "8500": 2.96904, - "8505": 2.92233, - "8510": 3.00332, - "8515": 2.94052, - "8520": 2.95755, - "8525": 2.88522, - "8530": 2.95834, - "8535": 2.97603, - "8540": 2.93194, 
- "8545": 2.95741, - "8550": 2.92307, - "8555": 2.98961, - "8560": 2.99424, - "8565": 2.9514, - "8570": 2.94707, - "8575": 2.93509, - "8580": 2.9669, - "8585": 2.976, - "8590": 2.97659, - "8595": 2.97731, - "8600": 2.94787, - "8605": 2.94545, - "8610": 2.95479, - "8615": 2.96032, - "8620": 2.92346, - "8625": 2.94581, - "8630": 2.95087, - "8635": 2.94522, - "8640": 2.92578, - "8645": 2.98133, - "8650": 2.92232, - "8655": 2.96592, - "8660": 2.97073, - "8665": 2.95471, - "8670": 2.96657, - "8675": 2.93996, - "8680": 2.93576, - "8685": 2.94815, - "8690": 2.96442, - "8695": 2.97067, - "8700": 2.94799, - "8705": 2.91745, - "8710": 2.96979, - "8715": 2.91522, - "8720": 2.97447, - "8725": 2.94876, - "8730": 2.94256, - "8735": 2.97158, - "8740": 2.92587, - "8745": 2.96492, - "8750": 2.96628, - "8755": 2.93098, - "8760": 2.94924, - "8765": 2.91354, - "8770": 2.96822, - "8775": 2.94219, - "8780": 2.92859, - "8785": 2.94726, - "8790": 2.92803, - "8795": 2.96489, - "8800": 2.92662, - "8805": 2.90115, - "8810": 2.93145, - "8815": 2.93283, - "8820": 2.90387, - "8825": 2.92443, - "8830": 2.91245, - "8835": 2.89847, - "8840": 2.91518, - "8845": 2.92785, - "8850": 2.95695, - "8855": 2.92839, - "8860": 2.98878, - "8865": 2.93356, - "8870": 2.90865, - "8875": 2.92162, - "8880": 2.9295, - "8885": 2.9207, - "8890": 2.9404, - "8895": 2.92179, - "8900": 2.94464, - "8905": 2.93594, - "8910": 2.91993, - "8915": 2.90336, - "8920": 2.91127, - "8925": 2.97428, - "8930": 2.96209, - "8935": 2.97189, - "8940": 2.94882, - "8945": 2.94789, - "8950": 2.9328, - "8955": 2.91679, - "8960": 2.89858, - "8965": 2.92721, - "8970": 2.94082, - "8975": 2.90449, - "8980": 2.89797, - "8985": 2.92102, - "8990": 2.9662, - "8995": 2.9373, - "9000": 2.89467, - "9005": 2.9399, - "9010": 2.97901, - "9015": 2.90311, - "9020": 2.90423, - "9025": 2.92238, - "9030": 2.94518, - "9035": 2.85736, - "9040": 2.93491, - "9045": 2.92378, - "9050": 2.96087, - "9055": 2.88884, - "9060": 2.95609, - "9065": 2.98682, - "9070": 
2.92665, - "9075": 2.94254, - "9080": 2.93301, - "9085": 2.9439, - "9090": 2.93648, - "9095": 2.89849, - "9100": 2.90017, - "9105": 2.89, - "9110": 2.93211, - "9115": 2.93981, - "9120": 2.97397, - "9125": 2.91648, - "9130": 2.92277, - "9135": 2.94086, - "9140": 2.94695, - "9145": 2.89447, - "9150": 2.92217, - "9155": 2.93169, - "9160": 2.93686, - "9165": 2.92557, - "9170": 2.9498, - "9175": 2.88716, - "9180": 2.93307, - "9185": 2.8947, - "9190": 2.94894, - "9195": 2.91222, - "9200": 2.93251, - "9205": 2.88702, - "9210": 2.93304, - "9215": 2.87965, - "9220": 2.90288, - "9225": 2.93315, - "9230": 2.86569, - "9235": 2.87842, - "9240": 2.89576, - "9245": 2.88279, - "9250": 2.88136, - "9255": 2.91192, - "9260": 2.87817, - "9265": 2.92175, - "9270": 2.89613, - "9275": 2.91313, - "9280": 2.91939, - "9285": 2.91903, - "9290": 2.93047, - "9295": 2.92844, - "9300": 2.87877, - "9305": 2.90909, - "9310": 2.89871, - "9315": 2.86609, - "9320": 2.86065, - "9325": 2.90436, - "9330": 2.95511, - "9335": 2.87572, - "9340": 2.93845, - "9345": 2.94693, - "9350": 2.9134, - "9355": 2.87737, - "9360": 2.89674, - "9365": 2.8823, - "9370": 2.93386, - "9375": 2.91236, - "9380": 2.86428, - "9385": 2.91358, - "9390": 2.92324, - "9395": 2.92024, - "9400": 2.89599, - "9405": 2.89197, - "9410": 2.9185, - "9415": 2.91775, - "9420": 2.89381, - "9425": 2.89983, - "9430": 2.87833, - "9435": 2.90417, - "9440": 2.89629, - "9445": 2.88366, - "9450": 2.89069, - "9455": 2.88969, - "9460": 2.94442, - "9465": 2.94721, - "9470": 2.88553, - "9475": 2.94033, - "9480": 2.88982, - "9485": 2.87815, - "9490": 2.89723, - "9495": 2.9225, - "9500": 2.89514, - "9505": 2.86794, - "9510": 2.894, - "9515": 2.90369, - "9520": 2.91102, - "9525": 2.89095, - "9530": 2.88696, - "9535": 2.91216 - } - }, - "num-zeros": { - "start_step": 1, - "end_step": 9535, - "step_interval": 5, - "values": { - "1": 1021640256.0, - "5": 1024063424.0, - "10": 1014250560.0, - "15": 1024077504.0, - "20": 1022486144.0, - "25": 1041373312.0, - 
"30": 1028112896.0, - "35": 1035625088.0, - "40": 1026328384.0, - "45": 1022350080.0, - "50": 1030098560.0, - "55": 1028966144.0, - "60": 1036320640.0, - "65": 1034679168.0, - "70": 1029374848.0, - "75": 1028745088.0, - "80": 1047575040.0, - "85": 1029448064.0, - "90": 1020467392.0, - "95": 1028310016.0, - "100": 1040961344.0, - "105": 1039436544.0, - "110": 1026879104.0, - "115": 1052312832.0, - "120": 1018863104.0, - "125": 1045372160.0, - "130": 1034330368.0, - "135": 1016615680.0, - "140": 1038582272.0, - "145": 1020688640.0, - "150": 1039788096.0, - "155": 1032796928.0, - "160": 1020952640.0, - "165": 1032424512.0, - "170": 1017396096.0, - "175": 1033427072.0, - "180": 1036119424.0, - "185": 1030573760.0, - "190": 1035673984.0, - "195": 1034555520.0, - "200": 1040973824.0, - "205": 1048500352.0, - "210": 1054481024.0, - "215": 1025159552.0, - "220": 1044962496.0, - "225": 1038076416.0, - "230": 1026222720.0, - "235": 1051134976.0, - "240": 1029276416.0, - "245": 1031397824.0, - "250": 1027879616.0, - "255": 1016929792.0, - "260": 1045008896.0, - "265": 1021330688.0, - "270": 1030964864.0, - "275": 1036911744.0, - "280": 1031743488.0, - "285": 1015014016.0, - "290": 1018756352.0, - "295": 1017237504.0, - "300": 1034761152.0, - "305": 1032166144.0, - "310": 1035583104.0, - "315": 1012734272.0, - "320": 1008275072.0, - "325": 1042741760.0, - "330": 1042870656.0, - "335": 1033508480.0, - "340": 1014464512.0, - "345": 1042618880.0, - "350": 1031852736.0, - "355": 1050844800.0, - "360": 1030258432.0, - "365": 1034595648.0, - "370": 1019436032.0, - "375": 1022144832.0, - "380": 1021326592.0, - "385": 1025589504.0, - "390": 1023195072.0, - "395": 1019653952.0, - "400": 1033520512.0, - "405": 1023880192.0, - "410": 1017910016.0, - "415": 1024288000.0, - "420": 1020624256.0, - "425": 1025854848.0, - "430": 1033854336.0, - "435": 1028182400.0, - "440": 1022090752.0, - "445": 1036768256.0, - "450": 1024997376.0, - "455": 1013852096.0, - "460": 1022093824.0, - "465": 
1041431552.0, - "470": 1029038016.0, - "475": 1010065792.0, - "480": 1047607616.0, - "485": 1029724928.0, - "490": 1044668160.0, - "495": 1025229952.0, - "500": 1037464960.0, - "505": 1032181376.0, - "510": 1042853056.0, - "515": 1026159744.0, - "520": 1013409792.0, - "525": 1035147520.0, - "530": 1016375552.0, - "535": 1040113024.0, - "540": 1035052352.0, - "545": 1032113664.0, - "550": 1018673408.0, - "555": 1008638656.0, - "560": 1011927680.0, - "565": 1041824320.0, - "570": 1034942208.0, - "575": 1010199040.0, - "580": 1032210496.0, - "585": 1041262144.0, - "590": 1038867968.0, - "595": 1035743104.0, - "600": 1023772736.0, - "605": 1032294272.0, - "610": 1037748672.0, - "615": 1005974784.0, - "620": 1040407424.0, - "625": 1045209216.0, - "630": 1034414464.0, - "635": 1028523008.0, - "640": 1022644928.0, - "645": 1035876032.0, - "650": 1009255680.0, - "655": 997757696.0, - "660": 1029710464.0, - "665": 1025532608.0, - "670": 1048812288.0, - "675": 1025202688.0, - "680": 1019340032.0, - "685": 1027832512.0, - "690": 1029230080.0, - "695": 1040024576.0, - "700": 1042031680.0, - "705": 1034382976.0, - "710": 1020441792.0, - "715": 1031472128.0, - "720": 1040274560.0, - "725": 1023279936.0, - "730": 1022792704.0, - "735": 1025085696.0, - "740": 1038382656.0, - "745": 1045205504.0, - "750": 1013180928.0, - "755": 1031644032.0, - "760": 1032783552.0, - "765": 1027135936.0, - "770": 1023967232.0, - "775": 1025895168.0, - "780": 1038166464.0, - "785": 1025486400.0, - "790": 1040810624.0, - "795": 1032531200.0, - "800": 1039592768.0, - "805": 1024318016.0, - "810": 1034725632.0, - "815": 1036000448.0, - "820": 1035671552.0, - "825": 1051375360.0, - "830": 1035406784.0, - "835": 1022547776.0, - "840": 1036875648.0, - "845": 1025700352.0, - "850": 1048529920.0, - "855": 1014986432.0, - "860": 1033098624.0, - "865": 1031543040.0, - "870": 1040902912.0, - "875": 1023938304.0, - "880": 1028395904.0, - "885": 1054406656.0, - "890": 1019537152.0, - "895": 1045189824.0, - "900": 
1031772928.0, - "905": 1020970688.0, - "910": 1031386112.0, - "915": 1032926912.0, - "920": 1038459392.0, - "925": 1026754560.0, - "930": 1025378752.0, - "935": 1031126464.0, - "940": 1057933568.0, - "945": 1029823104.0, - "950": 1014412480.0, - "955": 1032173696.0, - "960": 1026152064.0, - "965": 1062678976.0, - "970": 1030096128.0, - "975": 1036903680.0, - "980": 1027049216.0, - "985": 1030676736.0, - "990": 1020676864.0, - "995": 1042301760.0, - "1000": 1036831616.0, - "1005": 1050206080.0, - "1010": 1023801984.0, - "1015": 1020539008.0, - "1020": 1042587392.0, - "1025": 1037943808.0, - "1030": 1049210048.0, - "1035": 1012483456.0, - "1040": 1023092032.0, - "1045": 1039520768.0, - "1050": 1026825728.0, - "1055": 1034861184.0, - "1060": 1046128704.0, - "1065": 1036804096.0, - "1070": 1019994880.0, - "1075": 1025341696.0, - "1080": 1014979200.0, - "1085": 1030007744.0, - "1090": 1029062016.0, - "1095": 1020309888.0, - "1100": 1039835008.0, - "1105": 1048600064.0, - "1110": 1020704448.0, - "1115": 1024782720.0, - "1120": 1061896576.0, - "1125": 1043311616.0, - "1130": 1031219456.0, - "1135": 1041360512.0, - "1140": 1021486272.0, - "1145": 1051696128.0, - "1150": 1035590400.0, - "1155": 1029590528.0, - "1160": 1042564800.0, - "1165": 1026810496.0, - "1170": 1018001408.0, - "1175": 1033684032.0, - "1180": 1035633536.0, - "1185": 1023928960.0, - "1190": 1033160320.0, - "1195": 1024228608.0, - "1200": 1039116544.0, - "1205": 1031740800.0, - "1210": 1053250560.0, - "1215": 1024617600.0, - "1220": 1009041280.0, - "1225": 1036679680.0, - "1230": 1041257984.0, - "1235": 1053974912.0, - "1240": 1030356224.0, - "1245": 1017684864.0, - "1250": 1022772992.0, - "1255": 1033439104.0, - "1260": 1034284736.0, - "1265": 1034003840.0, - "1270": 1037323264.0, - "1275": 1029345792.0, - "1280": 1046489856.0, - "1285": 1028285120.0, - "1290": 1036578176.0, - "1295": 1032421696.0, - "1300": 1033065728.0, - "1305": 1030027008.0, - "1310": 1051262976.0, - "1315": 1035373184.0, - "1320": 
1028263936.0, - "1325": 1049972736.0, - "1330": 1030133376.0, - "1335": 1031164800.0, - "1340": 1012758912.0, - "1345": 1044639232.0, - "1350": 1034957312.0, - "1355": 1033623744.0, - "1360": 1036683392.0, - "1365": 1038588672.0, - "1370": 1039851904.0, - "1375": 1034117632.0, - "1380": 1022886656.0, - "1385": 1018084096.0, - "1390": 1049054400.0, - "1395": 1034868352.0, - "1400": 1034998144.0, - "1405": 1034131456.0, - "1410": 1036368256.0, - "1415": 1043577600.0, - "1420": 1026111104.0, - "1425": 1033320320.0, - "1430": 1012808128.0, - "1435": 1038394880.0, - "1440": 1020971904.0, - "1445": 1032459904.0, - "1450": 1014039296.0, - "1455": 1011673984.0, - "1460": 1043275904.0, - "1465": 1014361600.0, - "1470": 1020655360.0, - "1475": 1030231296.0, - "1480": 1029370496.0, - "1485": 1022997696.0, - "1490": 1026783360.0, - "1495": 1021815744.0, - "1500": 1027177088.0, - "1505": 1034882880.0, - "1510": 1014397120.0, - "1515": 1042136832.0, - "1520": 1025792640.0, - "1525": 1036335872.0, - "1530": 1039948992.0, - "1535": 1047640192.0, - "1540": 1043539840.0, - "1545": 1034043520.0, - "1550": 1016108736.0, - "1555": 1015573504.0, - "1560": 1055021824.0, - "1565": 1015593728.0, - "1570": 1018243840.0, - "1575": 1032515456.0, - "1580": 1012984768.0, - "1585": 1025327680.0, - "1590": 1034127360.0, - "1595": 1057393664.0, - "1600": 1026867584.0, - "1605": 1019994624.0, - "1610": 1031268736.0, - "1615": 1035274880.0, - "1620": 1018016000.0, - "1625": 1028272512.0, - "1630": 1027205376.0, - "1635": 1023799040.0, - "1640": 1034120832.0, - "1645": 1021814528.0, - "1650": 1015262080.0, - "1655": 1018280064.0, - "1660": 1047982976.0, - "1665": 1027060352.0, - "1670": 1048219904.0, - "1675": 1021102912.0, - "1680": 1043288320.0, - "1685": 1052719360.0, - "1690": 1026724032.0, - "1695": 1040385280.0, - "1700": 1018036352.0, - "1705": 1020480640.0, - "1710": 1021024448.0, - "1715": 1026932992.0, - "1720": 1028350208.0, - "1725": 1034363136.0, - "1730": 1013692352.0, - "1735": 
1018429696.0, - "1740": 1057257024.0, - "1745": 1029261952.0, - "1750": 1024357888.0, - "1755": 1029970112.0, - "1760": 1022192512.0, - "1765": 1040477056.0, - "1770": 1029669760.0, - "1775": 1046196864.0, - "1780": 1021955712.0, - "1785": 1035109376.0, - "1790": 1028263808.0, - "1795": 1031023616.0, - "1800": 1028300480.0, - "1805": 1025669248.0, - "1810": 1021556096.0, - "1815": 1033440256.0, - "1820": 1034885888.0, - "1825": 1020208448.0, - "1830": 1013885632.0, - "1835": 1031382272.0, - "1840": 1040391040.0, - "1845": 1034828800.0, - "1850": 1014480064.0, - "1855": 1019418816.0, - "1860": 1019569536.0, - "1865": 1035942400.0, - "1870": 1026242368.0, - "1875": 1031525248.0, - "1880": 1011590784.0, - "1885": 1041065536.0, - "1890": 1035000704.0, - "1895": 1028959488.0, - "1900": 1033997568.0, - "1905": 1027123776.0, - "1910": 1029217792.0, - "1915": 1030492864.0, - "1920": 1042920384.0, - "1925": 1038419392.0, - "1930": 1019304512.0, - "1935": 1032535936.0, - "1940": 1027806336.0, - "1945": 1034205056.0, - "1950": 1006036224.0, - "1955": 1032577600.0, - "1960": 1015720256.0, - "1965": 1029088512.0, - "1970": 1021554176.0, - "1975": 1034048000.0, - "1980": 1029366912.0, - "1985": 1027784960.0, - "1990": 1020947840.0, - "1995": 1010422912.0, - "2000": 1039617152.0, - "2005": 1001486208.0, - "2010": 1020422912.0, - "2015": 1032034048.0, - "2020": 1036298624.0, - "2025": 1037172352.0, - "2030": 1029770752.0, - "2035": 1040333312.0, - "2040": 1030112768.0, - "2045": 1032700800.0, - "2050": 1008016064.0, - "2055": 1045723840.0, - "2060": 1028142400.0, - "2065": 1038799488.0, - "2070": 1045645184.0, - "2075": 1035237952.0, - "2080": 1022882304.0, - "2085": 1024815424.0, - "2090": 1034363392.0, - "2095": 1005220672.0, - "2100": 1034644096.0, - "2105": 1035581312.0, - "2110": 1030685952.0, - "2115": 1029798528.0, - "2120": 1018846080.0, - "2125": 1021863168.0, - "2130": 1026638080.0, - "2135": 1053279488.0, - "2140": 1017060608.0, - "2145": 1019635072.0, - "2150": 
1037130752.0, - "2155": 1033302784.0, - "2160": 1049035776.0, - "2165": 1039682816.0, - "2170": 1020308096.0, - "2175": 1027338752.0, - "2180": 1041703168.0, - "2185": 1028895360.0, - "2190": 1029309888.0, - "2195": 1028944768.0, - "2200": 1039639680.0, - "2205": 1036972288.0, - "2210": 1031740544.0, - "2215": 1021404480.0, - "2220": 1020910848.0, - "2225": 1033403072.0, - "2230": 1014201856.0, - "2235": 1029395968.0, - "2240": 1029885184.0, - "2245": 1026005824.0, - "2250": 1046268800.0, - "2255": 1032951936.0, - "2260": 1047494592.0, - "2265": 1023721088.0, - "2270": 1022566144.0, - "2275": 1028537600.0, - "2280": 1034973568.0, - "2285": 1031819968.0, - "2290": 1038650048.0, - "2295": 1028816000.0, - "2300": 1034450496.0, - "2305": 1032314496.0, - "2310": 1013586496.0, - "2315": 1048182656.0, - "2320": 1035210368.0, - "2325": 1046966016.0, - "2330": 1014696192.0, - "2335": 1027382272.0, - "2340": 1036736512.0, - "2345": 1020186944.0, - "2350": 1031017728.0, - "2355": 1037474240.0, - "2360": 1032608128.0, - "2365": 1028041856.0, - "2370": 1021004224.0, - "2375": 1022912000.0, - "2380": 1048556224.0, - "2385": 1044140736.0, - "2390": 1021986816.0, - "2395": 1020595584.0, - "2400": 1026930816.0, - "2405": 1038387200.0, - "2410": 1045395200.0, - "2415": 1048454656.0, - "2420": 1032227712.0, - "2425": 1029562176.0, - "2430": 1030386176.0, - "2435": 1029217856.0, - "2440": 1029168000.0, - "2445": 1033132160.0, - "2450": 1038557824.0, - "2455": 1034721536.0, - "2460": 1039984192.0, - "2465": 1032500992.0, - "2470": 1024143872.0, - "2475": 1016539520.0, - "2480": 1023613248.0, - "2485": 1021030592.0, - "2490": 1035920448.0, - "2495": 1032967360.0, - "2500": 1028107008.0, - "2505": 1015385600.0, - "2510": 1030967104.0, - "2515": 1025700096.0, - "2520": 1033326208.0, - "2525": 1029692800.0, - "2530": 1023986560.0, - "2535": 1071069696.0, - "2540": 1024537984.0, - "2545": 1033798784.0, - "2550": 1029448064.0, - "2555": 1029183488.0, - "2560": 1018115072.0, - "2565": 
1031598528.0, - "2570": 1022847232.0, - "2575": 1026503104.0, - "2580": 1038622592.0, - "2585": 1025899456.0, - "2590": 1026100800.0, - "2595": 1046623104.0, - "2600": 1031103360.0, - "2605": 1001910656.0, - "2610": 1028423360.0, - "2615": 1025564544.0, - "2620": 1038651392.0, - "2625": 1026996352.0, - "2630": 1036831424.0, - "2635": 1021198400.0, - "2640": 1021865856.0, - "2645": 1039153408.0, - "2650": 1025943488.0, - "2655": 1013255808.0, - "2660": 1032645248.0, - "2665": 1035218048.0, - "2670": 1036437632.0, - "2675": 1039296064.0, - "2680": 1041661696.0, - "2685": 1034565504.0, - "2690": 1058871168.0, - "2695": 1019879552.0, - "2700": 1062626816.0, - "2705": 1035376320.0, - "2710": 1019542400.0, - "2715": 1031885824.0, - "2720": 1016403200.0, - "2725": 1040594688.0, - "2730": 1019586688.0, - "2735": 1030889856.0, - "2740": 1029290752.0, - "2745": 1040687744.0, - "2750": 1023880448.0, - "2755": 1011865664.0, - "2760": 1027684864.0, - "2765": 1030882240.0, - "2770": 1033119872.0, - "2775": 1026332352.0, - "2780": 1033684224.0, - "2785": 1024589888.0, - "2790": 1033734272.0, - "2795": 1045949184.0, - "2800": 1040286016.0, - "2805": 1019944192.0, - "2810": 1031449600.0, - "2815": 1030932736.0, - "2820": 1037855616.0, - "2825": 1041684096.0, - "2830": 1030459904.0, - "2835": 1013508352.0, - "2840": 1031449600.0, - "2845": 1030129920.0, - "2850": 1026617600.0, - "2855": 1024705280.0, - "2860": 1031700096.0, - "2865": 1027428800.0, - "2870": 1026690048.0, - "2875": 1012777024.0, - "2880": 1038301568.0, - "2885": 1017901184.0, - "2890": 1044200064.0, - "2895": 1036459136.0, - "2900": 1030652928.0, - "2905": 1035957376.0, - "2910": 1038718272.0, - "2915": 1039385408.0, - "2920": 1034781248.0, - "2925": 1043267840.0, - "2930": 1038229696.0, - "2935": 1021222144.0, - "2940": 1042307456.0, - "2945": 1045232384.0, - "2950": 1047525952.0, - "2955": 1034172928.0, - "2960": 1020891904.0, - "2965": 1027307840.0, - "2970": 1038796288.0, - "2975": 1034007296.0, - "2980": 
1049590400.0, - "2985": 1034846016.0, - "2990": 1026008576.0, - "2995": 1034919296.0, - "3000": 1039017856.0, - "3005": 1038158848.0, - "3010": 1010907712.0, - "3015": 1044976064.0, - "3020": 1034050688.0, - "3025": 1037763840.0, - "3030": 1027722816.0, - "3035": 1041821056.0, - "3040": 1035311872.0, - "3045": 1027255296.0, - "3050": 1029708032.0, - "3055": 1028029568.0, - "3060": 1049976960.0, - "3065": 1024067200.0, - "3070": 1011545728.0, - "3075": 1042846272.0, - "3080": 1036094912.0, - "3085": 1030387456.0, - "3090": 1035262976.0, - "3095": 1013803008.0, - "3100": 1030144896.0, - "3105": 1017609088.0, - "3110": 1033370816.0, - "3115": 1023737728.0, - "3120": 1024877504.0, - "3125": 1046537216.0, - "3130": 1024676160.0, - "3135": 1025722496.0, - "3140": 1043778176.0, - "3145": 1044372672.0, - "3150": 1016483328.0, - "3155": 1042487936.0, - "3160": 1026834688.0, - "3165": 1031199360.0, - "3170": 1024332800.0, - "3175": 1024368640.0, - "3180": 1018204288.0, - "3185": 1034352512.0, - "3190": 1019221888.0, - "3195": 1028425408.0, - "3200": 1036080640.0, - "3205": 1016076160.0, - "3210": 1034109312.0, - "3215": 1031349312.0, - "3220": 1040833664.0, - "3225": 1022835008.0, - "3230": 1033255744.0, - "3235": 1019975488.0, - "3240": 1038131840.0, - "3245": 1031643136.0, - "3250": 1022390656.0, - "3255": 1032876672.0, - "3260": 1037751616.0, - "3265": 1021622656.0, - "3270": 1031242880.0, - "3275": 1038461184.0, - "3280": 1023236992.0, - "3285": 1031615424.0, - "3290": 1045247616.0, - "3295": 1043177536.0, - "3300": 1035084224.0, - "3305": 1042662400.0, - "3310": 1058092096.0, - "3315": 1024282880.0, - "3320": 1046015296.0, - "3325": 1023179008.0, - "3330": 1048037248.0, - "3335": 1036690560.0, - "3340": 1042123392.0, - "3345": 1030897920.0, - "3350": 1020621696.0, - "3355": 1025960576.0, - "3360": 1030305344.0, - "3365": 1031171520.0, - "3370": 1036454144.0, - "3375": 1023472384.0, - "3380": 1032383744.0, - "3385": 1038081536.0, - "3390": 1052811072.0, - "3395": 
1012090496.0, - "3400": 1019209600.0, - "3405": 1021780224.0, - "3410": 1028433728.0, - "3415": 1058222400.0, - "3420": 1033492480.0, - "3425": 1029580352.0, - "3430": 1021150976.0, - "3435": 1034991872.0, - "3440": 1017961600.0, - "3445": 1025537280.0, - "3450": 1032254336.0, - "3455": 1036261312.0, - "3460": 1052071808.0, - "3465": 1027114240.0, - "3470": 1043729536.0, - "3475": 1033265792.0, - "3480": 1026619776.0, - "3485": 1029215232.0, - "3490": 1041041408.0, - "3495": 1019252224.0, - "3500": 1032059904.0, - "3505": 1025753728.0, - "3510": 1044367616.0, - "3515": 1013817280.0, - "3520": 1021846400.0, - "3525": 1032175552.0, - "3530": 1029789056.0, - "3535": 1034568704.0, - "3540": 1017731456.0, - "3545": 1035658880.0, - "3550": 1024535296.0, - "3555": 1035866112.0, - "3560": 1029737600.0, - "3565": 1028900160.0, - "3570": 1046029888.0, - "3575": 1039186304.0, - "3580": 1010838336.0, - "3585": 1031737728.0, - "3590": 1041450688.0, - "3595": 1037636800.0, - "3600": 1032763584.0, - "3605": 1045822272.0, - "3610": 1039235200.0, - "3615": 1036870144.0, - "3620": 1026929664.0, - "3625": 1033931136.0, - "3630": 1017582464.0, - "3635": 1026629056.0, - "3640": 1039529088.0, - "3645": 1022655872.0, - "3650": 1036842624.0, - "3655": 1023990144.0, - "3660": 1014987456.0, - "3665": 1026118784.0, - "3670": 1041672448.0, - "3675": 1033250304.0, - "3680": 1015353984.0, - "3685": 1029122304.0, - "3690": 1026204416.0, - "3695": 1043800832.0, - "3700": 1028613504.0, - "3705": 1049485312.0, - "3710": 1027180672.0, - "3715": 1016134912.0, - "3720": 1040818560.0, - "3725": 1032763776.0, - "3730": 1030920960.0, - "3735": 1019008640.0, - "3740": 1023825600.0, - "3745": 1046289152.0, - "3750": 1034462336.0, - "3755": 1032090048.0, - "3760": 1019366912.0, - "3765": 1031916736.0, - "3770": 1026677120.0, - "3775": 1035708288.0, - "3780": 1030671104.0, - "3785": 1027208128.0, - "3790": 1019584064.0, - "3795": 1030306048.0, - "3800": 1035614976.0, - "3805": 1035423360.0, - "3810": 
1033294144.0, - "3815": 1033988608.0, - "3820": 1041105792.0, - "3825": 1024534976.0, - "3830": 1037630528.0, - "3835": 1040347968.0, - "3840": 1023445888.0, - "3845": 1048466688.0, - "3850": 1052489280.0, - "3855": 1028907264.0, - "3860": 1019532672.0, - "3865": 1035487744.0, - "3870": 1028491712.0, - "3875": 1041164800.0, - "3880": 1048854912.0, - "3885": 1027725248.0, - "3890": 1027487616.0, - "3895": 1034190592.0, - "3900": 1027645312.0, - "3905": 1027976128.0, - "3910": 1041572480.0, - "3915": 1043995392.0, - "3920": 1041063424.0, - "3925": 1030836160.0, - "3930": 1027072896.0, - "3935": 1033782016.0, - "3940": 1042275712.0, - "3945": 1036248064.0, - "3950": 1021430976.0, - "3955": 1036304128.0, - "3960": 1024184192.0, - "3965": 1027065856.0, - "3970": 1015984640.0, - "3975": 1041421632.0, - "3980": 1032455488.0, - "3985": 1037680640.0, - "3990": 1038684992.0, - "3995": 1023654528.0, - "4000": 1054410240.0, - "4005": 1029983424.0, - "4010": 1025138112.0, - "4015": 1030978560.0, - "4020": 1018472448.0, - "4025": 1027124352.0, - "4030": 1010306816.0, - "4035": 1038641088.0, - "4040": 1022256640.0, - "4045": 1025038208.0, - "4050": 1032348800.0, - "4055": 1022420864.0, - "4060": 1024520768.0, - "4065": 1032871168.0, - "4070": 1027791232.0, - "4075": 1025596928.0, - "4080": 1029366656.0, - "4085": 1020823552.0, - "4090": 1033322496.0, - "4095": 1024142656.0, - "4100": 1040948864.0, - "4105": 1027266496.0, - "4110": 1038791424.0, - "4115": 1023497088.0, - "4120": 1038943168.0, - "4125": 1048274176.0, - "4130": 1021490752.0, - "4135": 1034570880.0, - "4140": 1034613824.0, - "4145": 1044447232.0, - "4150": 1000353664.0, - "4155": 1028363392.0, - "4160": 1024242624.0, - "4165": 1033688704.0, - "4170": 1018888000.0, - "4175": 1026492608.0, - "4180": 1045409024.0, - "4185": 1033631616.0, - "4190": 1029574592.0, - "4195": 1038777984.0, - "4200": 1025102336.0, - "4205": 1019074816.0, - "4210": 1029560704.0, - "4215": 1032269184.0, - "4220": 1026242048.0, - "4225": 
1031925888.0, - "4230": 1030269824.0, - "4235": 1027603328.0, - "4240": 1031480832.0, - "4245": 1028765056.0, - "4250": 1026987008.0, - "4255": 1021240064.0, - "4260": 1042082432.0, - "4265": 1025411200.0, - "4270": 1030169984.0, - "4275": 1012472448.0, - "4280": 1044505600.0, - "4285": 1019898304.0, - "4290": 1033058560.0, - "4295": 1033596032.0, - "4300": 1031638912.0, - "4305": 1023847936.0, - "4310": 1021568512.0, - "4315": 1047221504.0, - "4320": 1026520576.0, - "4325": 1005865600.0, - "4330": 1037666688.0, - "4335": 1022006464.0, - "4340": 1029009920.0, - "4345": 1033474496.0, - "4350": 1036886144.0, - "4355": 1026808832.0, - "4360": 1022938240.0, - "4365": 1028779648.0, - "4370": 1029624704.0, - "4375": 1042196864.0, - "4380": 1016100096.0, - "4385": 1045551296.0, - "4390": 1026270848.0, - "4395": 1029796416.0, - "4400": 1047365760.0, - "4405": 1029297344.0, - "4410": 1033424256.0, - "4415": 1028298304.0, - "4420": 1028148928.0, - "4425": 1033575552.0, - "4430": 1031374592.0, - "4435": 1028571136.0, - "4440": 1033123328.0, - "4445": 1028293504.0, - "4450": 1052210944.0, - "4455": 1026286080.0, - "4460": 1034885888.0, - "4465": 1031725696.0, - "4470": 1035446528.0, - "4475": 1036971712.0, - "4480": 1025117824.0, - "4485": 1034104960.0, - "4490": 1024630912.0, - "4495": 1047974912.0, - "4500": 1024707840.0, - "4505": 1038850048.0, - "4510": 1043723776.0, - "4515": 1044276736.0, - "4520": 1036872320.0, - "4525": 1058073536.0, - "4530": 1030973568.0, - "4535": 1032592256.0, - "4540": 1036428160.0, - "4545": 1025726400.0, - "4550": 1021749312.0, - "4555": 1037546112.0, - "4560": 1020099200.0, - "4565": 1036055296.0, - "4570": 1020501120.0, - "4575": 1050412608.0, - "4580": 1010437888.0, - "4585": 1022960768.0, - "4590": 1039710272.0, - "4595": 1023274880.0, - "4600": 1042477824.0, - "4605": 1039746688.0, - "4610": 1046104192.0, - "4615": 1017999744.0, - "4620": 1044734592.0, - "4625": 1030479104.0, - "4630": 1027260800.0, - "4635": 1026995200.0, - "4640": 
1034901248.0, - "4645": 1036420352.0, - "4650": 1033711488.0, - "4655": 1035461056.0, - "4660": 1035324800.0, - "4665": 1020265664.0, - "4670": 1020057344.0, - "4675": 1054848768.0, - "4680": 1024895872.0, - "4685": 1027820160.0, - "4690": 1034449664.0, - "4695": 1039151744.0, - "4700": 1038865024.0, - "4705": 1027655808.0, - "4710": 1020522560.0, - "4715": 1031825536.0, - "4720": 1030300416.0, - "4725": 1030298368.0, - "4730": 1044096704.0, - "4735": 1046133376.0, - "4740": 1036178112.0, - "4745": 1039043840.0, - "4750": 1031790528.0, - "4755": 1047723392.0, - "4760": 1026178176.0, - "4765": 1034695040.0, - "4770": 1036521856.0, - "4775": 1029375168.0, - "4780": 1028543488.0, - "4785": 1028414976.0, - "4790": 1019620224.0, - "4795": 1033060160.0, - "4800": 1051866880.0, - "4805": 1015414400.0, - "4810": 1029454336.0, - "4815": 1009572096.0, - "4820": 1041051200.0, - "4825": 1026708608.0, - "4830": 1020450816.0, - "4835": 1051307840.0, - "4840": 1019456512.0, - "4845": 1032315008.0, - "4850": 1036794496.0, - "4855": 1031052736.0, - "4860": 1033131776.0, - "4865": 1032064384.0, - "4870": 1049832576.0, - "4875": 1025110528.0, - "4880": 1048476160.0, - "4885": 1016853056.0, - "4890": 1037317312.0, - "4895": 1024323136.0, - "4900": 1043374208.0, - "4905": 1033397120.0, - "4910": 1032830272.0, - "4915": 1016889856.0, - "4920": 1022294784.0, - "4925": 1034965888.0, - "4930": 1034630016.0, - "4935": 1025885312.0, - "4940": 1048398272.0, - "4945": 1025248576.0, - "4950": 1024208768.0, - "4955": 1007485952.0, - "4960": 1040213824.0, - "4965": 1018775296.0, - "4970": 1014274688.0, - "4975": 1038025472.0, - "4980": 1020917888.0, - "4985": 1029045888.0, - "4990": 1028394816.0, - "4995": 1032020480.0, - "5000": 1039791104.0, - "5005": 1024351552.0, - "5010": 1029147968.0, - "5015": 1021807296.0, - "5020": 1023506944.0, - "5025": 1037603456.0, - "5030": 1041947136.0, - "5035": 1047130304.0, - "5040": 1060956096.0, - "5045": 1032108544.0, - "5050": 1029534336.0, - "5055": 
1024552192.0, - "5060": 1035282304.0, - "5065": 1021205504.0, - "5070": 1035756288.0, - "5075": 1015771264.0, - "5080": 1027040064.0, - "5085": 1021792192.0, - "5090": 1034973568.0, - "5095": 1015499712.0, - "5100": 1032257600.0, - "5105": 1017981568.0, - "5110": 1019586304.0, - "5115": 1036063936.0, - "5120": 1032695040.0, - "5125": 1019076992.0, - "5130": 1033404672.0, - "5135": 1041203072.0, - "5140": 1026258752.0, - "5145": 1033705856.0, - "5150": 1022043520.0, - "5155": 1032265664.0, - "5160": 1039625984.0, - "5165": 1031576448.0, - "5170": 1035555328.0, - "5175": 1026116224.0, - "5180": 1030316032.0, - "5185": 1024495680.0, - "5190": 1019492608.0, - "5195": 1035626496.0, - "5200": 1016905344.0, - "5205": 1013435648.0, - "5210": 1049395456.0, - "5215": 1030833280.0, - "5220": 1025276800.0, - "5225": 1035239936.0, - "5230": 1025930624.0, - "5235": 1025120000.0, - "5240": 1046308224.0, - "5245": 1022740608.0, - "5250": 1027062336.0, - "5255": 1023887360.0, - "5260": 1033821440.0, - "5265": 1045733696.0, - "5270": 1052500480.0, - "5275": 1033018112.0, - "5280": 1030073920.0, - "5285": 1025212608.0, - "5290": 1026575616.0, - "5295": 1032653440.0, - "5300": 1024367872.0, - "5305": 1029634368.0, - "5310": 1033197312.0, - "5315": 1032988992.0, - "5320": 1019521664.0, - "5325": 1022718336.0, - "5330": 1021335168.0, - "5335": 1039275776.0, - "5340": 1037219648.0, - "5345": 1039188096.0, - "5350": 1023701888.0, - "5355": 1029935872.0, - "5360": 1047046080.0, - "5365": 1037426432.0, - "5370": 1024381568.0, - "5375": 1042070656.0, - "5380": 1020368384.0, - "5385": 1021765696.0, - "5390": 1035133184.0, - "5395": 1049653568.0, - "5400": 1026015744.0, - "5405": 1036453120.0, - "5410": 1027635776.0, - "5415": 1042285824.0, - "5420": 1039941888.0, - "5425": 1028381184.0, - "5430": 1043799808.0, - "5435": 1032653312.0, - "5440": 1033384448.0, - "5445": 1034144640.0, - "5450": 1025299328.0, - "5455": 1034079424.0, - "5460": 1026812416.0, - "5465": 1027399552.0, - "5470": 
1028969216.0, - "5475": 1037233920.0, - "5480": 1023830272.0, - "5485": 1019186752.0, - "5490": 1030891520.0, - "5495": 1029399424.0, - "5500": 1032681216.0, - "5505": 1018275200.0, - "5510": 1023987648.0, - "5515": 1025156032.0, - "5520": 1039527296.0, - "5525": 1018024576.0, - "5530": 1037663936.0, - "5535": 1031599232.0, - "5540": 1027564544.0, - "5545": 1033212160.0, - "5550": 1032115968.0, - "5555": 1044802304.0, - "5560": 1028511232.0, - "5565": 1029686016.0, - "5570": 1042027776.0, - "5575": 1025379392.0, - "5580": 1023716736.0, - "5585": 1044093696.0, - "5590": 1041319936.0, - "5595": 1031549824.0, - "5600": 1023400320.0, - "5605": 1040115456.0, - "5610": 1034087552.0, - "5615": 1021042816.0, - "5620": 1031004800.0, - "5625": 1030188544.0, - "5630": 1023502080.0, - "5635": 1026684096.0, - "5640": 1034589120.0, - "5645": 1018655744.0, - "5650": 1052378752.0, - "5655": 1048933504.0, - "5660": 1050077696.0, - "5665": 1033958144.0, - "5670": 1033750016.0, - "5675": 1025392640.0, - "5680": 1039378304.0, - "5685": 1033056576.0, - "5690": 1031464576.0, - "5695": 1021946368.0, - "5700": 1038065664.0, - "5705": 1043684736.0, - "5710": 1057231616.0, - "5715": 1014462848.0, - "5720": 1021258816.0, - "5725": 1041822272.0, - "5730": 1039454912.0, - "5735": 1025128576.0, - "5740": 1026045440.0, - "5745": 1036990208.0, - "5750": 1044552256.0, - "5755": 1011860416.0, - "5760": 1028389568.0, - "5765": 1028245504.0, - "5770": 1021530368.0, - "5775": 1051210240.0, - "5780": 1034984512.0, - "5785": 1037513920.0, - "5790": 1016957184.0, - "5795": 1027873536.0, - "5800": 1029780736.0, - "5805": 1050694912.0, - "5810": 1018478336.0, - "5815": 1036123520.0, - "5820": 1048408704.0, - "5825": 1030977920.0, - "5830": 1031572096.0, - "5835": 1034045440.0, - "5840": 1039843776.0, - "5845": 1021746048.0, - "5850": 1029807744.0, - "5855": 1038789376.0, - "5860": 1031436288.0, - "5865": 1026397568.0, - "5870": 1029861824.0, - "5875": 1032841856.0, - "5880": 1032675968.0, - "5885": 
1024576128.0, - "5890": 1026798976.0, - "5895": 1015796160.0, - "5900": 1049707008.0, - "5905": 1025653248.0, - "5910": 1019150720.0, - "5915": 1042739136.0, - "5920": 1028047232.0, - "5925": 1034016448.0, - "5930": 1030963328.0, - "5935": 1038102784.0, - "5940": 1019172864.0, - "5945": 1025130112.0, - "5950": 1035530240.0, - "5955": 1050437184.0, - "5960": 1024548736.0, - "5965": 1029923712.0, - "5970": 1016427776.0, - "5975": 1036682752.0, - "5980": 1024118464.0, - "5985": 1035386624.0, - "5990": 1010550784.0, - "5995": 1047019200.0, - "6000": 1021245568.0, - "6005": 1040460416.0, - "6010": 1025358720.0, - "6015": 1050179072.0, - "6020": 1039514496.0, - "6025": 1030254592.0, - "6030": 1025931968.0, - "6035": 1021745408.0, - "6040": 1034117056.0, - "6045": 1028282112.0, - "6050": 1020112320.0, - "6055": 1040397056.0, - "6060": 1026347008.0, - "6065": 1022198400.0, - "6070": 1040668416.0, - "6075": 1046037440.0, - "6080": 1038583168.0, - "6085": 1041485568.0, - "6090": 1037205888.0, - "6095": 1036282880.0, - "6100": 1030454720.0, - "6105": 1019216640.0, - "6110": 1035357824.0, - "6115": 1019452544.0, - "6120": 1032188800.0, - "6125": 1020922624.0, - "6130": 1012013952.0, - "6135": 1038733824.0, - "6140": 1041736896.0, - "6145": 1041917056.0, - "6150": 1018958208.0, - "6155": 1024649344.0, - "6160": 1047972160.0, - "6165": 1050408832.0, - "6170": 1032505344.0, - "6175": 1045793664.0, - "6180": 1040067072.0, - "6185": 1029710464.0, - "6190": 1023293760.0, - "6195": 1050897728.0, - "6200": 1035035776.0, - "6205": 1036275584.0, - "6210": 1039772736.0, - "6215": 1033200256.0, - "6220": 1026162432.0, - "6225": 1036741120.0, - "6230": 1025144192.0, - "6235": 1019352832.0, - "6240": 1057104384.0, - "6245": 1018413952.0, - "6250": 1035337344.0, - "6255": 1025380992.0, - "6260": 1034863744.0, - "6265": 1027703424.0, - "6270": 1042116480.0, - "6275": 1037659008.0, - "6280": 1018270208.0, - "6285": 1032642304.0, - "6290": 1038598592.0, - "6295": 1031803456.0, - "6300": 
1034635200.0, - "6305": 1011066624.0, - "6310": 1039458624.0, - "6315": 1030054272.0, - "6320": 1030534208.0, - "6325": 1038642496.0, - "6330": 1033908800.0, - "6335": 1032297856.0, - "6340": 1033544448.0, - "6345": 1031036416.0, - "6350": 1037451264.0, - "6355": 1028075968.0, - "6360": 1043313408.0, - "6365": 1025223808.0, - "6370": 1033939200.0, - "6375": 1036038720.0, - "6380": 1029108096.0, - "6385": 1025395072.0, - "6390": 1025517952.0, - "6395": 1048611584.0, - "6400": 1040734976.0, - "6405": 1024247936.0, - "6410": 1017489280.0, - "6415": 1042827072.0, - "6420": 1025202432.0, - "6425": 1027164928.0, - "6430": 1040568256.0, - "6435": 1022908800.0, - "6440": 1047994624.0, - "6445": 1036089088.0, - "6450": 1048532224.0, - "6455": 1037272320.0, - "6460": 1036750912.0, - "6465": 1033652032.0, - "6470": 1018135232.0, - "6475": 1034691648.0, - "6480": 1028994048.0, - "6485": 1033258880.0, - "6490": 1035638656.0, - "6495": 1024470016.0, - "6500": 1020572096.0, - "6505": 1059327104.0, - "6510": 1020472576.0, - "6515": 1018688064.0, - "6520": 1051470592.0, - "6525": 1035544512.0, - "6530": 1027897216.0, - "6535": 1022722240.0, - "6540": 1023273984.0, - "6545": 1033173120.0, - "6550": 1029488512.0, - "6555": 1029575296.0, - "6560": 1056438784.0, - "6565": 1054295040.0, - "6570": 1032319040.0, - "6575": 1041208320.0, - "6580": 1028134400.0, - "6585": 1036504832.0, - "6590": 1042456192.0, - "6595": 1038568832.0, - "6600": 1031388096.0, - "6605": 1045715456.0, - "6610": 1034713472.0, - "6615": 1015576448.0, - "6620": 1039115136.0, - "6625": 1054654208.0, - "6630": 1043092928.0, - "6635": 1032226304.0, - "6640": 1016738496.0, - "6645": 1016178816.0, - "6650": 1034692672.0, - "6655": 1031753472.0, - "6660": 1041401920.0, - "6665": 1024657984.0, - "6670": 1023820032.0, - "6675": 1038306176.0, - "6680": 1025624064.0, - "6685": 1045394048.0, - "6690": 1046390720.0, - "6695": 1027754368.0, - "6700": 1033473920.0, - "6705": 1038857152.0, - "6710": 1047485888.0, - "6715": 
1043229440.0, - "6720": 1022995456.0, - "6725": 1018910144.0, - "6730": 1027525504.0, - "6735": 1016937856.0, - "6740": 1027238016.0, - "6745": 1030263680.0, - "6750": 1006373760.0, - "6755": 1034765056.0, - "6760": 1040735296.0, - "6765": 1023827008.0, - "6770": 1036441344.0, - "6775": 1019627712.0, - "6780": 1043723904.0, - "6785": 1037409280.0, - "6790": 1029403072.0, - "6795": 1026349440.0, - "6800": 1036628224.0, - "6805": 1024579712.0, - "6810": 1042340544.0, - "6815": 1035274112.0, - "6820": 1022594880.0, - "6825": 1034793344.0, - "6830": 1029862400.0, - "6835": 1041609600.0, - "6840": 1042283776.0, - "6845": 1018954624.0, - "6850": 1032171136.0, - "6855": 1034434752.0, - "6860": 1042054848.0, - "6865": 1021813568.0, - "6870": 1037015424.0, - "6875": 1030379968.0, - "6880": 1029360768.0, - "6885": 1030435968.0, - "6890": 1039890432.0, - "6895": 1027267712.0, - "6900": 1035174016.0, - "6905": 1043975424.0, - "6910": 1019763072.0, - "6915": 1017476608.0, - "6920": 1017184256.0, - "6925": 1030650688.0, - "6930": 1036672384.0, - "6935": 1042835712.0, - "6940": 1040313216.0, - "6945": 1044196992.0, - "6950": 1040513472.0, - "6955": 1036112704.0, - "6960": 1036436224.0, - "6965": 1019161024.0, - "6970": 1034729088.0, - "6975": 1019134464.0, - "6980": 1028436160.0, - "6985": 1023240128.0, - "6990": 1026994688.0, - "6995": 1027547520.0, - "7000": 1058819840.0, - "7005": 1013737856.0, - "7010": 1028959488.0, - "7015": 1037288768.0, - "7020": 1011880576.0, - "7025": 1017313280.0, - "7030": 1028301440.0, - "7035": 1035955392.0, - "7040": 1042966016.0, - "7045": 1028185856.0, - "7050": 1017979584.0, - "7055": 1035088000.0, - "7060": 1051802624.0, - "7065": 1007664640.0, - "7070": 1035819008.0, - "7075": 1031039552.0, - "7080": 1026143296.0, - "7085": 1044906432.0, - "7090": 1046261760.0, - "7095": 1043760512.0, - "7100": 1035089024.0, - "7105": 1049143296.0, - "7110": 1010962944.0, - "7115": 1033869504.0, - "7120": 1031267456.0, - "7125": 1037496832.0, - "7130": 
1024881856.0, - "7135": 1031991808.0, - "7140": 1019090176.0, - "7145": 1033081088.0, - "7150": 1037554112.0, - "7155": 1015729728.0, - "7160": 1024724608.0, - "7165": 1030895808.0, - "7170": 1037367808.0, - "7175": 1028816896.0, - "7180": 1037633280.0, - "7185": 1016174080.0, - "7190": 1019808128.0, - "7195": 1040915392.0, - "7200": 1041375360.0, - "7205": 1026538240.0, - "7210": 1022638720.0, - "7215": 1041890560.0, - "7220": 1017742720.0, - "7225": 1027296640.0, - "7230": 1030200448.0, - "7235": 1035726848.0, - "7240": 1037854848.0, - "7245": 1023971008.0, - "7250": 1044708096.0, - "7255": 1031900480.0, - "7260": 1030128256.0, - "7265": 1036887104.0, - "7270": 1050097152.0, - "7275": 1029225216.0, - "7280": 1020231808.0, - "7285": 1029842048.0, - "7290": 1017219328.0, - "7295": 1029139584.0, - "7300": 1031533824.0, - "7305": 1027298176.0, - "7310": 1029089664.0, - "7315": 1022782272.0, - "7320": 1036458176.0, - "7325": 1036851840.0, - "7330": 1021706496.0, - "7335": 1030715904.0, - "7340": 1039382976.0, - "7345": 1040177664.0, - "7350": 1034973568.0, - "7355": 1033656320.0, - "7360": 1031254912.0, - "7365": 1048742016.0, - "7370": 1027298304.0, - "7375": 1041854848.0, - "7380": 1016725760.0, - "7385": 1017578368.0, - "7390": 1017234944.0, - "7395": 1046793600.0, - "7400": 1048441216.0, - "7405": 1013394304.0, - "7410": 1017386368.0, - "7415": 1017815360.0, - "7420": 1028043008.0, - "7425": 1012840576.0, - "7430": 1034042368.0, - "7435": 1032530432.0, - "7440": 1002692928.0, - "7445": 1034451200.0, - "7450": 1039304832.0, - "7455": 1019027008.0, - "7460": 1014740928.0, - "7465": 1027204736.0, - "7470": 1030422784.0, - "7475": 1033792064.0, - "7480": 1043317376.0, - "7485": 1038215168.0, - "7490": 1049000960.0, - "7495": 1028982720.0, - "7500": 1027426816.0, - "7505": 1028695936.0, - "7510": 1048886528.0, - "7515": 1035648704.0, - "7520": 1017198848.0, - "7525": 1036572736.0, - "7530": 1029261952.0, - "7535": 1027190144.0, - "7540": 1028338048.0, - "7545": 
1025986304.0, - "7550": 1023025856.0, - "7555": 1033025344.0, - "7560": 1031404672.0, - "7565": 1022710528.0, - "7570": 1037591552.0, - "7575": 1022603136.0, - "7580": 1018123584.0, - "7585": 1033054208.0, - "7590": 1010993280.0, - "7595": 1018260352.0, - "7600": 1049904448.0, - "7605": 1037361216.0, - "7610": 1040415744.0, - "7615": 1035247488.0, - "7620": 1024230912.0, - "7625": 1020317184.0, - "7630": 1034939584.0, - "7635": 1043224192.0, - "7640": 1033491520.0, - "7645": 1034444608.0, - "7650": 1039804800.0, - "7655": 1031240576.0, - "7660": 1056628096.0, - "7665": 1031076096.0, - "7670": 1033685120.0, - "7675": 1030681600.0, - "7680": 1035398720.0, - "7685": 1018661760.0, - "7690": 1031921024.0, - "7695": 1025858880.0, - "7700": 1017715200.0, - "7705": 1036531200.0, - "7710": 1029893248.0, - "7715": 1053230656.0, - "7720": 1019514240.0, - "7725": 1042193216.0, - "7730": 1035620992.0, - "7735": 1020726144.0, - "7740": 1045576128.0, - "7745": 1026932992.0, - "7750": 1048550208.0, - "7755": 1022539264.0, - "7760": 1049532032.0, - "7765": 1029370176.0, - "7770": 1018375296.0, - "7775": 1021364672.0, - "7780": 1039770624.0, - "7785": 1039914112.0, - "7790": 1030516992.0, - "7795": 1039353728.0, - "7800": 1028187904.0, - "7805": 1027635776.0, - "7810": 1020970368.0, - "7815": 1035878400.0, - "7820": 1017666240.0, - "7825": 1018067392.0, - "7830": 1035104128.0, - "7835": 1044507648.0, - "7840": 1027836224.0, - "7845": 1032101504.0, - "7850": 1034609408.0, - "7855": 1025464832.0, - "7860": 1059051648.0, - "7865": 1016626240.0, - "7870": 1033729408.0, - "7875": 1044185600.0, - "7880": 1029084352.0, - "7885": 1040308288.0, - "7890": 1029556480.0, - "7895": 1032947008.0, - "7900": 1021409216.0, - "7905": 1020955904.0, - "7910": 1008993856.0, - "7915": 1023120768.0, - "7920": 1023070976.0, - "7925": 1030094080.0, - "7930": 1020712704.0, - "7935": 1019443776.0, - "7940": 1017809152.0, - "7945": 1014447552.0, - "7950": 1026303616.0, - "7955": 1034518272.0, - "7960": 
1056026304.0, - "7965": 1031047872.0, - "7970": 1030417152.0, - "7975": 1022189888.0, - "7980": 1034474624.0, - "7985": 1047305024.0, - "7990": 1032066176.0, - "7995": 1044264704.0, - "8000": 1028876672.0, - "8005": 1028045440.0, - "8010": 1050665408.0, - "8015": 1019758976.0, - "8020": 1043297408.0, - "8025": 1039018560.0, - "8030": 1030868800.0, - "8035": 1045304192.0, - "8040": 1026310784.0, - "8045": 1024970368.0, - "8050": 1018405632.0, - "8055": 1033736960.0, - "8060": 1012986816.0, - "8065": 1022016640.0, - "8070": 1034776064.0, - "8075": 1042759616.0, - "8080": 1027758784.0, - "8085": 1037205376.0, - "8090": 1007008256.0, - "8095": 1030374528.0, - "8100": 1030726016.0, - "8105": 1027794944.0, - "8110": 1031557248.0, - "8115": 1037685248.0, - "8120": 1037692992.0, - "8125": 1031097472.0, - "8130": 1028627072.0, - "8135": 1029680256.0, - "8140": 1049904256.0, - "8145": 1043463552.0, - "8150": 1040087424.0, - "8155": 1046780288.0, - "8160": 1010199040.0, - "8165": 1031657728.0, - "8170": 1024483264.0, - "8175": 1035019648.0, - "8180": 1024460544.0, - "8185": 1021960448.0, - "8190": 1037125504.0, - "8195": 1022368384.0, - "8200": 1035635968.0, - "8205": 1026482496.0, - "8210": 1023888000.0, - "8215": 1014276416.0, - "8220": 1026756224.0, - "8225": 1028540160.0, - "8230": 1027163072.0, - "8235": 1037914048.0, - "8240": 1025909376.0, - "8245": 1024676608.0, - "8250": 1041635840.0, - "8255": 1031908224.0, - "8260": 1032424512.0, - "8265": 1023164800.0, - "8270": 1040172544.0, - "8275": 1038050688.0, - "8280": 1041849216.0, - "8285": 1038804352.0, - "8290": 1024074880.0, - "8295": 1028403648.0, - "8300": 1039341440.0, - "8305": 1012104192.0, - "8310": 1021882048.0, - "8315": 1027307200.0, - "8320": 1021636992.0, - "8325": 1048572160.0, - "8330": 1041039616.0, - "8335": 1037964928.0, - "8340": 1033019136.0, - "8345": 1043864192.0, - "8350": 1037713792.0, - "8355": 1029686400.0, - "8360": 1040667776.0, - "8365": 1027450304.0, - "8370": 1037742848.0, - "8375": 
1041986944.0, - "8380": 1037628416.0, - "8385": 1023436160.0, - "8390": 1026068224.0, - "8395": 1028913408.0, - "8400": 1046530560.0, - "8405": 1040179456.0, - "8410": 1034252672.0, - "8415": 1040258688.0, - "8420": 1054730752.0, - "8425": 1031514880.0, - "8430": 1030295680.0, - "8435": 1045707200.0, - "8440": 1026310784.0, - "8445": 1029027392.0, - "8450": 1034201920.0, - "8455": 1031794688.0, - "8460": 1016828032.0, - "8465": 1035163648.0, - "8470": 1035185152.0, - "8475": 1024712960.0, - "8480": 1035901184.0, - "8485": 1028948480.0, - "8490": 1023079168.0, - "8495": 1037393280.0, - "8500": 1025960064.0, - "8505": 1042724992.0, - "8510": 1028167936.0, - "8515": 1038101056.0, - "8520": 1023107328.0, - "8525": 1037987328.0, - "8530": 1027572800.0, - "8535": 1041656128.0, - "8540": 1033880960.0, - "8545": 1015116160.0, - "8550": 1040188160.0, - "8555": 1016340672.0, - "8560": 1019330048.0, - "8565": 1021410112.0, - "8570": 1032032320.0, - "8575": 1031880128.0, - "8580": 1016011264.0, - "8585": 1030017408.0, - "8590": 1031637248.0, - "8595": 1017776128.0, - "8600": 1002393216.0, - "8605": 1030238336.0, - "8610": 1017532288.0, - "8615": 1023989248.0, - "8620": 1047205696.0, - "8625": 1034231552.0, - "8630": 1030921280.0, - "8635": 1051992512.0, - "8640": 1041134208.0, - "8645": 1024870720.0, - "8650": 1025595392.0, - "8655": 1036904832.0, - "8660": 1031171200.0, - "8665": 1032904640.0, - "8670": 1037400576.0, - "8675": 1029157248.0, - "8680": 1031264704.0, - "8685": 1041197568.0, - "8690": 1035035392.0, - "8695": 1008508416.0, - "8700": 1027459072.0, - "8705": 1051504896.0, - "8710": 1041678016.0, - "8715": 1034152256.0, - "8720": 1017596544.0, - "8725": 1025187456.0, - "8730": 1036610816.0, - "8735": 1014829568.0, - "8740": 1036081536.0, - "8745": 1021252416.0, - "8750": 1027866496.0, - "8755": 1020742272.0, - "8760": 1036899712.0, - "8765": 1058672448.0, - "8770": 1020462464.0, - "8775": 1031773056.0, - "8780": 1030892544.0, - "8785": 1032117504.0, - "8790": 
1041034112.0, - "8795": 1019523968.0, - "8800": 1038245632.0, - "8805": 1035106752.0, - "8810": 1043257088.0, - "8815": 1026490496.0, - "8820": 1027666944.0, - "8825": 1043464064.0, - "8830": 1027480192.0, - "8835": 1038812928.0, - "8840": 1034490752.0, - "8845": 1033909760.0, - "8850": 1030491008.0, - "8855": 1042524992.0, - "8860": 1013002880.0, - "8865": 1038368128.0, - "8870": 1025187456.0, - "8875": 1012981760.0, - "8880": 1028376704.0, - "8885": 1046461056.0, - "8890": 1038603840.0, - "8895": 1037909504.0, - "8900": 1027294848.0, - "8905": 1032792064.0, - "8910": 1029795264.0, - "8915": 1030003968.0, - "8920": 1030339968.0, - "8925": 1028569984.0, - "8930": 1031637376.0, - "8935": 1022951424.0, - "8940": 1019847872.0, - "8945": 1031909248.0, - "8950": 1039951744.0, - "8955": 1041902720.0, - "8960": 1026878464.0, - "8965": 1022083968.0, - "8970": 1029559424.0, - "8975": 1038934400.0, - "8980": 1033860160.0, - "8985": 1030649472.0, - "8990": 1025014144.0, - "8995": 1013963648.0, - "9000": 1035286400.0, - "9005": 1028649280.0, - "9010": 1011913280.0, - "9015": 1038912128.0, - "9020": 1030153856.0, - "9025": 1024685056.0, - "9030": 1025861888.0, - "9035": 1054309248.0, - "9040": 1027293952.0, - "9045": 1036583040.0, - "9050": 1020929664.0, - "9055": 1043212800.0, - "9060": 1023159104.0, - "9065": 1023387520.0, - "9070": 1039364480.0, - "9075": 1026728320.0, - "9080": 1018873408.0, - "9085": 1015439104.0, - "9090": 1043764736.0, - "9095": 1014020224.0, - "9100": 1031975296.0, - "9105": 1026514304.0, - "9110": 1029229568.0, - "9115": 1024866432.0, - "9120": 999986240.0, - "9125": 1032842752.0, - "9130": 1038534336.0, - "9135": 1031037696.0, - "9140": 1025502208.0, - "9145": 1030405248.0, - "9150": 1029416576.0, - "9155": 1038268928.0, - "9160": 1046043904.0, - "9165": 1017948992.0, - "9170": 1040955520.0, - "9175": 1031287552.0, - "9180": 1037830656.0, - "9185": 1040684416.0, - "9190": 1028985728.0, - "9195": 1034312320.0, - "9200": 1035551872.0, - "9205": 
1029847040.0, - "9210": 1026535872.0, - "9215": 1030520448.0, - "9220": 1025732224.0, - "9225": 1048001408.0, - "9230": 1041601792.0, - "9235": 1027775104.0, - "9240": 1025245760.0, - "9245": 1036211584.0, - "9250": 1041192384.0, - "9255": 1020063872.0, - "9260": 1035337984.0, - "9265": 1023102208.0, - "9270": 1038332928.0, - "9275": 1036053568.0, - "9280": 1026541504.0, - "9285": 1014285184.0, - "9290": 1018866304.0, - "9295": 1026915264.0, - "9300": 1037085888.0, - "9305": 1045435392.0, - "9310": 1033242944.0, - "9315": 1039043840.0, - "9320": 1048495488.0, - "9325": 1023059840.0, - "9330": 1031724672.0, - "9335": 1035673472.0, - "9340": 1013719296.0, - "9345": 1022572032.0, - "9350": 1026585600.0, - "9355": 1034807104.0, - "9360": 1029839552.0, - "9365": 1019863296.0, - "9370": 1006904320.0, - "9375": 1036232960.0, - "9380": 1049012736.0, - "9385": 1015905344.0, - "9390": 1029208704.0, - "9395": 1008931968.0, - "9400": 1026893568.0, - "9405": 1027653312.0, - "9410": 1040913280.0, - "9415": 1035128576.0, - "9420": 1030792640.0, - "9425": 1027581056.0, - "9430": 1032727360.0, - "9435": 1031796288.0, - "9440": 1051730048.0, - "9445": 1019626752.0, - "9450": 1044505152.0, - "9455": 1035773696.0, - "9460": 1013828224.0, - "9465": 1023403904.0, - "9470": 1023576832.0, - "9475": 1039164416.0, - "9480": 1029597056.0, - "9485": 1032075200.0, - "9490": 1020994560.0, - "9495": 1021375616.0, - "9500": 1035594304.0, - "9505": 1034478464.0, - "9510": 1014286592.0, - "9515": 1031309312.0, - "9520": 1026563904.0, - "9525": 1035853184.0, - "9530": 1031624448.0, - "9535": 1025926720.0 - } - }, - "mem-allocated-bytes": { - "start_step": 1, - "end_step": 9535, - "step_interval": 5, - "values": { - "1": 33307314176.0, - "5": 33307424768.0, - "10": 33307447296.0, - "15": 33307439104.0, - "20": 33307533312.0, - "25": 33307473920.0, - "30": 33307504640.0, - "35": 33307639808.0, - "40": 33307637760.0, - "45": 33307568128.0, - "50": 33307418624.0, - "55": 33307326464.0, - "60": 
33307346944.0, - "65": 33307490304.0, - "70": 33307312128.0, - "75": 33307308032.0, - "80": 33307404288.0, - "85": 33307314176.0, - "90": 33307285504.0, - "95": 33307392000.0, - "100": 33307260928.0, - "105": 33307129856.0, - "110": 33307037696.0, - "115": 33306703872.0, - "120": 33307355136.0, - "125": 33306873856.0, - "130": 33307017216.0, - "135": 33307305984.0, - "140": 33307004928.0, - "145": 33307121664.0, - "150": 33307312128.0, - "155": 33307176960.0, - "160": 33307103232.0, - "165": 33307174912.0, - "170": 33307832320.0, - "175": 33307199488.0, - "180": 33307355136.0, - "185": 33307355136.0, - "190": 33307131904.0, - "195": 33307256832.0, - "200": 33307326464.0, - "205": 33307492352.0, - "210": 33307500544.0, - "215": 33307086848.0, - "220": 33306857472.0, - "225": 33306933248.0, - "230": 33307092992.0, - "235": 33307183104.0, - "240": 33307303936.0, - "245": 33307426816.0, - "250": 33307308032.0, - "255": 33307295744.0, - "260": 33306767360.0, - "265": 33307461632.0, - "270": 33307467776.0, - "275": 33307469824.0, - "280": 33307254784.0, - "285": 33307947008.0, - "290": 33307191296.0, - "295": 33308014592.0, - "300": 33307856896.0, - "305": 33308340224.0, - "310": 33307815936.0, - "315": 33307181056.0, - "320": 33307512832.0, - "325": 33307488256.0, - "330": 33307977728.0, - "335": 33307947008.0, - "340": 33308606464.0, - "345": 33308037120.0, - "350": 33307693056.0, - "355": 33308000256.0, - "360": 33307348992.0, - "365": 33307451392.0, - "370": 33308000256.0, - "375": 33307283456.0, - "380": 33307570176.0, - "385": 33307860992.0, - "390": 33307416576.0, - "395": 33307031552.0, - "400": 33307246592.0, - "405": 33307676672.0, - "410": 33306935296.0, - "415": 33307752448.0, - "420": 33307529216.0, - "425": 33307314176.0, - "430": 33306988544.0, - "435": 33307455488.0, - "440": 33307369472.0, - "445": 33307709440.0, - "450": 33307588608.0, - "455": 33306963968.0, - "460": 33307193344.0, - "465": 33306845184.0, - "470": 33307766784.0, - "475": 33306464256.0, 
- "480": 33307566080.0, - "485": 33307682816.0, - "490": 33307389952.0, - "495": 33307179008.0, - "500": 33307969536.0, - "505": 33307629568.0, - "510": 33308192768.0, - "515": 33307279360.0, - "520": 33306544128.0, - "525": 33307265024.0, - "530": 33307025408.0, - "535": 33307648000.0, - "540": 33307582464.0, - "545": 33307297792.0, - "550": 33307396096.0, - "555": 33307301888.0, - "560": 33307899904.0, - "565": 33307379712.0, - "570": 33307553792.0, - "575": 33307136000.0, - "580": 33305892864.0, - "585": 33306945536.0, - "590": 33307629568.0, - "595": 33307860992.0, - "600": 33306873856.0, - "605": 33307357184.0, - "610": 33306556416.0, - "615": 33306349568.0, - "620": 33307791360.0, - "625": 33306378240.0, - "630": 33307168768.0, - "635": 33306767360.0, - "640": 33306116096.0, - "645": 33308092416.0, - "650": 33307277312.0, - "655": 33307131904.0, - "660": 33308485632.0, - "665": 33307334656.0, - "670": 33307959296.0, - "675": 33307701248.0, - "680": 33306863616.0, - "685": 33306697728.0, - "690": 33307863040.0, - "695": 33307293696.0, - "700": 33306263552.0, - "705": 33306955776.0, - "710": 33308225536.0, - "715": 33307174912.0, - "720": 33307107328.0, - "725": 33307324416.0, - "730": 33308231680.0, - "735": 33307224064.0, - "740": 33307815936.0, - "745": 33307938816.0, - "750": 33307779072.0, - "755": 33308463104.0, - "760": 33306349568.0, - "765": 33308266496.0, - "770": 33306603520.0, - "775": 33307424768.0, - "780": 33308608512.0, - "785": 33307969536.0, - "790": 33308188672.0, - "795": 33307656192.0, - "800": 33307547648.0, - "805": 33307619328.0, - "810": 33307910144.0, - "815": 33307170816.0, - "820": 33307029504.0, - "825": 33307443200.0, - "830": 33307422720.0, - "835": 33307262976.0, - "840": 33307613184.0, - "845": 33307928576.0, - "850": 33306238976.0, - "855": 33307396096.0, - "860": 33307938816.0, - "865": 33307701248.0, - "870": 33307940864.0, - "875": 33307545600.0, - "880": 33307527168.0, - "885": 33307336704.0, - "890": 33308262400.0, - 
"895": 33307717632.0, - "900": 33306474496.0, - "905": 33307480064.0, - "910": 33307725824.0, - "915": 33308303360.0, - "920": 33307770880.0, - "925": 33307566080.0, - "930": 33307451392.0, - "935": 33307975680.0, - "940": 33306320896.0, - "945": 33306429440.0, - "950": 33307136000.0, - "955": 33307846656.0, - "960": 33307611136.0, - "965": 33307465728.0, - "970": 33308293120.0, - "975": 33307078656.0, - "980": 33307568128.0, - "985": 33307080704.0, - "990": 33307367424.0, - "995": 33306861568.0, - "1000": 33307889664.0, - "1005": 33305956352.0, - "1010": 33307508736.0, - "1015": 33306671104.0, - "1020": 33306669056.0, - "1025": 33306509312.0, - "1030": 33307117568.0, - "1035": 33308332032.0, - "1040": 33307353088.0, - "1045": 33308368896.0, - "1050": 33306615808.0, - "1055": 33306802176.0, - "1060": 33307103232.0, - "1065": 33307404288.0, - "1070": 33307070464.0, - "1075": 33308188672.0, - "1080": 33307011072.0, - "1085": 33307027456.0, - "1090": 33308086272.0, - "1095": 33307086848.0, - "1100": 33307287552.0, - "1105": 33308497920.0, - "1110": 33307461632.0, - "1115": 33307533312.0, - "1120": 33307777024.0, - "1125": 33307809792.0, - "1130": 33307484160.0, - "1135": 33308082176.0, - "1140": 33307029504.0, - "1145": 33307432960.0, - "1150": 33307574272.0, - "1155": 33307551744.0, - "1160": 33307561984.0, - "1165": 33307086848.0, - "1170": 33307856896.0, - "1175": 33306976256.0, - "1180": 33308237824.0, - "1185": 33307875328.0, - "1190": 33307369472.0, - "1195": 33308231680.0, - "1200": 33307197440.0, - "1205": 33307480064.0, - "1210": 33305866240.0, - "1215": 33308297216.0, - "1220": 33307451392.0, - "1225": 33307518976.0, - "1230": 33307688960.0, - "1235": 33307901952.0, - "1240": 33307394048.0, - "1245": 33307842560.0, - "1250": 33307281408.0, - "1255": 33306906624.0, - "1260": 33307301888.0, - "1265": 33307674624.0, - "1270": 33307150336.0, - "1275": 33307686912.0, - "1280": 33307430912.0, - "1285": 33306974208.0, - "1290": 33307529216.0, - "1295": 
33307901952.0, - "1300": 33307002880.0, - "1305": 33308059648.0, - "1310": 33306939392.0, - "1315": 33307336704.0, - "1320": 33307262976.0, - "1325": 33307011072.0, - "1330": 33306550272.0, - "1335": 33307181056.0, - "1340": 33307406336.0, - "1345": 33307463680.0, - "1350": 33308135424.0, - "1355": 33307480064.0, - "1360": 33307533312.0, - "1365": 33307066368.0, - "1370": 33306595328.0, - "1375": 33307891712.0, - "1380": 33307830272.0, - "1385": 33308487680.0, - "1390": 33306521600.0, - "1395": 33307338752.0, - "1400": 33308430336.0, - "1405": 33307768832.0, - "1410": 33308041216.0, - "1415": 33307797504.0, - "1420": 33306605568.0, - "1425": 33307240448.0, - "1430": 33307322368.0, - "1435": 33307559936.0, - "1440": 33306662912.0, - "1445": 33307058176.0, - "1450": 33307705344.0, - "1455": 33307291648.0, - "1460": 33306861568.0, - "1465": 33306312704.0, - "1470": 33307394048.0, - "1475": 33307211776.0, - "1480": 33306527744.0, - "1485": 33307361280.0, - "1490": 33307693056.0, - "1495": 33307271168.0, - "1500": 33306820608.0, - "1505": 33307092992.0, - "1510": 33306624000.0, - "1515": 33307097088.0, - "1520": 33306931200.0, - "1525": 33307635712.0, - "1530": 33307353088.0, - "1535": 33306468352.0, - "1540": 33307172864.0, - "1545": 33307693056.0, - "1550": 33307938816.0, - "1555": 33307832320.0, - "1560": 33308182528.0, - "1565": 33307099136.0, - "1570": 33306798080.0, - "1575": 33307492352.0, - "1580": 33307688960.0, - "1585": 33307326464.0, - "1590": 33306988544.0, - "1595": 33306818560.0, - "1600": 33307836416.0, - "1605": 33307590656.0, - "1610": 33307168768.0, - "1615": 33306931200.0, - "1620": 33306732544.0, - "1625": 33308260352.0, - "1630": 33308227584.0, - "1635": 33306957824.0, - "1640": 33306759168.0, - "1645": 33306021888.0, - "1650": 33306689536.0, - "1655": 33307332608.0, - "1660": 33307170816.0, - "1665": 33306583040.0, - "1670": 33307535360.0, - "1675": 33306912768.0, - "1680": 33306675200.0, - "1685": 33307774976.0, - "1690": 33307783168.0, - "1695": 
33307971584.0, - "1700": 33307623424.0, - "1705": 33307652096.0, - "1710": 33307731968.0, - "1715": 33308090368.0, - "1720": 33307172864.0, - "1725": 33307672576.0, - "1730": 33306355712.0, - "1735": 33308229632.0, - "1740": 33307142144.0, - "1745": 33308151808.0, - "1750": 33306898432.0, - "1755": 33307105280.0, - "1760": 33308000256.0, - "1765": 33307750400.0, - "1770": 33308450816.0, - "1775": 33308184576.0, - "1780": 33308129280.0, - "1785": 33307936768.0, - "1790": 33307238400.0, - "1795": 33307922432.0, - "1800": 33306900480.0, - "1805": 33307203584.0, - "1810": 33306923008.0, - "1815": 33307617280.0, - "1820": 33307664384.0, - "1825": 33308440576.0, - "1830": 33306843136.0, - "1835": 33307979776.0, - "1840": 33307588608.0, - "1845": 33307602944.0, - "1850": 33307774976.0, - "1855": 33307529216.0, - "1860": 33307054080.0, - "1865": 33307097088.0, - "1870": 33307373568.0, - "1875": 33306265600.0, - "1880": 33307275264.0, - "1885": 33307224064.0, - "1890": 33307324416.0, - "1895": 33307283456.0, - "1900": 33306810368.0, - "1905": 33307191296.0, - "1910": 33306884096.0, - "1915": 33308162048.0, - "1920": 33307664384.0, - "1925": 33305972736.0, - "1930": 33308504064.0, - "1935": 33307377664.0, - "1940": 33307119616.0, - "1945": 33307416576.0, - "1950": 33307746304.0, - "1955": 33307420672.0, - "1960": 33308073984.0, - "1965": 33307148288.0, - "1970": 33306775552.0, - "1975": 33308207104.0, - "1980": 33307473920.0, - "1985": 33307095040.0, - "1990": 33307527168.0, - "1995": 33307037696.0, - "2000": 33308801024.0, - "2005": 33307985920.0, - "2010": 33307516928.0, - "2015": 33307604992.0, - "2020": 33307406336.0, - "2025": 33307719680.0, - "2030": 33308381184.0, - "2035": 33307914240.0, - "2040": 33307324416.0, - "2045": 33306476544.0, - "2050": 33308246016.0, - "2055": 33307430912.0, - "2060": 33307912192.0, - "2065": 33307543552.0, - "2070": 33307670528.0, - "2075": 33307482112.0, - "2080": 33307871232.0, - "2085": 33306722304.0, - "2090": 33307549696.0, - "2095": 
33307260928.0, - "2100": 33306765312.0, - "2105": 33306847232.0, - "2110": 33307332608.0, - "2115": 33306480640.0, - "2120": 33307168768.0, - "2125": 33307277312.0, - "2130": 33307314176.0, - "2135": 33307752448.0, - "2140": 33306710016.0, - "2145": 33307478016.0, - "2150": 33307729920.0, - "2155": 33306943488.0, - "2160": 33307508736.0, - "2165": 33307049984.0, - "2170": 33307158528.0, - "2175": 33306599424.0, - "2180": 33307054080.0, - "2185": 33307017216.0, - "2190": 33307119616.0, - "2195": 33307289600.0, - "2200": 33306726400.0, - "2205": 33306636288.0, - "2210": 33307639808.0, - "2215": 33308215296.0, - "2220": 33307314176.0, - "2225": 33307437056.0, - "2230": 33306318848.0, - "2235": 33306941440.0, - "2240": 33308131328.0, - "2245": 33307707392.0, - "2250": 33307256832.0, - "2255": 33306845184.0, - "2260": 33307736064.0, - "2265": 33308620800.0, - "2270": 33307357184.0, - "2275": 33308151808.0, - "2280": 33307981824.0, - "2285": 33307922432.0, - "2290": 33306767360.0, - "2295": 33307670528.0, - "2300": 33307179008.0, - "2305": 33307545600.0, - "2310": 33307924480.0, - "2315": 33307396096.0, - "2320": 33307725824.0, - "2325": 33308024832.0, - "2330": 33307793408.0, - "2335": 33307019264.0, - "2340": 33307162624.0, - "2345": 33307934720.0, - "2350": 33306232832.0, - "2355": 33307719680.0, - "2360": 33307375616.0, - "2365": 33306537984.0, - "2370": 33307279360.0, - "2375": 33308131328.0, - "2380": 33307136000.0, - "2385": 33307490304.0, - "2390": 33307316224.0, - "2395": 33306587136.0, - "2400": 33307594752.0, - "2405": 33308393472.0, - "2410": 33306726400.0, - "2415": 33307506688.0, - "2420": 33308407808.0, - "2425": 33307942912.0, - "2430": 33308116992.0, - "2435": 33307308032.0, - "2440": 33308362752.0, - "2445": 33308071936.0, - "2450": 33307740160.0, - "2455": 33307959296.0, - "2460": 33308258304.0, - "2465": 33307299840.0, - "2470": 33307056128.0, - "2475": 33307224064.0, - "2480": 33307713536.0, - "2485": 33306550272.0, - "2490": 33306992640.0, - "2495": 
33307232256.0, - "2500": 33307095040.0, - "2505": 33307107328.0, - "2510": 33307488256.0, - "2515": 33308360704.0, - "2520": 33307369472.0, - "2525": 33306959872.0, - "2530": 33307258880.0, - "2535": 33307082752.0, - "2540": 33308633088.0, - "2545": 33308542976.0, - "2550": 33308002304.0, - "2555": 33307961344.0, - "2560": 33307328512.0, - "2565": 33308299264.0, - "2570": 33307770880.0, - "2575": 33307877376.0, - "2580": 33307990016.0, - "2585": 33308016640.0, - "2590": 33308135424.0, - "2595": 33307617280.0, - "2600": 33306667008.0, - "2605": 33307422720.0, - "2610": 33306683392.0, - "2615": 33308669952.0, - "2620": 33308616704.0, - "2625": 33308366848.0, - "2630": 33307574272.0, - "2635": 33308166144.0, - "2640": 33307983872.0, - "2645": 33307609088.0, - "2650": 33307807744.0, - "2655": 33306955776.0, - "2660": 33307273216.0, - "2665": 33307709440.0, - "2670": 33307693056.0, - "2675": 33307731968.0, - "2680": 33308227584.0, - "2685": 33307742208.0, - "2690": 33307734016.0, - "2695": 33307424768.0, - "2700": 33306644480.0, - "2705": 33306300416.0, - "2710": 33307881472.0, - "2715": 33307488256.0, - "2720": 33307318272.0, - "2725": 33307604992.0, - "2730": 33306710016.0, - "2735": 33308049408.0, - "2740": 33307437056.0, - "2745": 33307572224.0, - "2750": 33307136000.0, - "2755": 33307584512.0, - "2760": 33307355136.0, - "2765": 33307713536.0, - "2770": 33308000256.0, - "2775": 33306460160.0, - "2780": 33306923008.0, - "2785": 33307017216.0, - "2790": 33306720256.0, - "2795": 33307785216.0, - "2800": 33307234304.0, - "2805": 33306685440.0, - "2810": 33307469824.0, - "2815": 33308069888.0, - "2820": 33306460160.0, - "2825": 33307467776.0, - "2830": 33307666432.0, - "2835": 33307371520.0, - "2840": 33306904576.0, - "2845": 33308061696.0, - "2850": 33308520448.0, - "2855": 33307695104.0, - "2860": 33308487680.0, - "2865": 33307058176.0, - "2870": 33307303936.0, - "2875": 33307324416.0, - "2880": 33306968064.0, - "2885": 33307641856.0, - "2890": 33307785216.0, - "2895": 
33308221440.0, - "2900": 33307596800.0, - "2905": 33307533312.0, - "2910": 33307459584.0, - "2915": 33307799552.0, - "2920": 33308461056.0, - "2925": 33307938816.0, - "2930": 33308268544.0, - "2935": 33308594176.0, - "2940": 33308170240.0, - "2945": 33307578368.0, - "2950": 33307590656.0, - "2955": 33308131328.0, - "2960": 33306839040.0, - "2965": 33307111424.0, - "2970": 33307570176.0, - "2975": 33307766784.0, - "2980": 33307600896.0, - "2985": 33307123712.0, - "2990": 33307641856.0, - "2995": 33307527168.0, - "3000": 33307863040.0, - "3005": 33306927104.0, - "3010": 33307738112.0, - "3015": 33308217344.0, - "3020": 33306697728.0, - "3025": 33306970112.0, - "3030": 33308127232.0, - "3035": 33308213248.0, - "3040": 33307578368.0, - "3045": 33308327936.0, - "3050": 33306910720.0, - "3055": 33307004928.0, - "3060": 33307602944.0, - "3065": 33306970112.0, - "3070": 33307985920.0, - "3075": 33306945536.0, - "3080": 33307312128.0, - "3085": 33306533888.0, - "3090": 33306933248.0, - "3095": 33307906048.0, - "3100": 33306793984.0, - "3105": 33307127808.0, - "3110": 33308295168.0, - "3115": 33307295744.0, - "3120": 33307897856.0, - "3125": 33307066368.0, - "3130": 33307781120.0, - "3135": 33307762688.0, - "3140": 33308196864.0, - "3145": 33306904576.0, - "3150": 33307140096.0, - "3155": 33306660864.0, - "3160": 33307514880.0, - "3165": 33307246592.0, - "3170": 33307613184.0, - "3175": 33307375616.0, - "3180": 33307551744.0, - "3185": 33307842560.0, - "3190": 33308342272.0, - "3195": 33308350464.0, - "3200": 33307799552.0, - "3205": 33307099136.0, - "3210": 33306869760.0, - "3215": 33307678720.0, - "3220": 33307111424.0, - "3225": 33307146240.0, - "3230": 33306972160.0, - "3235": 33307387904.0, - "3240": 33307521024.0, - "3245": 33307287552.0, - "3250": 33307523072.0, - "3255": 33307639808.0, - "3260": 33307092992.0, - "3265": 33308338176.0, - "3270": 33307273216.0, - "3275": 33307713536.0, - "3280": 33307719680.0, - "3285": 33308049408.0, - "3290": 33307484160.0, - "3295": 
33307594752.0, - "3300": 33307228160.0, - "3305": 33306580992.0, - "3310": 33307541504.0, - "3315": 33307211776.0, - "3320": 33307324416.0, - "3325": 33306615808.0, - "3330": 33307777024.0, - "3335": 33308135424.0, - "3340": 33307351040.0, - "3345": 33307131904.0, - "3350": 33307031552.0, - "3355": 33307791360.0, - "3360": 33307410432.0, - "3365": 33307090944.0, - "3370": 33306187776.0, - "3375": 33307113472.0, - "3380": 33308071936.0, - "3385": 33307717632.0, - "3390": 33306648576.0, - "3395": 33306781696.0, - "3400": 33307734016.0, - "3405": 33307570176.0, - "3410": 33307750400.0, - "3415": 33307920384.0, - "3420": 33308157952.0, - "3425": 33307500544.0, - "3430": 33307168768.0, - "3435": 33307645952.0, - "3440": 33307185152.0, - "3445": 33307459584.0, - "3450": 33306804224.0, - "3455": 33307662336.0, - "3460": 33306748928.0, - "3465": 33306497024.0, - "3470": 33306796032.0, - "3475": 33307947008.0, - "3480": 33308039168.0, - "3485": 33307676672.0, - "3490": 33306728448.0, - "3495": 33307115520.0, - "3500": 33306628096.0, - "3505": 33307537408.0, - "3510": 33306945536.0, - "3515": 33306902528.0, - "3520": 33307553792.0, - "3525": 33307590656.0, - "3530": 33307852800.0, - "3535": 33306773504.0, - "3540": 33307953152.0, - "3545": 33307463680.0, - "3550": 33307123712.0, - "3555": 33307738112.0, - "3560": 33307766784.0, - "3565": 33307088896.0, - "3570": 33306882048.0, - "3575": 33307443200.0, - "3580": 33306951680.0, - "3585": 33306841088.0, - "3590": 33308293120.0, - "3595": 33307723776.0, - "3600": 33307756544.0, - "3605": 33307930624.0, - "3610": 33307985920.0, - "3615": 33307222016.0, - "3620": 33307430912.0, - "3625": 33307148288.0, - "3630": 33306388480.0, - "3635": 33307035648.0, - "3640": 33307455488.0, - "3645": 33306906624.0, - "3650": 33307545600.0, - "3655": 33307336704.0, - "3660": 33306910720.0, - "3665": 33307623424.0, - "3670": 33306824704.0, - "3675": 33307590656.0, - "3680": 33307373568.0, - "3685": 33306505216.0, - "3690": 33307817984.0, - "3695": 
33306890240.0, - "3700": 33306802176.0, - "3705": 33306945536.0, - "3710": 33306904576.0, - "3715": 33307754496.0, - "3720": 33308395520.0, - "3725": 33308112896.0, - "3730": 33307652096.0, - "3735": 33307867136.0, - "3740": 33307805696.0, - "3745": 33308069888.0, - "3750": 33307826176.0, - "3755": 33306439680.0, - "3760": 33306849280.0, - "3765": 33307471872.0, - "3770": 33307095040.0, - "3775": 33307492352.0, - "3780": 33308141568.0, - "3785": 33307910144.0, - "3790": 33307656192.0, - "3795": 33307727872.0, - "3800": 33307246592.0, - "3805": 33307848704.0, - "3810": 33307490304.0, - "3815": 33307357184.0, - "3820": 33307346944.0, - "3825": 33307619328.0, - "3830": 33308102656.0, - "3835": 33306849280.0, - "3840": 33307678720.0, - "3845": 33307258880.0, - "3850": 33307686912.0, - "3855": 33307467776.0, - "3860": 33307471872.0, - "3865": 33307439104.0, - "3870": 33307676672.0, - "3875": 33306865664.0, - "3880": 33307232256.0, - "3885": 33307099136.0, - "3890": 33307854848.0, - "3895": 33306370048.0, - "3900": 33306900480.0, - "3905": 33306824704.0, - "3910": 33307361280.0, - "3915": 33306591232.0, - "3920": 33307213824.0, - "3925": 33306980352.0, - "3930": 33308110848.0, - "3935": 33307179008.0, - "3940": 33307379712.0, - "3945": 33307813888.0, - "3950": 33307277312.0, - "3955": 33307203584.0, - "3960": 33307234304.0, - "3965": 33307121664.0, - "3970": 33307303936.0, - "3975": 33307144192.0, - "3980": 33307869184.0, - "3985": 33307660288.0, - "3990": 33307779072.0, - "3995": 33307795456.0, - "4000": 33307131904.0, - "4005": 33307238400.0, - "4010": 33307875328.0, - "4015": 33306726400.0, - "4020": 33308227584.0, - "4025": 33307799552.0, - "4030": 33307318272.0, - "4035": 33308190720.0, - "4040": 33307932672.0, - "4045": 33307291648.0, - "4050": 33307959296.0, - "4055": 33307447296.0, - "4060": 33307486208.0, - "4065": 33308088320.0, - "4070": 33307183104.0, - "4075": 33307201536.0, - "4080": 33308184576.0, - "4085": 33306406912.0, - "4090": 33307891712.0, - "4095": 
33307031552.0, - "4100": 33308100608.0, - "4105": 33307258880.0, - "4110": 33307492352.0, - "4115": 33308344320.0, - "4120": 33306552320.0, - "4125": 33307611136.0, - "4130": 33306083328.0, - "4135": 33308463104.0, - "4140": 33307611136.0, - "4145": 33307455488.0, - "4150": 33307658240.0, - "4155": 33307133952.0, - "4160": 33308233728.0, - "4165": 33307408384.0, - "4170": 33306888192.0, - "4175": 33307852800.0, - "4180": 33307150336.0, - "4185": 33307127808.0, - "4190": 33307582464.0, - "4195": 33308610560.0, - "4200": 33308231680.0, - "4205": 33307906048.0, - "4210": 33308307456.0, - "4215": 33306363904.0, - "4220": 33306980352.0, - "4225": 33306318848.0, - "4230": 33307731968.0, - "4235": 33307142144.0, - "4240": 33307432960.0, - "4245": 33307097088.0, - "4250": 33307783168.0, - "4255": 33307365376.0, - "4260": 33306947584.0, - "4265": 33306611712.0, - "4270": 33306347520.0, - "4275": 33306624000.0, - "4280": 33307185152.0, - "4285": 33307922432.0, - "4290": 33307508736.0, - "4295": 33307658240.0, - "4300": 33308405760.0, - "4305": 33306474496.0, - "4310": 33307557888.0, - "4315": 33308307456.0, - "4320": 33307719680.0, - "4325": 33306824704.0, - "4330": 33307594752.0, - "4335": 33306144768.0, - "4340": 33307852800.0, - "4345": 33307342848.0, - "4350": 33308139520.0, - "4355": 33307713536.0, - "4360": 33307373568.0, - "4365": 33308065792.0, - "4370": 33306681344.0, - "4375": 33307770880.0, - "4380": 33307361280.0, - "4385": 33307086848.0, - "4390": 33307019264.0, - "4395": 33306986496.0, - "4400": 33307103232.0, - "4405": 33307664384.0, - "4410": 33307996160.0, - "4415": 33306990592.0, - "4420": 33306546176.0, - "4425": 33306904576.0, - "4430": 33307303936.0, - "4435": 33306763264.0, - "4440": 33308063744.0, - "4445": 33307242496.0, - "4450": 33307283456.0, - "4455": 33306654720.0, - "4460": 33307205632.0, - "4465": 33306867712.0, - "4470": 33307916288.0, - "4475": 33307791360.0, - "4480": 33308450816.0, - "4485": 33307547648.0, - "4490": 33307090944.0, - "4495": 
33307000832.0, - "4500": 33306935296.0, - "4505": 33307099136.0, - "4510": 33307525120.0, - "4515": 33307367424.0, - "4520": 33307813888.0, - "4525": 33307715584.0, - "4530": 33307901952.0, - "4535": 33307174912.0, - "4540": 33306880000.0, - "4545": 33307138048.0, - "4550": 33306873856.0, - "4555": 33306316800.0, - "4560": 33305849856.0, - "4565": 33307187200.0, - "4570": 33307260928.0, - "4575": 33307410432.0, - "4580": 33307201536.0, - "4585": 33306920960.0, - "4590": 33307355136.0, - "4595": 33307346944.0, - "4600": 33307856896.0, - "4605": 33307752448.0, - "4610": 33307095040.0, - "4615": 33306286080.0, - "4620": 33306699776.0, - "4625": 33308069888.0, - "4630": 33307439104.0, - "4635": 33306900480.0, - "4640": 33307076608.0, - "4645": 33308160000.0, - "4650": 33307758592.0, - "4655": 33307865088.0, - "4660": 33306255360.0, - "4665": 33307641856.0, - "4670": 33307912192.0, - "4675": 33306603520.0, - "4680": 33307799552.0, - "4685": 33307488256.0, - "4690": 33307394048.0, - "4695": 33306763264.0, - "4700": 33307873280.0, - "4705": 33308106752.0, - "4710": 33307617280.0, - "4715": 33307047936.0, - "4720": 33307901952.0, - "4725": 33307793408.0, - "4730": 33308123136.0, - "4735": 33307451392.0, - "4740": 33307623424.0, - "4745": 33306857472.0, - "4750": 33308436480.0, - "4755": 33307260928.0, - "4760": 33307975680.0, - "4765": 33307965440.0, - "4770": 33306859520.0, - "4775": 33307922432.0, - "4780": 33306978304.0, - "4785": 33306869760.0, - "4790": 33307084800.0, - "4795": 33307226112.0, - "4800": 33307961344.0, - "4805": 33308334080.0, - "4810": 33305587712.0, - "4815": 33307928576.0, - "4820": 33307875328.0, - "4825": 33306957824.0, - "4830": 33307797504.0, - "4835": 33306116096.0, - "4840": 33307654144.0, - "4845": 33307131904.0, - "4850": 33308055552.0, - "4855": 33305792512.0, - "4860": 33307402240.0, - "4865": 33307086848.0, - "4870": 33307637760.0, - "4875": 33307789312.0, - "4880": 33307701248.0, - "4885": 33308010496.0, - "4890": 33307039744.0, - "4895": 
33307369472.0, - "4900": 33307127808.0, - "4905": 33306988544.0, - "4910": 33308276736.0, - "4915": 33307090944.0, - "4920": 33307015168.0, - "4925": 33308043264.0, - "4930": 33307607040.0, - "4935": 33308209152.0, - "4940": 33307725824.0, - "4945": 33307985920.0, - "4950": 33307582464.0, - "4955": 33307297792.0, - "4960": 33307639808.0, - "4965": 33307445248.0, - "4970": 33306869760.0, - "4975": 33306787840.0, - "4980": 33307099136.0, - "4985": 33307635712.0, - "4990": 33307406336.0, - "4995": 33307471872.0, - "5000": 33307375616.0, - "5005": 33307672576.0, - "5010": 33306970112.0, - "5015": 33307244544.0, - "5020": 33306966016.0, - "5025": 33307705344.0, - "5030": 33307463680.0, - "5035": 33306818560.0, - "5040": 33306972160.0, - "5045": 33308157952.0, - "5050": 33306376192.0, - "5055": 33307594752.0, - "5060": 33308471296.0, - "5065": 33307455488.0, - "5070": 33307301888.0, - "5075": 33307488256.0, - "5080": 33307910144.0, - "5085": 33307635712.0, - "5090": 33307406336.0, - "5095": 33307254784.0, - "5100": 33306828800.0, - "5105": 33307852800.0, - "5110": 33308258304.0, - "5115": 33307228160.0, - "5120": 33307955200.0, - "5125": 33305640960.0, - "5130": 33306683392.0, - "5135": 33307336704.0, - "5140": 33307834368.0, - "5145": 33307060224.0, - "5150": 33307023360.0, - "5155": 33307308032.0, - "5160": 33306664960.0, - "5165": 33307123712.0, - "5170": 33306935296.0, - "5175": 33308094464.0, - "5180": 33306566656.0, - "5185": 33306796032.0, - "5190": 33307545600.0, - "5195": 33308067840.0, - "5200": 33307754496.0, - "5205": 33307445248.0, - "5210": 33306785792.0, - "5215": 33307551744.0, - "5220": 33308188672.0, - "5225": 33307338752.0, - "5230": 33307283456.0, - "5235": 33306976256.0, - "5240": 33308041216.0, - "5245": 33308340224.0, - "5250": 33308153856.0, - "5255": 33307590656.0, - "5260": 33306896384.0, - "5265": 33308303360.0, - "5270": 33308796928.0, - "5275": 33307949056.0, - "5280": 33306157056.0, - "5285": 33307904000.0, - "5290": 33308143616.0, - "5295": 
33306533888.0, - "5300": 33307912192.0, - "5305": 33308338176.0, - "5310": 33308688384.0, - "5315": 33308045312.0, - "5320": 33306206208.0, - "5325": 33308219392.0, - "5330": 33308012544.0, - "5335": 33307602944.0, - "5340": 33306685440.0, - "5345": 33308209152.0, - "5350": 33307150336.0, - "5355": 33308176384.0, - "5360": 33307273216.0, - "5365": 33307850752.0, - "5370": 33307222016.0, - "5375": 33307803648.0, - "5380": 33307617280.0, - "5385": 33307179008.0, - "5390": 33307389952.0, - "5395": 33306927104.0, - "5400": 33307518976.0, - "5405": 33307400192.0, - "5410": 33307598848.0, - "5415": 33307846656.0, - "5420": 33307490304.0, - "5425": 33307459584.0, - "5430": 33307283456.0, - "5435": 33307453440.0, - "5440": 33307383808.0, - "5445": 33307117568.0, - "5450": 33307832320.0, - "5455": 33307582464.0, - "5460": 33306963968.0, - "5465": 33306947584.0, - "5470": 33307355136.0, - "5475": 33306748928.0, - "5480": 33306435584.0, - "5485": 33307590656.0, - "5490": 33307787264.0, - "5495": 33307568128.0, - "5500": 33307351040.0, - "5505": 33307568128.0, - "5510": 33307426816.0, - "5515": 33307451392.0, - "5520": 33307549696.0, - "5525": 33307000832.0, - "5530": 33307566080.0, - "5535": 33307664384.0, - "5540": 33306966016.0, - "5545": 33307781120.0, - "5550": 33307275264.0, - "5555": 33307269120.0, - "5560": 33307576320.0, - "5565": 33307377664.0, - "5570": 33307052032.0, - "5575": 33306978304.0, - "5580": 33307965440.0, - "5585": 33307494400.0, - "5590": 33308055552.0, - "5595": 33306943488.0, - "5600": 33306542080.0, - "5605": 33307680768.0, - "5610": 33308542976.0, - "5615": 33307826176.0, - "5620": 33308108800.0, - "5625": 33308225536.0, - "5630": 33308069888.0, - "5635": 33307760640.0, - "5640": 33307500544.0, - "5645": 33307930624.0, - "5650": 33306755072.0, - "5655": 33308192768.0, - "5660": 33308631040.0, - "5665": 33307418624.0, - "5670": 33307504640.0, - "5675": 33307715584.0, - "5680": 33307910144.0, - "5685": 33307996160.0, - "5690": 33307478016.0, - "5695": 
33308164096.0, - "5700": 33307906048.0, - "5705": 33307750400.0, - "5710": 33306779648.0, - "5715": 33307219968.0, - "5720": 33307750400.0, - "5725": 33307537408.0, - "5730": 33307262976.0, - "5735": 33306767360.0, - "5740": 33307508736.0, - "5745": 33306753024.0, - "5750": 33306636288.0, - "5755": 33306943488.0, - "5760": 33307553792.0, - "5765": 33307842560.0, - "5770": 33307047936.0, - "5775": 33307348992.0, - "5780": 33306361856.0, - "5785": 33307709440.0, - "5790": 33307832320.0, - "5795": 33307406336.0, - "5800": 33307056128.0, - "5805": 33307631616.0, - "5810": 33307766784.0, - "5815": 33307971584.0, - "5820": 33307447296.0, - "5825": 33307084800.0, - "5830": 33307324416.0, - "5835": 33307127808.0, - "5840": 33307729920.0, - "5845": 33307088896.0, - "5850": 33307635712.0, - "5855": 33307119616.0, - "5860": 33306703872.0, - "5865": 33307291648.0, - "5870": 33307613184.0, - "5875": 33307893760.0, - "5880": 33307893760.0, - "5885": 33307301888.0, - "5890": 33307830272.0, - "5895": 33306671104.0, - "5900": 33306488832.0, - "5905": 33308141568.0, - "5910": 33307373568.0, - "5915": 33307330560.0, - "5920": 33307656192.0, - "5925": 33307533312.0, - "5930": 33307848704.0, - "5935": 33307586560.0, - "5940": 33307602944.0, - "5945": 33307631616.0, - "5950": 33306615808.0, - "5955": 33307719680.0, - "5960": 33308553216.0, - "5965": 33308676096.0, - "5970": 33308313600.0, - "5975": 33306810368.0, - "5980": 33307222016.0, - "5985": 33307367424.0, - "5990": 33307119616.0, - "5995": 33307166720.0, - "6000": 33307822080.0, - "6005": 33307553792.0, - "6010": 33307756544.0, - "6015": 33306392576.0, - "6020": 33308116992.0, - "6025": 33307738112.0, - "6030": 33307459584.0, - "6035": 33306920960.0, - "6040": 33307701248.0, - "6045": 33307932672.0, - "6050": 33307496448.0, - "6055": 33307133952.0, - "6060": 33306370048.0, - "6065": 33307521024.0, - "6070": 33307244544.0, - "6075": 33306447872.0, - "6080": 33306963968.0, - "6085": 33307932672.0, - "6090": 33307293696.0, - "6095": 
33307058176.0, - "6100": 33307449344.0, - "6105": 33307613184.0, - "6110": 33307779072.0, - "6115": 33306832896.0, - "6120": 33306732544.0, - "6125": 33306488832.0, - "6130": 33308866560.0, - "6135": 33308000256.0, - "6140": 33307906048.0, - "6145": 33308504064.0, - "6150": 33307826176.0, - "6155": 33306906624.0, - "6160": 33307533312.0, - "6165": 33307578368.0, - "6170": 33307891712.0, - "6175": 33307537408.0, - "6180": 33307803648.0, - "6185": 33308125184.0, - "6190": 33307342848.0, - "6195": 33308135424.0, - "6200": 33306468352.0, - "6205": 33308026880.0, - "6210": 33308028928.0, - "6215": 33308157952.0, - "6220": 33307662336.0, - "6225": 33307344896.0, - "6230": 33308231680.0, - "6235": 33307148288.0, - "6240": 33308809216.0, - "6245": 33307017216.0, - "6250": 33307234304.0, - "6255": 33308430336.0, - "6260": 33307246592.0, - "6265": 33307418624.0, - "6270": 33308319744.0, - "6275": 33307090944.0, - "6280": 33307404288.0, - "6285": 33308227584.0, - "6290": 33307656192.0, - "6295": 33306865664.0, - "6300": 33307596800.0, - "6305": 33308192768.0, - "6310": 33307695104.0, - "6315": 33307361280.0, - "6320": 33306775552.0, - "6325": 33307557888.0, - "6330": 33307639808.0, - "6335": 33307820032.0, - "6340": 33307410432.0, - "6345": 33307410432.0, - "6350": 33308256256.0, - "6355": 33307082752.0, - "6360": 33306855424.0, - "6365": 33307418624.0, - "6370": 33307066368.0, - "6375": 33307891712.0, - "6380": 33307779072.0, - "6385": 33306128384.0, - "6390": 33306884096.0, - "6395": 33307060224.0, - "6400": 33307250688.0, - "6405": 33308135424.0, - "6410": 33308155904.0, - "6415": 33307101184.0, - "6420": 33306318848.0, - "6425": 33308065792.0, - "6430": 33307813888.0, - "6435": 33307842560.0, - "6440": 33308571648.0, - "6445": 33306138624.0, - "6450": 33307762688.0, - "6455": 33308119040.0, - "6460": 33308037120.0, - "6465": 33308467200.0, - "6470": 33307181056.0, - "6475": 33307246592.0, - "6480": 33306855424.0, - "6485": 33308440576.0, - "6490": 33307863040.0, - "6495": 
33306857472.0, - "6500": 33306529792.0, - "6505": 33307097088.0, - "6510": 33307842560.0, - "6515": 33307095040.0, - "6520": 33307848704.0, - "6525": 33307596800.0, - "6530": 33307117568.0, - "6535": 33307811840.0, - "6540": 33307645952.0, - "6545": 33307211776.0, - "6550": 33308196864.0, - "6555": 33307213824.0, - "6560": 33307326464.0, - "6565": 33306490880.0, - "6570": 33306877952.0, - "6575": 33307199488.0, - "6580": 33308370944.0, - "6585": 33307828224.0, - "6590": 33307871232.0, - "6595": 33307590656.0, - "6600": 33306578944.0, - "6605": 33307496448.0, - "6610": 33307912192.0, - "6615": 33307521024.0, - "6620": 33307189248.0, - "6625": 33306961920.0, - "6630": 33306800128.0, - "6635": 33306957824.0, - "6640": 33307762688.0, - "6645": 33306427392.0, - "6650": 33307672576.0, - "6655": 33305133056.0, - "6660": 33307598848.0, - "6665": 33306884096.0, - "6670": 33307500544.0, - "6675": 33307592704.0, - "6680": 33306923008.0, - "6685": 33307084800.0, - "6690": 33307402240.0, - "6695": 33307963392.0, - "6700": 33307336704.0, - "6705": 33306845184.0, - "6710": 33307230208.0, - "6715": 33306310656.0, - "6720": 33307834368.0, - "6725": 33308094464.0, - "6730": 33308327936.0, - "6735": 33308092416.0, - "6740": 33306873856.0, - "6745": 33308082176.0, - "6750": 33306112000.0, - "6755": 33306810368.0, - "6760": 33307394048.0, - "6765": 33307414528.0, - "6770": 33308286976.0, - "6775": 33308618752.0, - "6780": 33306904576.0, - "6785": 33308182528.0, - "6790": 33308057600.0, - "6795": 33307049984.0, - "6800": 33306744832.0, - "6805": 33307242496.0, - "6810": 33307176960.0, - "6815": 33307779072.0, - "6820": 33306849280.0, - "6825": 33307623424.0, - "6830": 33307887616.0, - "6835": 33307670528.0, - "6840": 33308348416.0, - "6845": 33308184576.0, - "6850": 33307727872.0, - "6855": 33307252736.0, - "6860": 33307680768.0, - "6865": 33306963968.0, - "6870": 33307099136.0, - "6875": 33307037696.0, - "6880": 33307635712.0, - "6885": 33307615232.0, - "6890": 33307652096.0, - "6895": 
33307369472.0, - "6900": 33307947008.0, - "6905": 33307334656.0, - "6910": 33306824704.0, - "6915": 33307537408.0, - "6920": 33306619904.0, - "6925": 33306408960.0, - "6930": 33306765312.0, - "6935": 33306609664.0, - "6940": 33307623424.0, - "6945": 33307160576.0, - "6950": 33307463680.0, - "6955": 33306507264.0, - "6960": 33307185152.0, - "6965": 33307019264.0, - "6970": 33307598848.0, - "6975": 33307435008.0, - "6980": 33307238400.0, - "6985": 33306222592.0, - "6990": 33308581888.0, - "6995": 33307254784.0, - "7000": 33308035072.0, - "7005": 33308233728.0, - "7010": 33307092992.0, - "7015": 33307193344.0, - "7020": 33307643904.0, - "7025": 33308274688.0, - "7030": 33307019264.0, - "7035": 33308454912.0, - "7040": 33308086272.0, - "7045": 33307277312.0, - "7050": 33307172864.0, - "7055": 33306599424.0, - "7060": 33307613184.0, - "7065": 33307031552.0, - "7070": 33306243072.0, - "7075": 33308037120.0, - "7080": 33306759168.0, - "7085": 33308033024.0, - "7090": 33307971584.0, - "7095": 33306873856.0, - "7100": 33308522496.0, - "7105": 33307363328.0, - "7110": 33308063744.0, - "7115": 33307770880.0, - "7120": 33307906048.0, - "7125": 33307443200.0, - "7130": 33307574272.0, - "7135": 33307541504.0, - "7140": 33306765312.0, - "7145": 33307854848.0, - "7150": 33306853376.0, - "7155": 33307856896.0, - "7160": 33307906048.0, - "7165": 33308184576.0, - "7170": 33308272640.0, - "7175": 33306417152.0, - "7180": 33307107328.0, - "7185": 33307860992.0, - "7190": 33307078656.0, - "7195": 33307494400.0, - "7200": 33307613184.0, - "7205": 33307680768.0, - "7210": 33307990016.0, - "7215": 33306822656.0, - "7220": 33306730496.0, - "7225": 33307539456.0, - "7230": 33307744256.0, - "7235": 33306136576.0, - "7240": 33307189248.0, - "7245": 33307236352.0, - "7250": 33306980352.0, - "7255": 33307832320.0, - "7260": 33307426816.0, - "7265": 33307340800.0, - "7270": 33307844608.0, - "7275": 33308094464.0, - "7280": 33308602368.0, - "7285": 33307498496.0, - "7290": 33307920384.0, - "7295": 
33307426816.0, - "7300": 33306392576.0, - "7305": 33306718208.0, - "7310": 33307260928.0, - "7315": 33307527168.0, - "7320": 33306963968.0, - "7325": 33308188672.0, - "7330": 33307799552.0, - "7335": 33307717632.0, - "7340": 33307238400.0, - "7345": 33307365376.0, - "7350": 33307314176.0, - "7355": 33307940864.0, - "7360": 33306284032.0, - "7365": 33307893760.0, - "7370": 33306275840.0, - "7375": 33307873280.0, - "7380": 33309245440.0, - "7385": 33306730496.0, - "7390": 33307758592.0, - "7395": 33306609664.0, - "7400": 33307652096.0, - "7405": 33306427392.0, - "7410": 33308524544.0, - "7415": 33307961344.0, - "7420": 33307242496.0, - "7425": 33307811840.0, - "7430": 33307119616.0, - "7435": 33307428864.0, - "7440": 33307709440.0, - "7445": 33308342272.0, - "7450": 33306980352.0, - "7455": 33307351040.0, - "7460": 33306730496.0, - "7465": 33306537984.0, - "7470": 33307664384.0, - "7475": 33308037120.0, - "7480": 33307179008.0, - "7485": 33308467200.0, - "7490": 33307822080.0, - "7495": 33306638336.0, - "7500": 33306689536.0, - "7505": 33307717632.0, - "7510": 33306789888.0, - "7515": 33307518976.0, - "7520": 33307260928.0, - "7525": 33307676672.0, - "7530": 33306916864.0, - "7535": 33306996736.0, - "7540": 33306566656.0, - "7545": 33306720256.0, - "7550": 33307584512.0, - "7555": 33307471872.0, - "7560": 33306736640.0, - "7565": 33306292224.0, - "7570": 33307066368.0, - "7575": 33306871808.0, - "7580": 33307324416.0, - "7585": 33307115520.0, - "7590": 33306341376.0, - "7595": 33307744256.0, - "7600": 33307482112.0, - "7605": 33308149760.0, - "7610": 33307525120.0, - "7615": 33307656192.0, - "7620": 33307224064.0, - "7625": 33307158528.0, - "7630": 33307742208.0, - "7635": 33308012544.0, - "7640": 33307049984.0, - "7645": 33308631040.0, - "7650": 33307865088.0, - "7655": 33308229632.0, - "7660": 33307043840.0, - "7665": 33307037696.0, - "7670": 33306791936.0, - "7675": 33307320320.0, - "7680": 33307293696.0, - "7685": 33307432960.0, - "7690": 33307103232.0, - "7695": 
33307568128.0, - "7700": 33306312704.0, - "7705": 33307795456.0, - "7710": 33307996160.0, - "7715": 33307133952.0, - "7720": 33308164096.0, - "7725": 33307254784.0, - "7730": 33307830272.0, - "7735": 33307721728.0, - "7740": 33307492352.0, - "7745": 33307783168.0, - "7750": 33306728448.0, - "7755": 33307734016.0, - "7760": 33308614656.0, - "7765": 33306791936.0, - "7770": 33308278784.0, - "7775": 33307873280.0, - "7780": 33307078656.0, - "7785": 33306990592.0, - "7790": 33307062272.0, - "7795": 33307680768.0, - "7800": 33306982400.0, - "7805": 33308090368.0, - "7810": 33307308032.0, - "7815": 33307078656.0, - "7820": 33307951104.0, - "7825": 33306480640.0, - "7830": 33307258880.0, - "7835": 33307891712.0, - "7840": 33307432960.0, - "7845": 33307066368.0, - "7850": 33306910720.0, - "7855": 33307938816.0, - "7860": 33307308032.0, - "7865": 33308264448.0, - "7870": 33307729920.0, - "7875": 33308129280.0, - "7880": 33308352512.0, - "7885": 33307398144.0, - "7890": 33306920960.0, - "7895": 33307156480.0, - "7900": 33308221440.0, - "7905": 33308047360.0, - "7910": 33306146816.0, - "7915": 33306910720.0, - "7920": 33307090944.0, - "7925": 33308264448.0, - "7930": 33307908096.0, - "7935": 33307465728.0, - "7940": 33307375616.0, - "7945": 33307848704.0, - "7950": 33308090368.0, - "7955": 33307043840.0, - "7960": 33307168768.0, - "7965": 33307846656.0, - "7970": 33306454016.0, - "7975": 33307635712.0, - "7980": 33307555840.0, - "7985": 33307131904.0, - "7990": 33306732544.0, - "7995": 33307430912.0, - "8000": 33307674624.0, - "8005": 33307746304.0, - "8010": 33308002304.0, - "8015": 33306906624.0, - "8020": 33307895808.0, - "8025": 33308231680.0, - "8030": 33307664384.0, - "8035": 33306888192.0, - "8040": 33308024832.0, - "8045": 33307693056.0, - "8050": 33306583040.0, - "8055": 33307201536.0, - "8060": 33307594752.0, - "8065": 33308260352.0, - "8070": 33307426816.0, - "8075": 33308108800.0, - "8080": 33308178432.0, - "8085": 33307308032.0, - "8090": 33306513408.0, - "8095": 
33306968064.0, - "8100": 33308413952.0, - "8105": 33308241920.0, - "8110": 33307471872.0, - "8115": 33307832320.0, - "8120": 33307193344.0, - "8125": 33307295744.0, - "8130": 33306775552.0, - "8135": 33307097088.0, - "8140": 33307865088.0, - "8145": 33306746880.0, - "8150": 33307023360.0, - "8155": 33306806272.0, - "8160": 33307373568.0, - "8165": 33307631616.0, - "8170": 33306769408.0, - "8175": 33308239872.0, - "8180": 33307240448.0, - "8185": 33307471872.0, - "8190": 33308184576.0, - "8195": 33307754496.0, - "8200": 33307459584.0, - "8205": 33307850752.0, - "8210": 33306810368.0, - "8215": 33306222592.0, - "8220": 33307795456.0, - "8225": 33308078080.0, - "8230": 33306132480.0, - "8235": 33308764160.0, - "8240": 33307432960.0, - "8245": 33307867136.0, - "8250": 33308260352.0, - "8255": 33308334080.0, - "8260": 33308233728.0, - "8265": 33308528640.0, - "8270": 33307699200.0, - "8275": 33306748928.0, - "8280": 33307635712.0, - "8285": 33308008448.0, - "8290": 33307590656.0, - "8295": 33308041216.0, - "8300": 33307516928.0, - "8305": 33307879424.0, - "8310": 33307576320.0, - "8315": 33308366848.0, - "8320": 33307496448.0, - "8325": 33307256832.0, - "8330": 33307680768.0, - "8335": 33306669056.0, - "8340": 33306990592.0, - "8345": 33307936768.0, - "8350": 33307955200.0, - "8355": 33307791360.0, - "8360": 33306640384.0, - "8365": 33307586560.0, - "8370": 33307648000.0, - "8375": 33306890240.0, - "8380": 33307764736.0, - "8385": 33307871232.0, - "8390": 33307023360.0, - "8395": 33307664384.0, - "8400": 33307510784.0, - "8405": 33307338752.0, - "8410": 33307316224.0, - "8415": 33307566080.0, - "8420": 33307891712.0, - "8425": 33307676672.0, - "8430": 33307693056.0, - "8435": 33306812416.0, - "8440": 33307762688.0, - "8445": 33307447296.0, - "8450": 33307426816.0, - "8455": 33306660864.0, - "8460": 33307385856.0, - "8465": 33308121088.0, - "8470": 33307664384.0, - "8475": 33307023360.0, - "8480": 33308082176.0, - "8485": 33307346944.0, - "8490": 33307471872.0, - "8495": 
33307889664.0, - "8500": 33307492352.0, - "8505": 33307502592.0, - "8510": 33307815936.0, - "8515": 33307983872.0, - "8520": 33306431488.0, - "8525": 33306537984.0, - "8530": 33307199488.0, - "8535": 33307848704.0, - "8540": 33307459584.0, - "8545": 33307432960.0, - "8550": 33307600896.0, - "8555": 33308553216.0, - "8560": 33307701248.0, - "8565": 33307799552.0, - "8570": 33307934720.0, - "8575": 33306324992.0, - "8580": 33307648000.0, - "8585": 33307951104.0, - "8590": 33308108800.0, - "8595": 33308037120.0, - "8600": 33308182528.0, - "8605": 33307410432.0, - "8610": 33308102656.0, - "8615": 33307342848.0, - "8620": 33306077184.0, - "8625": 33308153856.0, - "8630": 33307807744.0, - "8635": 33306734592.0, - "8640": 33307867136.0, - "8645": 33307129856.0, - "8650": 33307430912.0, - "8655": 33307545600.0, - "8660": 33307975680.0, - "8665": 33307822080.0, - "8670": 33307156480.0, - "8675": 33307758592.0, - "8680": 33308340224.0, - "8685": 33307357184.0, - "8690": 33308479488.0, - "8695": 33306523648.0, - "8700": 33307404288.0, - "8705": 33307791360.0, - "8710": 33308004352.0, - "8715": 33308108800.0, - "8720": 33307424768.0, - "8725": 33307564032.0, - "8730": 33306877952.0, - "8735": 33307199488.0, - "8740": 33307734016.0, - "8745": 33307248640.0, - "8750": 33307912192.0, - "8755": 33307215872.0, - "8760": 33308012544.0, - "8765": 33306640384.0, - "8770": 33307977728.0, - "8775": 33306624000.0, - "8780": 33307357184.0, - "8785": 33306353664.0, - "8790": 33307518976.0, - "8795": 33308178432.0, - "8800": 33307113472.0, - "8805": 33307045888.0, - "8810": 33307252736.0, - "8815": 33307430912.0, - "8820": 33307568128.0, - "8825": 33306791936.0, - "8830": 33307529216.0, - "8835": 33306691584.0, - "8840": 33306529792.0, - "8845": 33307303936.0, - "8850": 33307901952.0, - "8855": 33308196864.0, - "8860": 33307965440.0, - "8865": 33307971584.0, - "8870": 33306595328.0, - "8875": 33306419200.0, - "8880": 33307508736.0, - "8885": 33306345472.0, - "8890": 33307373568.0, - "8895": 
33307631616.0, - "8900": 33307330560.0, - "8905": 33308209152.0, - "8910": 33308155904.0, - "8915": 33306943488.0, - "8920": 33307381760.0, - "8925": 33307437056.0, - "8930": 33308041216.0, - "8935": 33307142144.0, - "8940": 33307768832.0, - "8945": 33308551168.0, - "8950": 33307682816.0, - "8955": 33307656192.0, - "8960": 33307787264.0, - "8965": 33306220544.0, - "8970": 33307693056.0, - "8975": 33307529216.0, - "8980": 33307027456.0, - "8985": 33308442624.0, - "8990": 33307588608.0, - "8995": 33308315648.0, - "9000": 33307787264.0, - "9005": 33307951104.0, - "9010": 33305649152.0, - "9015": 33307592704.0, - "9020": 33307033600.0, - "9025": 33307232256.0, - "9030": 33307793408.0, - "9035": 33307385856.0, - "9040": 33308012544.0, - "9045": 33307287552.0, - "9050": 33307701248.0, - "9055": 33306814464.0, - "9060": 33307975680.0, - "9065": 33307693056.0, - "9070": 33306888192.0, - "9075": 33307168768.0, - "9080": 33306818560.0, - "9085": 33307557888.0, - "9090": 33308200960.0, - "9095": 33306867712.0, - "9100": 33308563456.0, - "9105": 33306994688.0, - "9110": 33307004928.0, - "9115": 33307439104.0, - "9120": 33307340800.0, - "9125": 33307295744.0, - "9130": 33306771456.0, - "9135": 33307031552.0, - "9140": 33306497024.0, - "9145": 33307629568.0, - "9150": 33308002304.0, - "9155": 33307484160.0, - "9160": 33308100608.0, - "9165": 33307611136.0, - "9170": 33307897856.0, - "9175": 33307473920.0, - "9180": 33307977728.0, - "9185": 33307203584.0, - "9190": 33306693632.0, - "9195": 33306931200.0, - "9200": 33307779072.0, - "9205": 33307205632.0, - "9210": 33307637760.0, - "9215": 33307090944.0, - "9220": 33308454912.0, - "9225": 33307471872.0, - "9230": 33307322368.0, - "9235": 33307422720.0, - "9240": 33307242496.0, - "9245": 33308026880.0, - "9250": 33308203008.0, - "9255": 33307389952.0, - "9260": 33308825600.0, - "9265": 33306505216.0, - "9270": 33307426816.0, - "9275": 33307865088.0, - "9280": 33307435008.0, - "9285": 33307258880.0, - "9290": 33308000256.0, - "9295": 
33307498496.0, - "9300": 33307301888.0, - "9305": 33307674624.0, - "9310": 33307031552.0, - "9315": 33306327040.0, - "9320": 33306834944.0, - "9325": 33307971584.0, - "9330": 33307910144.0, - "9335": 33307213824.0, - "9340": 33307385856.0, - "9345": 33307385856.0, - "9350": 33308127232.0, - "9355": 33306615808.0, - "9360": 33306697728.0, - "9365": 33307463680.0, - "9370": 33306355712.0, - "9375": 33307219968.0, - "9380": 33307224064.0, - "9385": 33308024832.0, - "9390": 33307830272.0, - "9395": 33307535360.0, - "9400": 33307031552.0, - "9405": 33307418624.0, - "9410": 33306822656.0, - "9415": 33307267072.0, - "9420": 33306994688.0, - "9425": 33306892288.0, - "9430": 33307199488.0, - "9435": 33306980352.0, - "9440": 33306451968.0, - "9445": 33308420096.0, - "9450": 33306755072.0, - "9455": 33306341376.0, - "9460": 33308131328.0, - "9465": 33307023360.0, - "9470": 33308307456.0, - "9475": 33308221440.0, - "9480": 33308037120.0, - "9485": 33308055552.0, - "9490": 33307908096.0, - "9495": 33306486784.0, - "9500": 33306490880.0, - "9505": 33307967488.0, - "9510": 33307125760.0, - "9515": 33307242496.0, - "9520": 33307670528.0, - "9525": 33307496448.0, - "9530": 33307731968.0, - "9535": 33307435008.0 - } - }, - "mem-max-allocated-bytes": { - "start_step": 1, - "end_step": 9535, - "step_interval": 5, - "values": { - "1": 36905754624.0, - "5": 45014786048.0, - "10": 45173362688.0, - "15": 45173362688.0, - "20": 45251878912.0, - "25": 45286207488.0, - "30": 45286207488.0, - "35": 45288939520.0, - "40": 45288939520.0, - "45": 45288939520.0, - "50": 45288939520.0, - "55": 45288939520.0, - "60": 45288939520.0, - "65": 45288939520.0, - "70": 45288939520.0, - "75": 45288939520.0, - "80": 45288939520.0, - "85": 45288939520.0, - "90": 45288939520.0, - "95": 45288939520.0, - "100": 45288939520.0, - "105": 45288939520.0, - "110": 45299392512.0, - "115": 45314936832.0, - "120": 45378736128.0, - "125": 45428596736.0, - "130": 45428596736.0, - "135": 45445640192.0, - "140": 
45445640192.0, - "145": 45445640192.0, - "150": 45445640192.0, - "155": 45445640192.0, - "160": 45445640192.0, - "165": 45445640192.0, - "170": 45445640192.0, - "175": 45445640192.0, - "180": 45445640192.0, - "185": 45445640192.0, - "190": 45445640192.0, - "195": 45445640192.0, - "200": 45536641024.0, - "205": 45638885376.0, - "210": 45638885376.0, - "215": 45638885376.0, - "220": 45638885376.0, - "225": 45638885376.0, - "230": 45638885376.0, - "235": 45713887232.0, - "240": 45932376064.0, - "245": 45982269440.0, - "250": 45982269440.0, - "255": 45982269440.0, - "260": 46039670784.0, - "265": 46039670784.0, - "270": 46039670784.0, - "275": 46039670784.0, - "280": 46293884928.0, - "285": 46293884928.0, - "290": 46293884928.0, - "295": 46293884928.0, - "300": 46293884928.0, - "305": 46319267840.0, - "310": 46319267840.0, - "315": 46319267840.0, - "320": 46319267840.0, - "325": 46319267840.0, - "330": 46319267840.0, - "335": 46319267840.0, - "340": 46319267840.0, - "345": 46451261440.0, - "350": 46451261440.0, - "355": 46451261440.0, - "360": 46451261440.0, - "365": 46451261440.0, - "370": 46451261440.0, - "375": 46451261440.0, - "380": 46451261440.0, - "385": 46451261440.0, - "390": 46451261440.0, - "395": 46451261440.0, - "400": 46451261440.0, - "405": 46451261440.0, - "410": 46451261440.0, - "415": 46451261440.0, - "420": 46451261440.0, - "425": 46451261440.0, - "430": 46451261440.0, - "435": 46451261440.0, - "440": 46451261440.0, - "445": 46451261440.0, - "450": 46451261440.0, - "455": 46451261440.0, - "460": 46451261440.0, - "465": 46451261440.0, - "470": 46451261440.0, - "475": 46451261440.0, - "480": 46451261440.0, - "485": 46451261440.0, - "490": 46451261440.0, - "495": 46451261440.0, - "500": 46451261440.0, - "505": 46451261440.0, - "510": 46451261440.0, - "515": 46451261440.0, - "520": 46451261440.0, - "525": 46451261440.0, - "530": 46451261440.0, - "535": 46451261440.0, - "540": 46451261440.0, - "545": 46451261440.0, - "550": 46451261440.0, - "555": 
46451261440.0, - "560": 46451261440.0, - "565": 46451261440.0, - "570": 46451261440.0, - "575": 46451261440.0, - "580": 46451261440.0, - "585": 46451261440.0, - "590": 46451261440.0, - "595": 46451261440.0, - "600": 46451261440.0, - "605": 46451261440.0, - "610": 46451261440.0, - "615": 46451261440.0, - "620": 46451261440.0, - "625": 46451261440.0, - "630": 46451261440.0, - "635": 46451261440.0, - "640": 46451261440.0, - "645": 46451261440.0, - "650": 46451261440.0, - "655": 46451261440.0, - "660": 46451261440.0, - "665": 46451261440.0, - "670": 46451261440.0, - "675": 46451261440.0, - "680": 46451261440.0, - "685": 46451261440.0, - "690": 46451261440.0, - "695": 46451261440.0, - "700": 46451261440.0, - "705": 46451261440.0, - "710": 46451261440.0, - "715": 46451261440.0, - "720": 46451261440.0, - "725": 46451261440.0, - "730": 46451261440.0, - "735": 46451261440.0, - "740": 46451261440.0, - "745": 46451261440.0, - "750": 46451261440.0, - "755": 46451261440.0, - "760": 46451261440.0, - "765": 46451261440.0, - "770": 46451261440.0, - "775": 46451261440.0, - "780": 46451261440.0, - "785": 46451261440.0, - "790": 46451261440.0, - "795": 46451261440.0, - "800": 46451261440.0, - "805": 46451261440.0, - "810": 46451261440.0, - "815": 46451261440.0, - "820": 46451261440.0, - "825": 46451261440.0, - "830": 46451261440.0, - "835": 46451261440.0, - "840": 46451261440.0, - "845": 46451261440.0, - "850": 46451261440.0, - "855": 46451261440.0, - "860": 46451261440.0, - "865": 46451261440.0, - "870": 46451261440.0, - "875": 46451261440.0, - "880": 46451261440.0, - "885": 46451261440.0, - "890": 46451261440.0, - "895": 46451261440.0, - "900": 46451261440.0, - "905": 46451261440.0, - "910": 46451261440.0, - "915": 46451261440.0, - "920": 46451261440.0, - "925": 46451261440.0, - "930": 46451261440.0, - "935": 46451261440.0, - "940": 46451261440.0, - "945": 46451261440.0, - "950": 46451261440.0, - "955": 46451261440.0, - "960": 45564735488.0, - "965": 45952081920.0, - "970": 
45952081920.0, - "975": 46005657600.0, - "980": 46005657600.0, - "985": 46005657600.0, - "990": 46005657600.0, - "995": 46169923584.0, - "1000": 46169923584.0, - "1005": 46169923584.0, - "1010": 46169923584.0, - "1015": 46169923584.0, - "1020": 46169923584.0, - "1025": 46169923584.0, - "1030": 46169923584.0, - "1035": 46169923584.0, - "1040": 46169923584.0, - "1045": 46169923584.0, - "1050": 46169923584.0, - "1055": 46169923584.0, - "1060": 46169923584.0, - "1065": 46169923584.0, - "1070": 46169923584.0, - "1075": 46169923584.0, - "1080": 46169923584.0, - "1085": 46169923584.0, - "1090": 46169923584.0, - "1095": 46169923584.0, - "1100": 46169923584.0, - "1105": 46169923584.0, - "1110": 46169923584.0, - "1115": 46169923584.0, - "1120": 46169923584.0, - "1125": 46169923584.0, - "1130": 46169923584.0, - "1135": 46169923584.0, - "1140": 46169923584.0, - "1145": 46169923584.0, - "1150": 46169923584.0, - "1155": 46169923584.0, - "1160": 46169923584.0, - "1165": 46169923584.0, - "1170": 46169923584.0, - "1175": 46169923584.0, - "1180": 46192005120.0, - "1185": 46192005120.0, - "1190": 46192005120.0, - "1195": 46192005120.0, - "1200": 46192005120.0, - "1205": 46192005120.0, - "1210": 46192005120.0, - "1215": 46192005120.0, - "1220": 46192005120.0, - "1225": 46192005120.0, - "1230": 46192005120.0, - "1235": 46192005120.0, - "1240": 46192005120.0, - "1245": 46192005120.0, - "1250": 46192005120.0, - "1255": 46192005120.0, - "1260": 46192005120.0, - "1265": 46192005120.0, - "1270": 46192005120.0, - "1275": 46192005120.0, - "1280": 46192005120.0, - "1285": 46192005120.0, - "1290": 46192005120.0, - "1295": 46192005120.0, - "1300": 46192005120.0, - "1305": 46192005120.0, - "1310": 46192005120.0, - "1315": 46192005120.0, - "1320": 46192005120.0, - "1325": 46192005120.0, - "1330": 46192005120.0, - "1335": 46192005120.0, - "1340": 46192005120.0, - "1345": 46192005120.0, - "1350": 46192005120.0, - "1355": 46192005120.0, - "1360": 46192005120.0, - "1365": 46192005120.0, - "1370": 
46192005120.0, - "1375": 46192005120.0, - "1380": 46192005120.0, - "1385": 46192005120.0, - "1390": 46192005120.0, - "1395": 46192005120.0, - "1400": 46192005120.0, - "1405": 46192005120.0, - "1410": 46192005120.0, - "1415": 46192005120.0, - "1420": 46192005120.0, - "1425": 46192005120.0, - "1430": 46192005120.0, - "1435": 46192005120.0, - "1440": 46192005120.0, - "1445": 46192005120.0, - "1450": 46192005120.0, - "1455": 46192005120.0, - "1460": 46192005120.0, - "1465": 46192005120.0, - "1470": 46192005120.0, - "1475": 46192005120.0, - "1480": 46192005120.0, - "1485": 46192005120.0, - "1490": 46192005120.0, - "1495": 46192005120.0, - "1500": 46192005120.0, - "1505": 46192005120.0, - "1510": 46192005120.0, - "1515": 46192005120.0, - "1520": 46192005120.0, - "1525": 46192005120.0, - "1530": 46192005120.0, - "1535": 46192005120.0, - "1540": 46192005120.0, - "1545": 46192005120.0, - "1550": 46260322304.0, - "1555": 46260322304.0, - "1560": 46260322304.0, - "1565": 46260322304.0, - "1570": 46260322304.0, - "1575": 46260322304.0, - "1580": 46260322304.0, - "1585": 46260322304.0, - "1590": 46260322304.0, - "1595": 46260322304.0, - "1600": 46260322304.0, - "1605": 46260322304.0, - "1610": 46260322304.0, - "1615": 46260322304.0, - "1620": 46260322304.0, - "1625": 46260322304.0, - "1630": 46260322304.0, - "1635": 46260322304.0, - "1640": 46260322304.0, - "1645": 46260322304.0, - "1650": 46260322304.0, - "1655": 46260322304.0, - "1660": 46260322304.0, - "1665": 46260322304.0, - "1670": 46260322304.0, - "1675": 46260322304.0, - "1680": 46260322304.0, - "1685": 46260322304.0, - "1690": 46260322304.0, - "1695": 46260322304.0, - "1700": 46260322304.0, - "1705": 46260322304.0, - "1710": 46260322304.0, - "1715": 46260322304.0, - "1720": 46260322304.0, - "1725": 46260322304.0, - "1730": 46260322304.0, - "1735": 46260322304.0, - "1740": 46260322304.0, - "1745": 46260322304.0, - "1750": 46260322304.0, - "1755": 46260322304.0, - "1760": 46260322304.0, - "1765": 46260322304.0, - "1770": 
46260322304.0, - "1775": 46260322304.0, - "1780": 46260322304.0, - "1785": 46260322304.0, - "1790": 46260322304.0, - "1795": 46260322304.0, - "1800": 46260322304.0, - "1805": 46260322304.0, - "1810": 46260322304.0, - "1815": 46260322304.0, - "1820": 46260322304.0, - "1825": 46260322304.0, - "1830": 46260322304.0, - "1835": 46260322304.0, - "1840": 46260322304.0, - "1845": 46260322304.0, - "1850": 46260322304.0, - "1855": 46260322304.0, - "1860": 46260322304.0, - "1865": 46260322304.0, - "1870": 46260322304.0, - "1875": 46260322304.0, - "1880": 46260322304.0, - "1885": 46260322304.0, - "1890": 46260322304.0, - "1895": 46260322304.0, - "1900": 46260322304.0, - "1905": 46260322304.0, - "1910": 46260322304.0, - "1915": 46260322304.0, - "1920": 46260322304.0, - "1925": 46260322304.0, - "1930": 46260322304.0, - "1935": 46260322304.0, - "1940": 46260322304.0, - "1945": 46260322304.0, - "1950": 46260322304.0, - "1955": 46260322304.0, - "1960": 46260322304.0, - "1965": 46260322304.0, - "1970": 46260322304.0, - "1975": 46261714944.0, - "1980": 46261714944.0, - "1985": 46261714944.0, - "1990": 46261714944.0, - "1995": 46261714944.0, - "2000": 46261714944.0, - "2005": 46261714944.0, - "2010": 46261714944.0, - "2015": 46261714944.0, - "2020": 46261714944.0, - "2025": 46261714944.0, - "2030": 46261714944.0, - "2035": 46261714944.0, - "2040": 46261714944.0, - "2045": 46261714944.0, - "2050": 46261714944.0, - "2055": 46261714944.0, - "2060": 46261714944.0, - "2065": 46261714944.0, - "2070": 46261714944.0, - "2075": 46261714944.0, - "2080": 46261714944.0, - "2085": 46261714944.0, - "2090": 46261714944.0, - "2095": 46261714944.0, - "2100": 46261714944.0, - "2105": 46261714944.0, - "2110": 46261714944.0, - "2115": 46261714944.0, - "2120": 46261714944.0, - "2125": 46261714944.0, - "2130": 46261714944.0, - "2135": 46261714944.0, - "2140": 46261714944.0, - "2145": 46261714944.0, - "2150": 46261714944.0, - "2155": 46261714944.0, - "2160": 46261714944.0, - "2165": 46261714944.0, - "2170": 
46261714944.0, - "2175": 46261714944.0, - "2180": 46261714944.0, - "2185": 46261714944.0, - "2190": 46261714944.0, - "2195": 46261714944.0, - "2200": 46261714944.0, - "2205": 46261714944.0, - "2210": 46261714944.0, - "2215": 46261714944.0, - "2220": 46261714944.0, - "2225": 46261714944.0, - "2230": 46261714944.0, - "2235": 46261714944.0, - "2240": 46261714944.0, - "2245": 46261714944.0, - "2250": 46261714944.0, - "2255": 46261714944.0, - "2260": 46261714944.0, - "2265": 46261714944.0, - "2270": 46261714944.0, - "2275": 46261714944.0, - "2280": 46261714944.0, - "2285": 46261714944.0, - "2290": 46261714944.0, - "2295": 46261714944.0, - "2300": 46261714944.0, - "2305": 46261714944.0, - "2310": 46261714944.0, - "2315": 46261714944.0, - "2320": 46261714944.0, - "2325": 46261714944.0, - "2330": 46261714944.0, - "2335": 46261714944.0, - "2340": 46261714944.0, - "2345": 46261714944.0, - "2350": 46261714944.0, - "2355": 46261714944.0, - "2360": 46261714944.0, - "2365": 46261714944.0, - "2370": 46261714944.0, - "2375": 46261714944.0, - "2380": 46261714944.0, - "2385": 46261714944.0, - "2390": 46261714944.0, - "2395": 46261714944.0, - "2400": 46261714944.0, - "2405": 46261714944.0, - "2410": 46261714944.0, - "2415": 46261714944.0, - "2420": 46261714944.0, - "2425": 46261714944.0, - "2430": 46261714944.0, - "2435": 46261714944.0, - "2440": 46261714944.0, - "2445": 46261714944.0, - "2450": 46261714944.0, - "2455": 46261714944.0, - "2460": 46261714944.0, - "2465": 46261714944.0, - "2470": 46261714944.0, - "2475": 46261714944.0, - "2480": 46261714944.0, - "2485": 46261714944.0, - "2490": 46261714944.0, - "2495": 46261714944.0, - "2500": 46261714944.0, - "2505": 46261714944.0, - "2510": 46261714944.0, - "2515": 46261714944.0, - "2520": 46261714944.0, - "2525": 46261714944.0, - "2530": 46261714944.0, - "2535": 46261714944.0, - "2540": 46261714944.0, - "2545": 46261714944.0, - "2550": 46261714944.0, - "2555": 46261714944.0, - "2560": 46261714944.0, - "2565": 46261714944.0, - "2570": 
46261714944.0, - "2575": 46261714944.0, - "2580": 46261714944.0, - "2585": 46261714944.0, - "2590": 46261714944.0, - "2595": 46261714944.0, - "2600": 46261714944.0, - "2605": 46261714944.0, - "2610": 46261714944.0, - "2615": 46261714944.0, - "2620": 46261714944.0, - "2625": 46261714944.0, - "2630": 46261714944.0, - "2635": 46261714944.0, - "2640": 46261714944.0, - "2645": 46261714944.0, - "2650": 46261714944.0, - "2655": 46261714944.0, - "2660": 46261714944.0, - "2665": 46261714944.0, - "2670": 46261714944.0, - "2675": 46261714944.0, - "2680": 46261714944.0, - "2685": 46261714944.0, - "2690": 46261714944.0, - "2695": 46261714944.0, - "2700": 46261714944.0, - "2705": 46261714944.0, - "2710": 46261714944.0, - "2715": 46261714944.0, - "2720": 46261714944.0, - "2725": 46261714944.0, - "2730": 46261714944.0, - "2735": 46261714944.0, - "2740": 46261714944.0, - "2745": 46261714944.0, - "2750": 46261714944.0, - "2755": 46261714944.0, - "2760": 46261714944.0, - "2765": 46261714944.0, - "2770": 46261714944.0, - "2775": 46261714944.0, - "2780": 46261714944.0, - "2785": 46261714944.0, - "2790": 46261714944.0, - "2795": 46261714944.0, - "2800": 46261714944.0, - "2805": 46261714944.0, - "2810": 46261714944.0, - "2815": 46261714944.0, - "2820": 46261714944.0, - "2825": 46261714944.0, - "2830": 46261714944.0, - "2835": 46261714944.0, - "2840": 46261714944.0, - "2845": 46261714944.0, - "2850": 46261714944.0, - "2855": 46261714944.0, - "2860": 46261714944.0, - "2865": 46261714944.0, - "2870": 46261714944.0, - "2875": 46261714944.0, - "2880": 46261714944.0, - "2885": 46261714944.0, - "2890": 46261714944.0, - "2895": 46261714944.0, - "2900": 46261714944.0, - "2905": 46261714944.0, - "2910": 46261714944.0, - "2915": 46261714944.0, - "2920": 46261714944.0, - "2925": 46261714944.0, - "2930": 46261714944.0, - "2935": 46261714944.0, - "2940": 46261714944.0, - "2945": 46261714944.0, - "2950": 46261714944.0, - "2955": 46261714944.0, - "2960": 46261714944.0, - "2965": 46261714944.0, - "2970": 
46261714944.0, - "2975": 46261714944.0, - "2980": 46261714944.0, - "2985": 45706711040.0, - "2990": 45883699200.0, - "2995": 46072287232.0, - "3000": 46072287232.0, - "3005": 46072287232.0, - "3010": 46072287232.0, - "3015": 46072287232.0, - "3020": 46072287232.0, - "3025": 46072287232.0, - "3030": 46072287232.0, - "3035": 46072287232.0, - "3040": 46072287232.0, - "3045": 46072287232.0, - "3050": 46072287232.0, - "3055": 46072287232.0, - "3060": 46072287232.0, - "3065": 46072287232.0, - "3070": 46072287232.0, - "3075": 46072287232.0, - "3080": 46072287232.0, - "3085": 46072287232.0, - "3090": 46072287232.0, - "3095": 46072287232.0, - "3100": 46072287232.0, - "3105": 46072287232.0, - "3110": 46072287232.0, - "3115": 46072287232.0, - "3120": 46072287232.0, - "3125": 46072287232.0, - "3130": 46072287232.0, - "3135": 46072287232.0, - "3140": 46072287232.0, - "3145": 46072287232.0, - "3150": 46072287232.0, - "3155": 46072287232.0, - "3160": 46072287232.0, - "3165": 46072287232.0, - "3170": 46072287232.0, - "3175": 46072287232.0, - "3180": 46072287232.0, - "3185": 46072287232.0, - "3190": 46072287232.0, - "3195": 46072287232.0, - "3200": 46072287232.0, - "3205": 46072287232.0, - "3210": 46072287232.0, - "3215": 46072287232.0, - "3220": 46072287232.0, - "3225": 46072287232.0, - "3230": 46072287232.0, - "3235": 46072287232.0, - "3240": 46072287232.0, - "3245": 46072287232.0, - "3250": 46072287232.0, - "3255": 46072287232.0, - "3260": 46072287232.0, - "3265": 46072287232.0, - "3270": 46072287232.0, - "3275": 46072287232.0, - "3280": 46072287232.0, - "3285": 46072287232.0, - "3290": 46072287232.0, - "3295": 46072287232.0, - "3300": 46072287232.0, - "3305": 46072287232.0, - "3310": 46072287232.0, - "3315": 46072287232.0, - "3320": 46072287232.0, - "3325": 46072287232.0, - "3330": 46072287232.0, - "3335": 46072287232.0, - "3340": 46072287232.0, - "3345": 46072287232.0, - "3350": 46072287232.0, - "3355": 46072287232.0, - "3360": 46072287232.0, - "3365": 46072287232.0, - "3370": 
46072287232.0, - "3375": 46072287232.0, - "3380": 46072287232.0, - "3385": 46072287232.0, - "3390": 46072287232.0, - "3395": 46072287232.0, - "3400": 46072287232.0, - "3405": 46072287232.0, - "3410": 46072287232.0, - "3415": 46072287232.0, - "3420": 46072287232.0, - "3425": 46072672256.0, - "3430": 46072672256.0, - "3435": 46072672256.0, - "3440": 46072672256.0, - "3445": 46072672256.0, - "3450": 46072672256.0, - "3455": 46072672256.0, - "3460": 46072672256.0, - "3465": 46072672256.0, - "3470": 46072672256.0, - "3475": 46072672256.0, - "3480": 46072672256.0, - "3485": 46095564800.0, - "3490": 46095564800.0, - "3495": 46095564800.0, - "3500": 46095564800.0, - "3505": 46095564800.0, - "3510": 46095564800.0, - "3515": 46095564800.0, - "3520": 46095564800.0, - "3525": 46095564800.0, - "3530": 46095564800.0, - "3535": 46095564800.0, - "3540": 46095564800.0, - "3545": 46095564800.0, - "3550": 46191697920.0, - "3555": 46191697920.0, - "3560": 46191697920.0, - "3565": 46191697920.0, - "3570": 46191697920.0, - "3575": 46191697920.0, - "3580": 46191697920.0, - "3585": 46191697920.0, - "3590": 46191697920.0, - "3595": 46191697920.0, - "3600": 46191697920.0, - "3605": 46191697920.0, - "3610": 46191697920.0, - "3615": 46191697920.0, - "3620": 46191697920.0, - "3625": 46191697920.0, - "3630": 46191697920.0, - "3635": 46191697920.0, - "3640": 46191697920.0, - "3645": 46191697920.0, - "3650": 46191697920.0, - "3655": 46191697920.0, - "3660": 46191697920.0, - "3665": 46191697920.0, - "3670": 46191697920.0, - "3675": 46191697920.0, - "3680": 46191697920.0, - "3685": 46191697920.0, - "3690": 46191697920.0, - "3695": 46191697920.0, - "3700": 46191697920.0, - "3705": 46191697920.0, - "3710": 46191697920.0, - "3715": 46191697920.0, - "3720": 46191697920.0, - "3725": 46191697920.0, - "3730": 46191697920.0, - "3735": 46191697920.0, - "3740": 46191697920.0, - "3745": 46191697920.0, - "3750": 46191697920.0, - "3755": 46191697920.0, - "3760": 46191697920.0, - "3765": 46191697920.0, - "3770": 
46191697920.0, - "3775": 46191697920.0, - "3780": 46191697920.0, - "3785": 46191697920.0, - "3790": 46191697920.0, - "3795": 46191697920.0, - "3800": 46191697920.0, - "3805": 46191697920.0, - "3810": 46191697920.0, - "3815": 46191697920.0, - "3820": 46191697920.0, - "3825": 46191697920.0, - "3830": 46191697920.0, - "3835": 46191697920.0, - "3840": 46191697920.0, - "3845": 46191697920.0, - "3850": 46191697920.0, - "3855": 46191697920.0, - "3860": 46191697920.0, - "3865": 46191697920.0, - "3870": 46191697920.0, - "3875": 46191697920.0, - "3880": 46191697920.0, - "3885": 46191697920.0, - "3890": 46191697920.0, - "3895": 46191697920.0, - "3900": 46191697920.0, - "3905": 46191697920.0, - "3910": 46191697920.0, - "3915": 46191697920.0, - "3920": 46191697920.0, - "3925": 46191697920.0, - "3930": 46191697920.0, - "3935": 46191697920.0, - "3940": 46191697920.0, - "3945": 46191697920.0, - "3950": 46191697920.0, - "3955": 46191697920.0, - "3960": 46191697920.0, - "3965": 46191697920.0, - "3970": 46191697920.0, - "3975": 46191697920.0, - "3980": 46191697920.0, - "3985": 46191697920.0, - "3990": 46191697920.0, - "3995": 46191697920.0, - "4000": 45840449536.0, - "4005": 45869191168.0, - "4010": 45897973760.0, - "4015": 45897973760.0, - "4020": 45940301824.0, - "4025": 45940301824.0, - "4030": 45940301824.0, - "4035": 45940301824.0, - "4040": 45940301824.0, - "4045": 45940301824.0, - "4050": 45940301824.0, - "4055": 45940301824.0, - "4060": 45940301824.0, - "4065": 45940301824.0, - "4070": 45940301824.0, - "4075": 45940301824.0, - "4080": 45940301824.0, - "4085": 46009651200.0, - "4090": 46009651200.0, - "4095": 46009651200.0, - "4100": 46009651200.0, - "4105": 46009651200.0, - "4110": 46009651200.0, - "4115": 46009651200.0, - "4120": 46009651200.0, - "4125": 46009651200.0, - "4130": 46009651200.0, - "4135": 46009651200.0, - "4140": 46009651200.0, - "4145": 46009651200.0, - "4150": 46009651200.0, - "4155": 46009651200.0, - "4160": 46009651200.0, - "4165": 46009651200.0, - "4170": 
46009651200.0, - "4175": 46009651200.0, - "4180": 46009651200.0, - "4185": 46009651200.0, - "4190": 46009651200.0, - "4195": 46009651200.0, - "4200": 46009651200.0, - "4205": 46009651200.0, - "4210": 46009651200.0, - "4215": 46009651200.0, - "4220": 46009651200.0, - "4225": 46064635904.0, - "4230": 46064635904.0, - "4235": 46064635904.0, - "4240": 46064635904.0, - "4245": 46064635904.0, - "4250": 46064635904.0, - "4255": 46064635904.0, - "4260": 46064635904.0, - "4265": 46064635904.0, - "4270": 46064635904.0, - "4275": 46064635904.0, - "4280": 46064635904.0, - "4285": 46064635904.0, - "4290": 46064635904.0, - "4295": 46064635904.0, - "4300": 46064635904.0, - "4305": 46064635904.0, - "4310": 46064635904.0, - "4315": 46064635904.0, - "4320": 46064635904.0, - "4325": 46064635904.0, - "4330": 46064635904.0, - "4335": 46064635904.0, - "4340": 46064635904.0, - "4345": 46064635904.0, - "4350": 46064635904.0, - "4355": 46064635904.0, - "4360": 46064635904.0, - "4365": 46064635904.0, - "4370": 46064635904.0, - "4375": 46064635904.0, - "4380": 46064635904.0, - "4385": 46064635904.0, - "4390": 46064635904.0, - "4395": 46064635904.0, - "4400": 46064635904.0, - "4405": 46064635904.0, - "4410": 46064635904.0, - "4415": 46064635904.0, - "4420": 46064635904.0, - "4425": 46064635904.0, - "4430": 46064635904.0, - "4435": 46064635904.0, - "4440": 46064635904.0, - "4445": 46064635904.0, - "4450": 46064635904.0, - "4455": 46064635904.0, - "4460": 46080573440.0, - "4465": 46080573440.0, - "4470": 46080573440.0, - "4475": 46080573440.0, - "4480": 46080573440.0, - "4485": 46080573440.0, - "4490": 46080573440.0, - "4495": 46080573440.0, - "4500": 46080573440.0, - "4505": 46080573440.0, - "4510": 46080573440.0, - "4515": 46080573440.0, - "4520": 46080573440.0, - "4525": 46080573440.0, - "4530": 46080573440.0, - "4535": 46080573440.0, - "4540": 46080573440.0, - "4545": 46080573440.0, - "4550": 46080573440.0, - "4555": 46080573440.0, - "4560": 46080573440.0, - "4565": 46080573440.0, - "4570": 
46080573440.0, - "4575": 46080573440.0, - "4580": 46080573440.0, - "4585": 46080573440.0, - "4590": 46080573440.0, - "4595": 46080573440.0, - "4600": 46080573440.0, - "4605": 46080573440.0, - "4610": 46080573440.0, - "4615": 46343888896.0, - "4620": 46343888896.0, - "4625": 46343888896.0, - "4630": 46343888896.0, - "4635": 46343888896.0, - "4640": 46343888896.0, - "4645": 46343888896.0, - "4650": 46343888896.0, - "4655": 46343888896.0, - "4660": 46343888896.0, - "4665": 46343888896.0, - "4670": 46343888896.0, - "4675": 46343888896.0, - "4680": 46343888896.0, - "4685": 46343888896.0, - "4690": 46343888896.0, - "4695": 46343888896.0, - "4700": 46343888896.0, - "4705": 46343888896.0, - "4710": 46343888896.0, - "4715": 46343888896.0, - "4720": 46343888896.0, - "4725": 46343888896.0, - "4730": 46343888896.0, - "4735": 46343888896.0, - "4740": 46343888896.0, - "4745": 46343888896.0, - "4750": 46343888896.0, - "4755": 46343888896.0, - "4760": 46343888896.0, - "4765": 46343888896.0, - "4770": 46343888896.0, - "4775": 46343888896.0, - "4780": 46343888896.0, - "4785": 46343888896.0, - "4790": 46343888896.0, - "4795": 46343888896.0, - "4800": 46343888896.0, - "4805": 46343888896.0, - "4810": 46343888896.0, - "4815": 46343888896.0, - "4820": 46343888896.0, - "4825": 46343888896.0, - "4830": 46343888896.0, - "4835": 46343888896.0, - "4840": 46343888896.0, - "4845": 46343888896.0, - "4850": 46343888896.0, - "4855": 46343888896.0, - "4860": 46343888896.0, - "4865": 46343888896.0, - "4870": 46343888896.0, - "4875": 46343888896.0, - "4880": 46343888896.0, - "4885": 46343888896.0, - "4890": 46343888896.0, - "4895": 46343888896.0, - "4900": 46343888896.0, - "4905": 46343888896.0, - "4910": 46343888896.0, - "4915": 46343888896.0, - "4920": 46343888896.0, - "4925": 46343888896.0, - "4930": 46343888896.0, - "4935": 46343888896.0, - "4940": 46343888896.0, - "4945": 46343888896.0, - "4950": 46343888896.0, - "4955": 46343888896.0, - "4960": 46343888896.0, - "4965": 46343888896.0, - "4970": 
46343888896.0, - "4975": 46343888896.0, - "4980": 46343888896.0, - "4985": 46343888896.0, - "4990": 46343888896.0, - "4995": 46343888896.0, - "5000": 46343888896.0, - "5005": 46199529472.0, - "5010": 46199529472.0, - "5015": 45764182016.0, - "5020": 45878784000.0, - "5025": 45878784000.0, - "5030": 45878784000.0, - "5035": 45878784000.0, - "5040": 45992685568.0, - "5045": 45992685568.0, - "5050": 45992685568.0, - "5055": 45992685568.0, - "5060": 45992685568.0, - "5065": 45992685568.0, - "5070": 45992685568.0, - "5075": 45992685568.0, - "5080": 45992685568.0, - "5085": 45992685568.0, - "5090": 45992685568.0, - "5095": 46014451712.0, - "5100": 46014451712.0, - "5105": 46014451712.0, - "5110": 46014451712.0, - "5115": 46014451712.0, - "5120": 46014451712.0, - "5125": 46014451712.0, - "5130": 46014451712.0, - "5135": 46014451712.0, - "5140": 46014451712.0, - "5145": 46014451712.0, - "5150": 46014451712.0, - "5155": 46014451712.0, - "5160": 46014451712.0, - "5165": 46014451712.0, - "5170": 46014451712.0, - "5175": 46014451712.0, - "5180": 46014451712.0, - "5185": 46014451712.0, - "5190": 46014451712.0, - "5195": 46014451712.0, - "5200": 46139572224.0, - "5205": 46139572224.0, - "5210": 46139572224.0, - "5215": 46139572224.0, - "5220": 46168403968.0, - "5225": 46168403968.0, - "5230": 46168403968.0, - "5235": 46168403968.0, - "5240": 46168403968.0, - "5245": 46168403968.0, - "5250": 46168403968.0, - "5255": 46168403968.0, - "5260": 46168403968.0, - "5265": 46168403968.0, - "5270": 46168403968.0, - "5275": 46168403968.0, - "5280": 46168403968.0, - "5285": 46168403968.0, - "5290": 46168403968.0, - "5295": 46168403968.0, - "5300": 46168403968.0, - "5305": 46168403968.0, - "5310": 46168403968.0, - "5315": 46168403968.0, - "5320": 46168403968.0, - "5325": 46168403968.0, - "5330": 46168403968.0, - "5335": 46168403968.0, - "5340": 46168403968.0, - "5345": 46168403968.0, - "5350": 46168403968.0, - "5355": 46168403968.0, - "5360": 46168403968.0, - "5365": 46168403968.0, - "5370": 
46168403968.0, - "5375": 46168403968.0, - "5380": 46168403968.0, - "5385": 46168403968.0, - "5390": 46168403968.0, - "5395": 46168403968.0, - "5400": 46168403968.0, - "5405": 46168403968.0, - "5410": 46168403968.0, - "5415": 46168403968.0, - "5420": 46168403968.0, - "5425": 46168403968.0, - "5430": 46168403968.0, - "5435": 46168403968.0, - "5440": 46168403968.0, - "5445": 46168403968.0, - "5450": 46168403968.0, - "5455": 46168403968.0, - "5460": 46168403968.0, - "5465": 46168403968.0, - "5470": 46168403968.0, - "5475": 46168403968.0, - "5480": 46168403968.0, - "5485": 46168403968.0, - "5490": 46168403968.0, - "5495": 46168403968.0, - "5500": 46168403968.0, - "5505": 46168403968.0, - "5510": 46168403968.0, - "5515": 46168403968.0, - "5520": 46168403968.0, - "5525": 46168403968.0, - "5530": 46168403968.0, - "5535": 46168403968.0, - "5540": 46168403968.0, - "5545": 46168403968.0, - "5550": 46168403968.0, - "5555": 46168403968.0, - "5560": 46168403968.0, - "5565": 46168403968.0, - "5570": 46168403968.0, - "5575": 46168403968.0, - "5580": 46168403968.0, - "5585": 46168403968.0, - "5590": 46168403968.0, - "5595": 46168403968.0, - "5600": 46168403968.0, - "5605": 46226247680.0, - "5610": 46226247680.0, - "5615": 46226247680.0, - "5620": 46226247680.0, - "5625": 46226247680.0, - "5630": 46226247680.0, - "5635": 46226247680.0, - "5640": 46226247680.0, - "5645": 46226247680.0, - "5650": 46226247680.0, - "5655": 46226247680.0, - "5660": 46226247680.0, - "5665": 46226247680.0, - "5670": 46226247680.0, - "5675": 46226247680.0, - "5680": 46226247680.0, - "5685": 46226247680.0, - "5690": 46226247680.0, - "5695": 46226247680.0, - "5700": 46226247680.0, - "5705": 46226247680.0, - "5710": 46226247680.0, - "5715": 46226247680.0, - "5720": 46226247680.0, - "5725": 46226247680.0, - "5730": 46226247680.0, - "5735": 46226247680.0, - "5740": 46226247680.0, - "5745": 46226247680.0, - "5750": 46226247680.0, - "5755": 46226247680.0, - "5760": 46226247680.0, - "5765": 46226247680.0, - "5770": 
46226247680.0, - "5775": 46226247680.0, - "5780": 46226247680.0, - "5785": 46226247680.0, - "5790": 46226247680.0, - "5795": 46226247680.0, - "5800": 46226247680.0, - "5805": 46226247680.0, - "5810": 46226247680.0, - "5815": 46226247680.0, - "5820": 46226247680.0, - "5825": 46226247680.0, - "5830": 46226247680.0, - "5835": 46226247680.0, - "5840": 46226247680.0, - "5845": 46226247680.0, - "5850": 46226247680.0, - "5855": 46226247680.0, - "5860": 46226247680.0, - "5865": 46226247680.0, - "5870": 46226247680.0, - "5875": 46226247680.0, - "5880": 46226247680.0, - "5885": 46226247680.0, - "5890": 46226247680.0, - "5895": 46226247680.0, - "5900": 46226247680.0, - "5905": 46226247680.0, - "5910": 46226247680.0, - "5915": 46226247680.0, - "5920": 46226247680.0, - "5925": 46226247680.0, - "5930": 46226247680.0, - "5935": 46226247680.0, - "5940": 46226247680.0, - "5945": 46226247680.0, - "5950": 46226247680.0, - "5955": 46226247680.0, - "5960": 46226247680.0, - "5965": 46226247680.0, - "5970": 46226247680.0, - "5975": 46226247680.0, - "5980": 46226247680.0, - "5985": 46226247680.0, - "5990": 46226247680.0, - "5995": 46226247680.0, - "6000": 46226247680.0, - "6005": 46226247680.0, - "6010": 46226247680.0, - "6015": 46226247680.0, - "6020": 46226247680.0, - "6025": 46226247680.0, - "6030": 45912186880.0, - "6035": 45912186880.0, - "6040": 45995683840.0, - "6045": 45995683840.0, - "6050": 45995683840.0, - "6055": 45995683840.0, - "6060": 45995683840.0, - "6065": 45995683840.0, - "6070": 45995683840.0, - "6075": 46014836736.0, - "6080": 46014836736.0, - "6085": 46014836736.0, - "6090": 46014836736.0, - "6095": 46014836736.0, - "6100": 46014836736.0, - "6105": 46014836736.0, - "6110": 46014836736.0, - "6115": 46014836736.0, - "6120": 46014836736.0, - "6125": 46014836736.0, - "6130": 46014836736.0, - "6135": 46014836736.0, - "6140": 46014836736.0, - "6145": 46014836736.0, - "6150": 46014836736.0, - "6155": 46014836736.0, - "6160": 46014836736.0, - "6165": 46025334784.0, - "6170": 
46025334784.0, - "6175": 46025334784.0, - "6180": 46025334784.0, - "6185": 46035255296.0, - "6190": 46035255296.0, - "6195": 46035255296.0, - "6200": 46035255296.0, - "6205": 46035255296.0, - "6210": 46035255296.0, - "6215": 46035255296.0, - "6220": 46035255296.0, - "6225": 46035255296.0, - "6230": 46035255296.0, - "6235": 46035255296.0, - "6240": 46035255296.0, - "6245": 46035255296.0, - "6250": 46035255296.0, - "6255": 46035255296.0, - "6260": 46035255296.0, - "6265": 46035255296.0, - "6270": 46035255296.0, - "6275": 46035255296.0, - "6280": 46035255296.0, - "6285": 46035255296.0, - "6290": 46035255296.0, - "6295": 46035255296.0, - "6300": 46035255296.0, - "6305": 46035255296.0, - "6310": 46035255296.0, - "6315": 46035255296.0, - "6320": 46035255296.0, - "6325": 46035255296.0, - "6330": 46035255296.0, - "6335": 46035255296.0, - "6340": 46035255296.0, - "6345": 46035255296.0, - "6350": 46035255296.0, - "6355": 46035255296.0, - "6360": 46035255296.0, - "6365": 46035255296.0, - "6370": 46035255296.0, - "6375": 46035255296.0, - "6380": 46035255296.0, - "6385": 46035255296.0, - "6390": 46035255296.0, - "6395": 46035255296.0, - "6400": 46035255296.0, - "6405": 46035255296.0, - "6410": 46035255296.0, - "6415": 46035255296.0, - "6420": 46035255296.0, - "6425": 46035255296.0, - "6430": 46035255296.0, - "6435": 46035255296.0, - "6440": 46035255296.0, - "6445": 46035255296.0, - "6450": 46035255296.0, - "6455": 46035255296.0, - "6460": 46035255296.0, - "6465": 46035255296.0, - "6470": 46035255296.0, - "6475": 46035255296.0, - "6480": 46035255296.0, - "6485": 46035255296.0, - "6490": 46035255296.0, - "6495": 46035255296.0, - "6500": 46035255296.0, - "6505": 46064041984.0, - "6510": 46064041984.0, - "6515": 46064041984.0, - "6520": 46064041984.0, - "6525": 46064041984.0, - "6530": 46064041984.0, - "6535": 46064041984.0, - "6540": 46064041984.0, - "6545": 46064041984.0, - "6550": 46064041984.0, - "6555": 46064041984.0, - "6560": 46064041984.0, - "6565": 46064041984.0, - "6570": 
46064041984.0, - "6575": 46064041984.0, - "6580": 46064041984.0, - "6585": 46064041984.0, - "6590": 46064041984.0, - "6595": 46064041984.0, - "6600": 46064041984.0, - "6605": 46064041984.0, - "6610": 46064041984.0, - "6615": 46064041984.0, - "6620": 46064041984.0, - "6625": 46064041984.0, - "6630": 46064041984.0, - "6635": 46064041984.0, - "6640": 46064041984.0, - "6645": 46064041984.0, - "6650": 46064041984.0, - "6655": 46064041984.0, - "6660": 46064041984.0, - "6665": 46064041984.0, - "6670": 46064041984.0, - "6675": 46064041984.0, - "6680": 46064041984.0, - "6685": 46064041984.0, - "6690": 46064041984.0, - "6695": 46064041984.0, - "6700": 46064041984.0, - "6705": 46064041984.0, - "6710": 46064041984.0, - "6715": 46064041984.0, - "6720": 46064041984.0, - "6725": 46064041984.0, - "6730": 46064041984.0, - "6735": 46064041984.0, - "6740": 46064041984.0, - "6745": 46064041984.0, - "6750": 46064041984.0, - "6755": 46064041984.0, - "6760": 46064041984.0, - "6765": 46064041984.0, - "6770": 46064041984.0, - "6775": 46064041984.0, - "6780": 46064041984.0, - "6785": 46064041984.0, - "6790": 46064041984.0, - "6795": 46064041984.0, - "6800": 46064041984.0, - "6805": 46064041984.0, - "6810": 46064041984.0, - "6815": 46064041984.0, - "6820": 46064041984.0, - "6825": 46064041984.0, - "6830": 46064041984.0, - "6835": 46064041984.0, - "6840": 46064041984.0, - "6845": 46064041984.0, - "6850": 46064041984.0, - "6855": 46064041984.0, - "6860": 46064041984.0, - "6865": 46064041984.0, - "6870": 46064041984.0, - "6875": 46064041984.0, - "6880": 46064041984.0, - "6885": 46064041984.0, - "6890": 46064041984.0, - "6895": 46064041984.0, - "6900": 46064041984.0, - "6905": 46064041984.0, - "6910": 46064041984.0, - "6915": 46064041984.0, - "6920": 46064041984.0, - "6925": 46064041984.0, - "6930": 46064041984.0, - "6935": 46064041984.0, - "6940": 46064041984.0, - "6945": 46064041984.0, - "6950": 46064041984.0, - "6955": 46064041984.0, - "6960": 46064041984.0, - "6965": 46064041984.0, - "6970": 
46064041984.0, - "6975": 46064041984.0, - "6980": 46064041984.0, - "6985": 46064041984.0, - "6990": 46064041984.0, - "6995": 46064041984.0, - "7000": 46064041984.0, - "7005": 46064041984.0, - "7010": 46064041984.0, - "7015": 46064041984.0, - "7020": 46064041984.0, - "7025": 46064041984.0, - "7030": 46108979200.0, - "7035": 46108979200.0, - "7040": 46108979200.0, - "7045": 46108979200.0, - "7050": 46065532928.0, - "7055": 46065532928.0, - "7060": 46065532928.0, - "7065": 46065532928.0, - "7070": 46065532928.0, - "7075": 46065532928.0, - "7080": 46065532928.0, - "7085": 46065532928.0, - "7090": 46065532928.0, - "7095": 46065532928.0, - "7100": 46065532928.0, - "7105": 46065532928.0, - "7110": 46065532928.0, - "7115": 46065532928.0, - "7120": 46065532928.0, - "7125": 46065532928.0, - "7130": 46065532928.0, - "7135": 46065532928.0, - "7140": 46065532928.0, - "7145": 46065532928.0, - "7150": 46065532928.0, - "7155": 46065532928.0, - "7160": 46065532928.0, - "7165": 46065532928.0, - "7170": 46065532928.0, - "7175": 46065532928.0, - "7180": 46065532928.0, - "7185": 46065532928.0, - "7190": 46065532928.0, - "7195": 46065532928.0, - "7200": 46065532928.0, - "7205": 46065532928.0, - "7210": 46065532928.0, - "7215": 46065532928.0, - "7220": 46065532928.0, - "7225": 46065532928.0, - "7230": 46065532928.0, - "7235": 46065532928.0, - "7240": 46065532928.0, - "7245": 46065532928.0, - "7250": 46065532928.0, - "7255": 46065532928.0, - "7260": 46065532928.0, - "7265": 46065532928.0, - "7270": 46065532928.0, - "7275": 46065532928.0, - "7280": 46065532928.0, - "7285": 46065532928.0, - "7290": 46065532928.0, - "7295": 46065532928.0, - "7300": 46065532928.0, - "7305": 46065532928.0, - "7310": 46065532928.0, - "7315": 46065532928.0, - "7320": 46065532928.0, - "7325": 46065532928.0, - "7330": 46065532928.0, - "7335": 46065532928.0, - "7340": 46065532928.0, - "7345": 46065532928.0, - "7350": 46065532928.0, - "7355": 46065532928.0, - "7360": 46065532928.0, - "7365": 46065532928.0, - "7370": 
46065532928.0, - "7375": 46065532928.0, - "7380": 46065532928.0, - "7385": 46065532928.0, - "7390": 46065532928.0, - "7395": 46065532928.0, - "7400": 46065532928.0, - "7405": 46065532928.0, - "7410": 46065532928.0, - "7415": 46065532928.0, - "7420": 46065532928.0, - "7425": 46065532928.0, - "7430": 46065532928.0, - "7435": 46065532928.0, - "7440": 46065532928.0, - "7445": 46065532928.0, - "7450": 46065532928.0, - "7455": 46065532928.0, - "7460": 46065532928.0, - "7465": 46065532928.0, - "7470": 46065532928.0, - "7475": 46065532928.0, - "7480": 46065532928.0, - "7485": 46065532928.0, - "7490": 46065532928.0, - "7495": 46065532928.0, - "7500": 46065532928.0, - "7505": 46065532928.0, - "7510": 46065532928.0, - "7515": 46065532928.0, - "7520": 45618061312.0, - "7525": 45747933184.0, - "7530": 45825024000.0, - "7535": 45825024000.0, - "7540": 45825024000.0, - "7545": 45910597632.0, - "7550": 45910597632.0, - "7555": 45910597632.0, - "7560": 45910597632.0, - "7565": 45910597632.0, - "7570": 45910597632.0, - "7575": 45910597632.0, - "7580": 45910597632.0, - "7585": 45910597632.0, - "7590": 45910597632.0, - "7595": 45916950528.0, - "7600": 45924253696.0, - "7605": 45924253696.0, - "7610": 45924253696.0, - "7615": 45924253696.0, - "7620": 45924253696.0, - "7625": 45924253696.0, - "7630": 45924253696.0, - "7635": 45924253696.0, - "7640": 45924253696.0, - "7645": 45944950784.0, - "7650": 45944950784.0, - "7655": 45944950784.0, - "7660": 45944950784.0, - "7665": 45944950784.0, - "7670": 45944950784.0, - "7675": 45944950784.0, - "7680": 45944950784.0, - "7685": 45944950784.0, - "7690": 45944950784.0, - "7695": 45944950784.0, - "7700": 45944950784.0, - "7705": 45944950784.0, - "7710": 45944950784.0, - "7715": 45944950784.0, - "7720": 45944950784.0, - "7725": 45944950784.0, - "7730": 45944950784.0, - "7735": 45944950784.0, - "7740": 45944950784.0, - "7745": 45944950784.0, - "7750": 45944950784.0, - "7755": 45944950784.0, - "7760": 45944950784.0, - "7765": 45944950784.0, - "7770": 
45944950784.0, - "7775": 45944950784.0, - "7780": 45944950784.0, - "7785": 45944950784.0, - "7790": 45944950784.0, - "7795": 45944950784.0, - "7800": 45944950784.0, - "7805": 45944950784.0, - "7810": 45944950784.0, - "7815": 45944950784.0, - "7820": 45944950784.0, - "7825": 45944950784.0, - "7830": 45944950784.0, - "7835": 45944950784.0, - "7840": 45973135360.0, - "7845": 45973135360.0, - "7850": 46089904128.0, - "7855": 46089904128.0, - "7860": 46089904128.0, - "7865": 46089904128.0, - "7870": 46089904128.0, - "7875": 46089904128.0, - "7880": 46089904128.0, - "7885": 46089904128.0, - "7890": 46089904128.0, - "7895": 46089904128.0, - "7900": 46089904128.0, - "7905": 46089904128.0, - "7910": 46089904128.0, - "7915": 46089904128.0, - "7920": 46089904128.0, - "7925": 46089904128.0, - "7930": 46089904128.0, - "7935": 46089904128.0, - "7940": 46089904128.0, - "7945": 46089904128.0, - "7950": 46089904128.0, - "7955": 46089904128.0, - "7960": 46089904128.0, - "7965": 46089904128.0, - "7970": 46089904128.0, - "7975": 46089904128.0, - "7980": 46089904128.0, - "7985": 46089904128.0, - "7990": 46089904128.0, - "7995": 46089904128.0, - "8000": 46089904128.0, - "8005": 46089904128.0, - "8010": 46089904128.0, - "8015": 46089904128.0, - "8020": 46089904128.0, - "8025": 46089904128.0, - "8030": 46089904128.0, - "8035": 46089904128.0, - "8040": 46089904128.0, - "8045": 46089904128.0, - "8050": 46089904128.0, - "8055": 46089904128.0, - "8060": 46089904128.0, - "8065": 46089904128.0, - "8070": 46089904128.0, - "8075": 46089904128.0, - "8080": 46089904128.0, - "8085": 46089904128.0, - "8090": 46089904128.0, - "8095": 46089904128.0, - "8100": 46089904128.0, - "8105": 46089904128.0, - "8110": 46089904128.0, - "8115": 46089904128.0, - "8120": 46089904128.0, - "8125": 46089904128.0, - "8130": 46089904128.0, - "8135": 46089904128.0, - "8140": 46089904128.0, - "8145": 46089904128.0, - "8150": 46089904128.0, - "8155": 46089904128.0, - "8160": 46089904128.0, - "8165": 46089904128.0, - "8170": 
46089904128.0, - "8175": 46089904128.0, - "8180": 46089904128.0, - "8185": 46089904128.0, - "8190": 46089904128.0, - "8195": 46089904128.0, - "8200": 46089904128.0, - "8205": 46089904128.0, - "8210": 46089904128.0, - "8215": 46089904128.0, - "8220": 46089904128.0, - "8225": 46089904128.0, - "8230": 46089904128.0, - "8235": 46089904128.0, - "8240": 46089904128.0, - "8245": 46089904128.0, - "8250": 46089904128.0, - "8255": 46089904128.0, - "8260": 46089904128.0, - "8265": 46089904128.0, - "8270": 46089904128.0, - "8275": 46089904128.0, - "8280": 46089904128.0, - "8285": 46089904128.0, - "8290": 46089904128.0, - "8295": 46089904128.0, - "8300": 46089904128.0, - "8305": 46089904128.0, - "8310": 46089904128.0, - "8315": 46089904128.0, - "8320": 46089904128.0, - "8325": 46089904128.0, - "8330": 46089904128.0, - "8335": 46089904128.0, - "8340": 46089904128.0, - "8345": 46089904128.0, - "8350": 46089904128.0, - "8355": 46089904128.0, - "8360": 46089904128.0, - "8365": 46089904128.0, - "8370": 46089904128.0, - "8375": 46089904128.0, - "8380": 46089904128.0, - "8385": 46089904128.0, - "8390": 46089904128.0, - "8395": 46089904128.0, - "8400": 46089904128.0, - "8405": 46089904128.0, - "8410": 46089904128.0, - "8415": 46089904128.0, - "8420": 46089904128.0, - "8425": 46089904128.0, - "8430": 46089904128.0, - "8435": 46089904128.0, - "8440": 46089904128.0, - "8445": 46089904128.0, - "8450": 46089904128.0, - "8455": 46089904128.0, - "8460": 46089904128.0, - "8465": 46089904128.0, - "8470": 46089904128.0, - "8475": 46089904128.0, - "8480": 46089904128.0, - "8485": 46089904128.0, - "8490": 46089904128.0, - "8495": 46089904128.0, - "8500": 46089904128.0, - "8505": 46089904128.0, - "8510": 46089904128.0, - "8515": 46089904128.0, - "8520": 46089904128.0, - "8525": 46089904128.0, - "8530": 45938114560.0, - "8535": 45938114560.0, - "8540": 45938114560.0, - "8545": 45938114560.0, - "8550": 45938114560.0, - "8555": 45938114560.0, - "8560": 45938114560.0, - "8565": 45938114560.0, - "8570": 
45938114560.0, - "8575": 45938114560.0, - "8580": 45938114560.0, - "8585": 45938114560.0, - "8590": 45950377984.0, - "8595": 45950377984.0, - "8600": 45950377984.0, - "8605": 45950377984.0, - "8610": 45950377984.0, - "8615": 45950377984.0, - "8620": 45950377984.0, - "8625": 45950377984.0, - "8630": 45950377984.0, - "8635": 45950377984.0, - "8640": 45950377984.0, - "8645": 45950377984.0, - "8650": 45950377984.0, - "8655": 45950377984.0, - "8660": 45950377984.0, - "8665": 45950377984.0, - "8670": 45955510272.0, - "8675": 45955510272.0, - "8680": 45955510272.0, - "8685": 45955510272.0, - "8690": 45991550976.0, - "8695": 45991550976.0, - "8700": 45991550976.0, - "8705": 45991550976.0, - "8710": 45991550976.0, - "8715": 45991550976.0, - "8720": 45991550976.0, - "8725": 45991550976.0, - "8730": 45991550976.0, - "8735": 45991550976.0, - "8740": 46068584448.0, - "8745": 46068584448.0, - "8750": 46068584448.0, - "8755": 46068584448.0, - "8760": 46068584448.0, - "8765": 46068584448.0, - "8770": 46068584448.0, - "8775": 46068584448.0, - "8780": 46068584448.0, - "8785": 46068584448.0, - "8790": 46068584448.0, - "8795": 46068584448.0, - "8800": 46068584448.0, - "8805": 46068584448.0, - "8810": 46068584448.0, - "8815": 46068584448.0, - "8820": 46068584448.0, - "8825": 46068584448.0, - "8830": 46068584448.0, - "8835": 46068584448.0, - "8840": 46068584448.0, - "8845": 46068584448.0, - "8850": 46068584448.0, - "8855": 46184767488.0, - "8860": 46184767488.0, - "8865": 46184767488.0, - "8870": 46184767488.0, - "8875": 46184767488.0, - "8880": 46184767488.0, - "8885": 46184767488.0, - "8890": 46184767488.0, - "8895": 46184767488.0, - "8900": 46184767488.0, - "8905": 46184767488.0, - "8910": 46184767488.0, - "8915": 46184767488.0, - "8920": 46184767488.0, - "8925": 46184767488.0, - "8930": 46184767488.0, - "8935": 46184767488.0, - "8940": 46184767488.0, - "8945": 46184767488.0, - "8950": 46184767488.0, - "8955": 46184767488.0, - "8960": 46184767488.0, - "8965": 46184767488.0, - "8970": 
46184767488.0, - "8975": 46184767488.0, - "8980": 46184767488.0, - "8985": 46184767488.0, - "8990": 46184767488.0, - "8995": 46184767488.0, - "9000": 46184767488.0, - "9005": 46184767488.0, - "9010": 46184767488.0, - "9015": 46184767488.0, - "9020": 46184767488.0, - "9025": 46184767488.0, - "9030": 46184767488.0, - "9035": 46184767488.0, - "9040": 46184767488.0, - "9045": 46184767488.0, - "9050": 46184767488.0, - "9055": 46184767488.0, - "9060": 46184767488.0, - "9065": 46184767488.0, - "9070": 46184767488.0, - "9075": 46184767488.0, - "9080": 46184767488.0, - "9085": 46184767488.0, - "9090": 46184767488.0, - "9095": 46184767488.0, - "9100": 46184767488.0, - "9105": 46184767488.0, - "9110": 46184767488.0, - "9115": 46184767488.0, - "9120": 46184767488.0, - "9125": 46184767488.0, - "9130": 46184767488.0, - "9135": 46184767488.0, - "9140": 46184767488.0, - "9145": 46184767488.0, - "9150": 46184767488.0, - "9155": 46184767488.0, - "9160": 46184767488.0, - "9165": 46184767488.0, - "9170": 46184767488.0, - "9175": 46184767488.0, - "9180": 46184767488.0, - "9185": 46184767488.0, - "9190": 46184767488.0, - "9195": 46184767488.0, - "9200": 46184767488.0, - "9205": 46184767488.0, - "9210": 46184767488.0, - "9215": 46184767488.0, - "9220": 46184767488.0, - "9225": 46184767488.0, - "9230": 46184767488.0, - "9235": 46184767488.0, - "9240": 46184767488.0, - "9245": 46184767488.0, - "9250": 46184767488.0, - "9255": 46184767488.0, - "9260": 46184767488.0, - "9265": 46184767488.0, - "9270": 46184767488.0, - "9275": 46184767488.0, - "9280": 46184767488.0, - "9285": 46184767488.0, - "9290": 46184767488.0, - "9295": 46184767488.0, - "9300": 46184767488.0, - "9305": 46184767488.0, - "9310": 46184767488.0, - "9315": 46184767488.0, - "9320": 46184767488.0, - "9325": 46184767488.0, - "9330": 46184767488.0, - "9335": 46184767488.0, - "9340": 46184767488.0, - "9345": 46184767488.0, - "9350": 46184767488.0, - "9355": 46184767488.0, - "9360": 46184767488.0, - "9365": 46184767488.0, - "9370": 
46184767488.0, - "9375": 46184767488.0, - "9380": 46184767488.0, - "9385": 46184767488.0, - "9390": 46184767488.0, - "9395": 46184767488.0, - "9400": 46184767488.0, - "9405": 46184767488.0, - "9410": 46184767488.0, - "9415": 46184767488.0, - "9420": 46184767488.0, - "9425": 46184767488.0, - "9430": 46184767488.0, - "9435": 46184767488.0, - "9440": 46184767488.0, - "9445": 46184767488.0, - "9450": 46184767488.0, - "9455": 46184767488.0, - "9460": 46184767488.0, - "9465": 46184767488.0, - "9470": 46184767488.0, - "9475": 46184767488.0, - "9480": 46184767488.0, - "9485": 46184767488.0, - "9490": 46184767488.0, - "9495": 46184767488.0, - "9500": 46184767488.0, - "9505": 46184767488.0, - "9510": 46184767488.0, - "9515": 46184767488.0, - "9520": 46184767488.0, - "9525": 46184767488.0, - "9530": 46184767488.0, - "9535": 46184767488.0 - } - }, - "mtp_1 loss": { - "start_step": 1, - "end_step": 9535, - "step_interval": 5, - "values": { - "1": 13.88878, - "5": 13.88979, - "10": 13.88767, - "15": 13.88576, - "20": 13.88068, - "25": 13.87774, - "30": 13.85566, - "35": 13.84855, - "40": 13.84546, - "45": 13.82693, - "50": 13.74828, - "55": 13.7249, - "60": 13.70841, - "65": 13.67571, - "70": 13.63981, - "75": 13.44327, - "80": 13.36054, - "85": 13.2835, - "90": 13.18641, - "95": 13.0505, - "100": 12.90733, - "105": 12.74689, - "110": 12.48525, - "115": 12.26801, - "120": 12.04358, - "125": 11.87011, - "130": 11.74911, - "135": 11.5841, - "140": 11.3494, - "145": 11.26997, - "150": 11.11919, - "155": 11.0211, - "160": 10.88133, - "165": 10.75162, - "170": 10.65694, - "175": 10.59566, - "180": 10.43546, - "185": 10.42441, - "190": 10.27183, - "195": 10.2539, - "200": 10.12718, - "205": 9.97472, - "210": 9.94271, - "215": 9.92122, - "220": 9.78944, - "225": 9.77014, - "230": 9.73, - "235": 9.64372, - "240": 9.57366, - "245": 9.50499, - "250": 9.43776, - "255": 9.37037, - "260": 9.29579, - "265": 9.2411, - "270": 9.15629, - "275": 9.12851, - "280": 9.10516, - "285": 9.09815, - 
"290": 9.01068, - "295": 8.94828, - "300": 8.83207, - "305": 8.80663, - "310": 8.74389, - "315": 8.71813, - "320": 8.68425, - "325": 8.58706, - "330": 8.56208, - "335": 8.53307, - "340": 8.52937, - "345": 8.41091, - "350": 8.39973, - "355": 8.29759, - "360": 8.38348, - "365": 8.28981, - "370": 8.2833, - "375": 8.22588, - "380": 8.18359, - "385": 8.16998, - "390": 8.1467, - "395": 8.09789, - "400": 8.01583, - "405": 8.01349, - "410": 8.00377, - "415": 7.95012, - "420": 7.93109, - "425": 7.88677, - "430": 7.81895, - "435": 7.82989, - "440": 7.77278, - "445": 7.7493, - "450": 7.67877, - "455": 7.7063, - "460": 7.6532, - "465": 7.6329, - "470": 7.59885, - "475": 7.61277, - "480": 7.48436, - "485": 7.53153, - "490": 7.48574, - "495": 7.4714, - "500": 7.41282, - "505": 7.41932, - "510": 7.38698, - "515": 7.35645, - "520": 7.35102, - "525": 7.32559, - "530": 7.32588, - "535": 7.30357, - "540": 7.2179, - "545": 7.24022, - "550": 7.27618, - "555": 7.30238, - "560": 7.23984, - "565": 7.16321, - "570": 7.17228, - "575": 7.18898, - "580": 7.11497, - "585": 7.11901, - "590": 7.06121, - "595": 7.04317, - "600": 7.06682, - "605": 7.06137, - "610": 7.01939, - "615": 7.078, - "620": 6.98113, - "625": 6.95612, - "630": 6.96104, - "635": 6.98871, - "640": 6.96819, - "645": 6.95817, - "650": 7.00625, - "655": 7.00242, - "660": 6.89823, - "665": 6.88159, - "670": 6.84888, - "675": 6.93827, - "680": 6.89638, - "685": 6.85679, - "690": 6.83445, - "695": 6.79719, - "700": 6.79183, - "705": 6.78625, - "710": 6.82275, - "715": 6.82665, - "720": 6.71137, - "725": 6.76643, - "730": 6.75579, - "735": 6.75515, - "740": 6.70045, - "745": 6.67565, - "750": 6.73564, - "755": 6.65767, - "760": 6.66496, - "765": 6.65951, - "770": 6.68075, - "775": 6.65453, - "780": 6.62427, - "785": 6.64321, - "790": 6.59399, - "795": 6.59812, - "800": 6.5878, - "805": 6.65391, - "810": 6.51946, - "815": 6.5419, - "820": 6.55134, - "825": 6.55855, - "830": 6.57041, - "835": 6.52603, - "840": 6.49033, - "845": 
6.54438, - "850": 6.49874, - "855": 6.49335, - "860": 6.49024, - "865": 6.49642, - "870": 6.46222, - "875": 6.51054, - "880": 6.4748, - "885": 6.43786, - "890": 6.51246, - "895": 6.39629, - "900": 6.41895, - "905": 6.44341, - "910": 6.40617, - "915": 6.38978, - "920": 6.38772, - "925": 6.37391, - "930": 6.40825, - "935": 6.39755, - "940": 6.34172, - "945": 6.36869, - "950": 6.3953, - "955": 6.34893, - "960": 6.35406, - "965": 6.25416, - "970": 6.32381, - "975": 6.31262, - "980": 6.28797, - "985": 6.29222, - "990": 6.34527, - "995": 6.26326, - "1000": 6.28434, - "1005": 6.23155, - "1010": 6.26712, - "1015": 6.29352, - "1020": 6.20454, - "1025": 6.21082, - "1030": 6.20913, - "1035": 6.29924, - "1040": 6.22531, - "1045": 6.19943, - "1050": 6.2267, - "1055": 6.21777, - "1060": 6.1673, - "1065": 6.15758, - "1070": 6.19281, - "1075": 6.19093, - "1080": 6.19319, - "1085": 6.19606, - "1090": 6.17796, - "1095": 6.181, - "1100": 6.1397, - "1105": 6.11513, - "1110": 6.17787, - "1115": 6.11231, - "1120": 6.05286, - "1125": 6.08699, - "1130": 6.14167, - "1135": 6.09531, - "1140": 6.08221, - "1145": 6.06731, - "1150": 6.09458, - "1155": 6.06298, - "1160": 6.04607, - "1165": 6.09676, - "1170": 6.07336, - "1175": 6.04568, - "1180": 6.05058, - "1185": 6.04124, - "1190": 6.04961, - "1195": 6.02949, - "1200": 5.97329, - "1205": 6.07601, - "1210": 5.93751, - "1215": 5.98403, - "1220": 6.06306, - "1225": 5.95152, - "1230": 5.99877, - "1235": 5.95912, - "1240": 5.99322, - "1245": 5.97187, - "1250": 5.95299, - "1255": 5.94742, - "1260": 5.95227, - "1265": 5.93352, - "1270": 5.90818, - "1275": 5.96805, - "1280": 5.90416, - "1285": 5.92308, - "1290": 5.90725, - "1295": 5.92, - "1300": 5.9267, - "1305": 5.90057, - "1310": 5.83908, - "1315": 5.8992, - "1320": 5.89614, - "1325": 5.8271, - "1330": 5.88462, - "1335": 5.8531, - "1340": 5.91994, - "1345": 5.86667, - "1350": 5.84738, - "1355": 5.84415, - "1360": 5.85216, - "1365": 5.84478, - "1370": 5.79663, - "1375": 5.80667, - "1380": 5.86219, - 
"1385": 5.81826, - "1390": 5.81231, - "1395": 5.8299, - "1400": 5.83135, - "1405": 5.82032, - "1410": 5.78518, - "1415": 5.77017, - "1420": 5.8049, - "1425": 5.79565, - "1430": 5.83189, - "1435": 5.74562, - "1440": 5.76408, - "1445": 5.8071, - "1450": 5.78859, - "1455": 5.80534, - "1460": 5.75975, - "1465": 5.76379, - "1470": 5.8044, - "1475": 5.76985, - "1480": 5.77563, - "1485": 5.72396, - "1490": 5.72354, - "1495": 5.74538, - "1500": 5.75109, - "1505": 5.72321, - "1510": 5.74832, - "1515": 5.67052, - "1520": 5.70302, - "1525": 5.67385, - "1530": 5.69497, - "1535": 5.68565, - "1540": 5.672, - "1545": 5.7178, - "1550": 5.72274, - "1555": 5.70942, - "1560": 5.65211, - "1565": 5.69926, - "1570": 5.71179, - "1575": 5.6613, - "1580": 5.69275, - "1585": 5.67221, - "1590": 5.66087, - "1595": 5.63673, - "1600": 5.70849, - "1605": 5.64113, - "1610": 5.64353, - "1615": 5.63334, - "1620": 5.65496, - "1625": 5.64982, - "1630": 5.62727, - "1635": 5.67706, - "1640": 5.62761, - "1645": 5.6449, - "1650": 5.63803, - "1655": 5.62499, - "1660": 5.61278, - "1665": 5.60116, - "1670": 5.61214, - "1675": 5.62193, - "1680": 5.56155, - "1685": 5.57098, - "1690": 5.55098, - "1695": 5.55521, - "1700": 5.60178, - "1705": 5.57706, - "1710": 5.58407, - "1715": 5.54721, - "1720": 5.52704, - "1725": 5.56718, - "1730": 5.53148, - "1735": 5.58307, - "1740": 5.52337, - "1745": 5.55772, - "1750": 5.53213, - "1755": 5.5301, - "1760": 5.55304, - "1765": 5.5132, - "1770": 5.522, - "1775": 5.52704, - "1780": 5.53997, - "1785": 5.48896, - "1790": 5.52187, - "1795": 5.52448, - "1800": 5.4698, - "1805": 5.46326, - "1810": 5.47869, - "1815": 5.48464, - "1820": 5.48466, - "1825": 5.48352, - "1830": 5.46909, - "1835": 5.46355, - "1840": 5.46633, - "1845": 5.44723, - "1850": 5.42996, - "1855": 5.4834, - "1860": 5.43502, - "1865": 5.44302, - "1870": 5.43258, - "1875": 5.42823, - "1880": 5.491, - "1885": 5.45039, - "1890": 5.44132, - "1895": 5.38084, - "1900": 5.42123, - "1905": 5.41299, - "1910": 5.43539, - 
"1915": 5.4013, - "1920": 5.37729, - "1925": 5.4085, - "1930": 5.37579, - "1935": 5.39731, - "1940": 5.3727, - "1945": 5.4174, - "1950": 5.45899, - "1955": 5.39197, - "1960": 5.39342, - "1965": 5.34213, - "1970": 5.34023, - "1975": 5.40413, - "1980": 5.35398, - "1985": 5.37376, - "1990": 5.39658, - "1995": 5.37398, - "2000": 5.38469, - "2005": 5.42838, - "2010": 5.32884, - "2015": 5.32047, - "2020": 5.32991, - "2025": 5.37403, - "2030": 5.31228, - "2035": 5.33119, - "2040": 5.29466, - "2045": 5.38332, - "2050": 5.35716, - "2055": 5.33062, - "2060": 5.32903, - "2065": 5.29751, - "2070": 5.29985, - "2075": 5.32708, - "2080": 5.29709, - "2085": 5.32918, - "2090": 5.24905, - "2095": 5.29587, - "2100": 5.25777, - "2105": 5.28625, - "2110": 5.28042, - "2115": 5.28102, - "2120": 5.2839, - "2125": 5.24699, - "2130": 5.25602, - "2135": 5.25599, - "2140": 5.26607, - "2145": 5.22772, - "2150": 5.24774, - "2155": 5.22588, - "2160": 5.24123, - "2165": 5.22937, - "2170": 5.26626, - "2175": 5.2603, - "2180": 5.24294, - "2185": 5.24675, - "2190": 5.22691, - "2195": 5.20127, - "2200": 5.20409, - "2205": 5.2127, - "2210": 5.25738, - "2215": 5.30103, - "2220": 5.24446, - "2225": 5.2194, - "2230": 5.21789, - "2235": 5.25766, - "2240": 5.16329, - "2245": 5.1607, - "2250": 5.18607, - "2255": 5.19635, - "2260": 5.13701, - "2265": 5.21276, - "2270": 5.14278, - "2275": 5.19722, - "2280": 5.17159, - "2285": 5.18798, - "2290": 5.17456, - "2295": 5.18141, - "2300": 5.17912, - "2305": 5.15551, - "2310": 5.1834, - "2315": 5.12144, - "2320": 5.17039, - "2325": 5.14984, - "2330": 5.15156, - "2335": 5.13195, - "2340": 5.13852, - "2345": 5.18732, - "2350": 5.12945, - "2355": 5.11891, - "2360": 5.10445, - "2365": 5.11898, - "2370": 5.10258, - "2375": 5.11122, - "2380": 5.05395, - "2385": 5.09747, - "2390": 5.11702, - "2395": 5.1322, - "2400": 5.07944, - "2405": 5.06236, - "2410": 5.11554, - "2415": 5.09106, - "2420": 5.10878, - "2425": 5.06863, - "2430": 5.09273, - "2435": 5.08666, - "2440": 
5.07515, - "2445": 5.08608, - "2450": 5.04943, - "2455": 5.09523, - "2460": 5.04536, - "2465": 5.08334, - "2470": 5.07644, - "2475": 5.11246, - "2480": 5.02872, - "2485": 5.05906, - "2490": 5.05297, - "2495": 5.04377, - "2500": 5.04447, - "2505": 5.05124, - "2510": 5.0909, - "2515": 5.08005, - "2520": 5.02414, - "2525": 5.03617, - "2530": 5.05281, - "2535": 5.04127, - "2540": 5.04342, - "2545": 5.05498, - "2550": 4.99288, - "2555": 5.05988, - "2560": 5.03403, - "2565": 5.00279, - "2570": 5.02524, - "2575": 4.98811, - "2580": 5.00235, - "2585": 4.98259, - "2590": 5.00195, - "2595": 4.95577, - "2600": 4.99616, - "2605": 5.01565, - "2610": 5.00846, - "2615": 4.9779, - "2620": 4.96, - "2625": 4.99167, - "2630": 4.92069, - "2635": 5.00179, - "2640": 5.00217, - "2645": 4.95857, - "2650": 4.98056, - "2655": 4.97276, - "2660": 4.91658, - "2665": 5.00931, - "2670": 4.95271, - "2675": 4.92627, - "2680": 4.95939, - "2685": 4.9606, - "2690": 4.92299, - "2695": 4.99925, - "2700": 4.90798, - "2705": 4.92161, - "2710": 4.9625, - "2715": 4.94083, - "2720": 4.97062, - "2725": 4.91977, - "2730": 4.9445, - "2735": 4.9369, - "2740": 4.92939, - "2745": 4.89678, - "2750": 4.93832, - "2755": 4.94144, - "2760": 4.94244, - "2765": 4.91315, - "2770": 4.95527, - "2775": 4.90029, - "2780": 4.93753, - "2785": 4.91159, - "2790": 4.93952, - "2795": 4.89812, - "2800": 4.84327, - "2805": 4.89103, - "2810": 4.88284, - "2815": 4.89434, - "2820": 4.93504, - "2825": 4.92479, - "2830": 4.90086, - "2835": 4.90451, - "2840": 4.89553, - "2845": 4.87238, - "2850": 4.90777, - "2855": 4.83628, - "2860": 4.89239, - "2865": 4.90134, - "2870": 4.89048, - "2875": 4.90822, - "2880": 4.82774, - "2885": 4.8758, - "2890": 4.84909, - "2895": 4.88906, - "2900": 4.84436, - "2905": 4.85096, - "2910": 4.84745, - "2915": 4.89554, - "2920": 4.87192, - "2925": 4.84408, - "2930": 4.83304, - "2935": 4.83856, - "2940": 4.8364, - "2945": 4.80087, - "2950": 4.79094, - "2955": 4.79257, - "2960": 4.81394, - "2965": 4.82244, - 
"2970": 4.83033, - "2975": 4.843, - "2980": 4.78708, - "2985": 4.83546, - "2990": 4.84632, - "2995": 4.79479, - "3000": 4.79957, - "3005": 4.7852, - "3010": 4.81747, - "3015": 4.77707, - "3020": 4.79613, - "3025": 4.80689, - "3030": 4.81521, - "3035": 4.81107, - "3040": 4.83014, - "3045": 4.81253, - "3050": 4.78854, - "3055": 4.79109, - "3060": 4.77291, - "3065": 4.80026, - "3070": 4.82011, - "3075": 4.75177, - "3080": 4.78059, - "3085": 4.7825, - "3090": 4.76596, - "3095": 4.80833, - "3100": 4.79656, - "3105": 4.77177, - "3110": 4.76085, - "3115": 4.71609, - "3120": 4.78235, - "3125": 4.74714, - "3130": 4.75497, - "3135": 4.75435, - "3140": 4.7318, - "3145": 4.71606, - "3150": 4.74842, - "3155": 4.78313, - "3160": 4.765, - "3165": 4.75911, - "3170": 4.7541, - "3175": 4.746, - "3180": 4.73371, - "3185": 4.70655, - "3190": 4.70906, - "3195": 4.70876, - "3200": 4.67795, - "3205": 4.72527, - "3210": 4.67973, - "3215": 4.71138, - "3220": 4.67941, - "3225": 4.71501, - "3230": 4.698, - "3235": 4.73415, - "3240": 4.68214, - "3245": 4.6954, - "3250": 4.64543, - "3255": 4.69551, - "3260": 4.67926, - "3265": 4.72582, - "3270": 4.70744, - "3275": 4.65457, - "3280": 4.68021, - "3285": 4.69583, - "3290": 4.66845, - "3295": 4.67202, - "3300": 4.66858, - "3305": 4.67172, - "3310": 4.66314, - "3315": 4.70829, - "3320": 4.64885, - "3325": 4.65812, - "3330": 4.64245, - "3335": 4.65293, - "3340": 4.62608, - "3345": 4.64548, - "3350": 4.65071, - "3355": 4.65765, - "3360": 4.64823, - "3365": 4.66194, - "3370": 4.63984, - "3375": 4.67722, - "3380": 4.61449, - "3385": 4.62869, - "3390": 4.60608, - "3395": 4.6967, - "3400": 4.64188, - "3405": 4.6721, - "3410": 4.60581, - "3415": 4.55337, - "3420": 4.61467, - "3425": 4.63228, - "3430": 4.66874, - "3435": 4.63419, - "3440": 4.65338, - "3445": 4.60093, - "3450": 4.59889, - "3455": 4.62429, - "3460": 4.58089, - "3465": 4.57689, - "3470": 4.59454, - "3475": 4.60079, - "3480": 4.59374, - "3485": 4.62356, - "3490": 4.60917, - "3495": 4.63221, - 
"3500": 4.59027, - "3505": 4.59844, - "3510": 4.59797, - "3515": 4.648, - "3520": 4.62554, - "3525": 4.57245, - "3530": 4.58587, - "3535": 4.58174, - "3540": 4.63653, - "3545": 4.56212, - "3550": 4.62056, - "3555": 4.55332, - "3560": 4.62414, - "3565": 4.55473, - "3570": 4.56696, - "3575": 4.53468, - "3580": 4.59878, - "3585": 4.58068, - "3590": 4.51872, - "3595": 4.58848, - "3600": 4.55395, - "3605": 4.53571, - "3610": 4.54008, - "3615": 4.56874, - "3620": 4.61691, - "3625": 4.55023, - "3630": 4.59867, - "3635": 4.50879, - "3640": 4.52782, - "3645": 4.56947, - "3650": 4.53552, - "3655": 4.54665, - "3660": 4.55228, - "3665": 4.58643, - "3670": 4.54047, - "3675": 4.55594, - "3680": 4.57348, - "3685": 4.49418, - "3690": 4.54299, - "3695": 4.49297, - "3700": 4.52866, - "3705": 4.50654, - "3710": 4.51966, - "3715": 4.53, - "3720": 4.50118, - "3725": 4.47886, - "3730": 4.4879, - "3735": 4.50546, - "3740": 4.49399, - "3745": 4.48041, - "3750": 4.51288, - "3755": 4.48915, - "3760": 4.50004, - "3765": 4.47669, - "3770": 4.48984, - "3775": 4.46969, - "3780": 4.45476, - "3785": 4.50898, - "3790": 4.42336, - "3795": 4.4846, - "3800": 4.46028, - "3805": 4.46023, - "3810": 4.42629, - "3815": 4.4806, - "3820": 4.4736, - "3825": 4.4803, - "3830": 4.46747, - "3835": 4.42638, - "3840": 4.52349, - "3845": 4.48225, - "3850": 4.42266, - "3855": 4.46223, - "3860": 4.48001, - "3865": 4.44144, - "3870": 4.50523, - "3875": 4.41439, - "3880": 4.42672, - "3885": 4.44983, - "3890": 4.43819, - "3895": 4.38007, - "3900": 4.43434, - "3905": 4.41283, - "3910": 4.42081, - "3915": 4.42082, - "3920": 4.41329, - "3925": 4.39336, - "3930": 4.41243, - "3935": 4.41903, - "3940": 4.41848, - "3945": 4.39397, - "3950": 4.46098, - "3955": 4.39087, - "3960": 4.43851, - "3965": 4.44901, - "3970": 4.39272, - "3975": 4.40242, - "3980": 4.37236, - "3985": 4.40832, - "3990": 4.40208, - "3995": 4.44335, - "4000": 4.38322, - "4005": 4.37255, - "4010": 4.40982, - "4015": 4.39813, - "4020": 4.43488, - "4025": 
4.39111, - "4030": 4.44761, - "4035": 4.40548, - "4040": 4.43553, - "4045": 4.41155, - "4050": 4.40643, - "4055": 4.41393, - "4060": 4.40665, - "4065": 4.41291, - "4070": 4.34904, - "4075": 4.37708, - "4080": 4.35797, - "4085": 4.39736, - "4090": 4.37437, - "4095": 4.35826, - "4100": 4.37323, - "4105": 4.36208, - "4110": 4.32609, - "4115": 4.39421, - "4120": 4.31057, - "4125": 4.31168, - "4130": 4.39302, - "4135": 4.37289, - "4140": 4.31616, - "4145": 4.32788, - "4150": 4.37558, - "4155": 4.29766, - "4160": 4.35633, - "4165": 4.38157, - "4170": 4.32646, - "4175": 4.33285, - "4180": 4.32735, - "4185": 4.31953, - "4190": 4.31017, - "4195": 4.31525, - "4200": 4.31406, - "4205": 4.37, - "4210": 4.32695, - "4215": 4.3562, - "4220": 4.33701, - "4225": 4.32036, - "4230": 4.30579, - "4235": 4.35051, - "4240": 4.30872, - "4245": 4.31564, - "4250": 4.29999, - "4255": 4.31166, - "4260": 4.29019, - "4265": 4.30554, - "4270": 4.29954, - "4275": 4.36276, - "4280": 4.29798, - "4285": 4.33284, - "4290": 4.27741, - "4295": 4.30368, - "4300": 4.32594, - "4305": 4.29066, - "4310": 4.33408, - "4315": 4.3163, - "4320": 4.30571, - "4325": 4.32764, - "4330": 4.26525, - "4335": 4.30418, - "4340": 4.28838, - "4345": 4.23753, - "4350": 4.25927, - "4355": 4.33009, - "4360": 4.30543, - "4365": 4.30411, - "4370": 4.28149, - "4375": 4.24372, - "4380": 4.25559, - "4385": 4.23331, - "4390": 4.30895, - "4395": 4.27518, - "4400": 4.26254, - "4405": 4.23007, - "4410": 4.28048, - "4415": 4.26816, - "4420": 4.24916, - "4425": 4.29252, - "4430": 4.24244, - "4435": 4.29049, - "4440": 4.28601, - "4445": 4.24232, - "4450": 4.20719, - "4455": 4.26016, - "4460": 4.23459, - "4465": 4.25243, - "4470": 4.23841, - "4475": 4.2641, - "4480": 4.24909, - "4485": 4.23389, - "4490": 4.23593, - "4495": 4.17962, - "4500": 4.25444, - "4505": 4.22942, - "4510": 4.23965, - "4515": 4.19566, - "4520": 4.23113, - "4525": 4.19456, - "4530": 4.24001, - "4535": 4.20166, - "4540": 4.21127, - "4545": 4.23188, - "4550": 4.27088, - 
"4555": 4.2072, - "4560": 4.22378, - "4565": 4.15426, - "4570": 4.21606, - "4575": 4.1941, - "4580": 4.25747, - "4585": 4.22428, - "4590": 4.21266, - "4595": 4.17399, - "4600": 4.16313, - "4605": 4.2045, - "4610": 4.19939, - "4615": 4.24443, - "4620": 4.16447, - "4625": 4.19099, - "4630": 4.20991, - "4635": 4.18208, - "4640": 4.21078, - "4645": 4.20652, - "4650": 4.22758, - "4655": 4.19246, - "4660": 4.18248, - "4665": 4.193, - "4670": 4.23574, - "4675": 4.17989, - "4680": 4.20859, - "4685": 4.19688, - "4690": 4.1723, - "4695": 4.18485, - "4700": 4.16546, - "4705": 4.14067, - "4710": 4.20305, - "4715": 4.19002, - "4720": 4.14737, - "4725": 4.12216, - "4730": 4.17809, - "4735": 4.10178, - "4740": 4.14697, - "4745": 4.18779, - "4750": 4.13615, - "4755": 4.19424, - "4760": 4.1984, - "4765": 4.1461, - "4770": 4.14849, - "4775": 4.14773, - "4780": 4.15523, - "4785": 4.13664, - "4790": 4.19224, - "4795": 4.17628, - "4800": 4.13942, - "4805": 4.17839, - "4810": 4.1375, - "4815": 4.17167, - "4820": 4.12226, - "4825": 4.17474, - "4830": 4.16985, - "4835": 4.14976, - "4840": 4.15298, - "4845": 4.10968, - "4850": 4.17354, - "4855": 4.17639, - "4860": 4.11236, - "4865": 4.13759, - "4870": 4.13215, - "4875": 4.17643, - "4880": 4.1702, - "4885": 4.13029, - "4890": 4.1249, - "4895": 4.12403, - "4900": 4.09958, - "4905": 4.09173, - "4910": 4.09074, - "4915": 4.14665, - "4920": 4.12021, - "4925": 4.08814, - "4930": 4.09778, - "4935": 4.12094, - "4940": 4.04981, - "4945": 4.13369, - "4950": 4.07708, - "4955": 4.15684, - "4960": 4.11652, - "4965": 4.1151, - "4970": 4.09971, - "4975": 4.11736, - "4980": 4.12585, - "4985": 4.12754, - "4990": 4.09005, - "4995": 4.12916, - "5000": 4.05682, - "5005": 4.11701, - "5010": 4.10942, - "5015": 4.07584, - "5020": 4.05201, - "5025": 4.06082, - "5030": 4.10005, - "5035": 4.08177, - "5040": 4.0418, - "5045": 4.11064, - "5050": 4.06425, - "5055": 4.08995, - "5060": 4.03143, - "5065": 4.09666, - "5070": 4.07056, - "5075": 4.12386, - "5080": 4.07795, 
- "5085": 4.09595, - "5090": 4.07748, - "5095": 4.0424, - "5100": 4.0782, - "5105": 4.0809, - "5110": 4.08612, - "5115": 4.07663, - "5120": 4.09438, - "5125": 4.05976, - "5130": 4.06327, - "5135": 4.0488, - "5140": 4.06922, - "5145": 4.05942, - "5150": 4.07092, - "5155": 4.07553, - "5160": 4.05549, - "5165": 4.09766, - "5170": 3.96642, - "5175": 4.07515, - "5180": 4.03531, - "5185": 4.05861, - "5190": 4.08092, - "5195": 4.04601, - "5200": 4.06577, - "5205": 4.09747, - "5210": 4.01055, - "5215": 4.02373, - "5220": 4.02621, - "5225": 4.02349, - "5230": 4.06271, - "5235": 4.03585, - "5240": 4.02422, - "5245": 4.04177, - "5250": 4.04544, - "5255": 4.03173, - "5260": 4.04798, - "5265": 4.01495, - "5270": 3.98673, - "5275": 4.00519, - "5280": 4.02024, - "5285": 4.04277, - "5290": 4.00304, - "5295": 4.00093, - "5300": 4.02323, - "5305": 4.01012, - "5310": 4.0478, - "5315": 3.99571, - "5320": 4.03864, - "5325": 4.06497, - "5330": 3.99981, - "5335": 4.02122, - "5340": 3.9739, - "5345": 4.01424, - "5350": 4.0246, - "5355": 4.01714, - "5360": 3.9668, - "5365": 3.98455, - "5370": 4.02892, - "5375": 3.99384, - "5380": 3.98952, - "5385": 4.00787, - "5390": 3.99585, - "5395": 3.932, - "5400": 4.02192, - "5405": 3.94401, - "5410": 4.03103, - "5415": 3.94954, - "5420": 3.98108, - "5425": 3.96619, - "5430": 3.97462, - "5435": 4.00917, - "5440": 3.96082, - "5445": 3.96843, - "5450": 3.98078, - "5455": 3.96312, - "5460": 3.97781, - "5465": 4.03343, - "5470": 3.99301, - "5475": 3.92634, - "5480": 4.0001, - "5485": 3.96789, - "5490": 3.99381, - "5495": 3.99755, - "5500": 3.95394, - "5505": 3.9702, - "5510": 4.00139, - "5515": 3.97886, - "5520": 3.95723, - "5525": 4.01089, - "5530": 3.95723, - "5535": 3.99058, - "5540": 3.95888, - "5545": 3.97704, - "5550": 3.97005, - "5555": 3.93134, - "5560": 3.94203, - "5565": 3.98688, - "5570": 3.94409, - "5575": 3.97691, - "5580": 3.95423, - "5585": 3.89232, - "5590": 3.96662, - "5595": 3.91996, - "5600": 3.97099, - "5605": 3.87423, - "5610": 
3.96509, - "5615": 3.9629, - "5620": 3.97882, - "5625": 3.95843, - "5630": 3.94884, - "5635": 3.92989, - "5640": 3.95308, - "5645": 3.91537, - "5650": 3.88759, - "5655": 3.91914, - "5660": 3.9101, - "5665": 3.92739, - "5670": 3.91107, - "5675": 3.94487, - "5680": 3.91238, - "5685": 3.92365, - "5690": 3.92517, - "5695": 3.953, - "5700": 3.88996, - "5705": 3.88995, - "5710": 3.87532, - "5715": 3.99623, - "5720": 3.94505, - "5725": 3.89527, - "5730": 3.94792, - "5735": 3.92817, - "5740": 3.92171, - "5745": 3.89897, - "5750": 3.92176, - "5755": 3.94672, - "5760": 3.92632, - "5765": 3.92024, - "5770": 3.95286, - "5775": 3.86965, - "5780": 3.91041, - "5785": 3.91605, - "5790": 3.9236, - "5795": 3.93068, - "5800": 3.86954, - "5805": 3.8764, - "5810": 3.92692, - "5815": 3.89083, - "5820": 3.84021, - "5825": 3.89285, - "5830": 3.85163, - "5835": 3.88292, - "5840": 3.89361, - "5845": 3.91293, - "5850": 3.90508, - "5855": 3.84956, - "5860": 3.87018, - "5865": 3.8979, - "5870": 3.85816, - "5875": 3.89604, - "5880": 3.88075, - "5885": 3.89965, - "5890": 3.90395, - "5895": 3.92339, - "5900": 3.85618, - "5905": 3.92033, - "5910": 3.88782, - "5915": 3.85158, - "5920": 3.88999, - "5925": 3.82174, - "5930": 3.88478, - "5935": 3.86887, - "5940": 3.89924, - "5945": 3.90324, - "5950": 3.88472, - "5955": 3.83758, - "5960": 3.91077, - "5965": 3.85295, - "5970": 3.90592, - "5975": 3.87131, - "5980": 3.94635, - "5985": 3.81828, - "5990": 3.91445, - "5995": 3.82666, - "6000": 3.86389, - "6005": 3.82737, - "6010": 3.84638, - "6015": 3.82528, - "6020": 3.84213, - "6025": 3.8812, - "6030": 3.82864, - "6035": 3.87549, - "6040": 3.85371, - "6045": 3.88892, - "6050": 3.86125, - "6055": 3.84398, - "6060": 3.86538, - "6065": 3.8955, - "6070": 3.844, - "6075": 3.79156, - "6080": 3.86497, - "6085": 3.82767, - "6090": 3.86054, - "6095": 3.85995, - "6100": 3.82399, - "6105": 3.87238, - "6110": 3.80525, - "6115": 3.87931, - "6120": 3.85374, - "6125": 3.85469, - "6130": 3.85122, - "6135": 3.82709, - 
"6140": 3.8225, - "6145": 3.81264, - "6150": 3.85853, - "6155": 3.83605, - "6160": 3.80232, - "6165": 3.82292, - "6170": 3.81513, - "6175": 3.80691, - "6180": 3.8071, - "6185": 3.84448, - "6190": 3.81178, - "6195": 3.78014, - "6200": 3.80543, - "6205": 3.81219, - "6210": 3.77002, - "6215": 3.82559, - "6220": 3.822, - "6225": 3.82598, - "6230": 3.76955, - "6235": 3.8072, - "6240": 3.73374, - "6245": 3.84624, - "6250": 3.80845, - "6255": 3.8223, - "6260": 3.7948, - "6265": 3.82819, - "6270": 3.75673, - "6275": 3.78492, - "6280": 3.80313, - "6285": 3.78154, - "6290": 3.79976, - "6295": 3.80168, - "6300": 3.80756, - "6305": 3.88253, - "6310": 3.7702, - "6315": 3.7633, - "6320": 3.81817, - "6325": 3.75526, - "6330": 3.82862, - "6335": 3.81943, - "6340": 3.76721, - "6345": 3.82391, - "6350": 3.76718, - "6355": 3.77414, - "6360": 3.75111, - "6365": 3.80986, - "6370": 3.81014, - "6375": 3.78548, - "6380": 3.8065, - "6385": 3.82336, - "6390": 3.78289, - "6395": 3.75935, - "6400": 3.76038, - "6405": 3.83749, - "6410": 3.83127, - "6415": 3.7623, - "6420": 3.82306, - "6425": 3.83219, - "6430": 3.81048, - "6435": 3.77764, - "6440": 3.76108, - "6445": 3.80173, - "6450": 3.73884, - "6455": 3.75156, - "6460": 3.77352, - "6465": 3.80905, - "6470": 3.78701, - "6475": 3.78176, - "6480": 3.81548, - "6485": 3.76414, - "6490": 3.71291, - "6495": 3.81407, - "6500": 3.79809, - "6505": 3.72741, - "6510": 3.7976, - "6515": 3.81938, - "6520": 3.73166, - "6525": 3.80464, - "6530": 3.76853, - "6535": 3.76159, - "6540": 3.82675, - "6545": 3.76261, - "6550": 3.76963, - "6555": 3.75505, - "6560": 3.71108, - "6565": 3.70887, - "6570": 3.7465, - "6575": 3.69338, - "6580": 3.81517, - "6585": 3.76239, - "6590": 3.72546, - "6595": 3.74461, - "6600": 3.73687, - "6605": 3.71668, - "6610": 3.72679, - "6615": 3.76079, - "6620": 3.70966, - "6625": 3.72313, - "6630": 3.72114, - "6635": 3.76232, - "6640": 3.73374, - "6645": 3.75061, - "6650": 3.77922, - "6655": 3.70627, - "6660": 3.73531, - "6665": 3.7573, - 
"6670": 3.71979, - "6675": 3.74124, - "6680": 3.73477, - "6685": 3.76436, - "6690": 3.74256, - "6695": 3.75545, - "6700": 3.74559, - "6705": 3.72882, - "6710": 3.72913, - "6715": 3.69291, - "6720": 3.77736, - "6725": 3.75737, - "6730": 3.73993, - "6735": 3.74082, - "6740": 3.73806, - "6745": 3.72041, - "6750": 3.74412, - "6755": 3.69337, - "6760": 3.68122, - "6765": 3.74232, - "6770": 3.69625, - "6775": 3.74604, - "6780": 3.70485, - "6785": 3.70942, - "6790": 3.73683, - "6795": 3.69846, - "6800": 3.71752, - "6805": 3.72172, - "6810": 3.73628, - "6815": 3.65876, - "6820": 3.70229, - "6825": 3.72745, - "6830": 3.70872, - "6835": 3.68623, - "6840": 3.67517, - "6845": 3.74818, - "6850": 3.70405, - "6855": 3.73713, - "6860": 3.6695, - "6865": 3.73585, - "6870": 3.6953, - "6875": 3.69781, - "6880": 3.70324, - "6885": 3.67727, - "6890": 3.69236, - "6895": 3.67848, - "6900": 3.68133, - "6905": 3.68771, - "6910": 3.72919, - "6915": 3.73359, - "6920": 3.68934, - "6925": 3.69022, - "6930": 3.68858, - "6935": 3.62056, - "6940": 3.68927, - "6945": 3.67777, - "6950": 3.68038, - "6955": 3.6771, - "6960": 3.68108, - "6965": 3.72225, - "6970": 3.64603, - "6975": 3.72781, - "6980": 3.68459, - "6985": 3.68985, - "6990": 3.7316, - "6995": 3.70495, - "7000": 3.63993, - "7005": 3.71744, - "7010": 3.69223, - "7015": 3.67561, - "7020": 3.72152, - "7025": 3.70969, - "7030": 3.70236, - "7035": 3.65723, - "7040": 3.61488, - "7045": 3.69518, - "7050": 3.71947, - "7055": 3.64991, - "7060": 3.69149, - "7065": 3.74261, - "7070": 3.67108, - "7075": 3.67419, - "7080": 3.71683, - "7085": 3.64191, - "7090": 3.66318, - "7095": 3.63818, - "7100": 3.68341, - "7105": 3.62024, - "7110": 3.68873, - "7115": 3.63797, - "7120": 3.68741, - "7125": 3.63499, - "7130": 3.65311, - "7135": 3.66196, - "7140": 3.66504, - "7145": 3.68183, - "7150": 3.62677, - "7155": 3.69052, - "7160": 3.62415, - "7165": 3.64241, - "7170": 3.68231, - "7175": 3.64603, - "7180": 3.67571, - "7185": 3.70721, - "7190": 3.663, - "7195": 
3.66862, - "7200": 3.67265, - "7205": 3.65833, - "7210": 3.68834, - "7215": 3.67282, - "7220": 3.69117, - "7225": 3.66107, - "7230": 3.68593, - "7235": 3.64823, - "7240": 3.64663, - "7245": 3.66574, - "7250": 3.60447, - "7255": 3.62598, - "7260": 3.68023, - "7265": 3.60288, - "7270": 3.63936, - "7275": 3.64805, - "7280": 3.62623, - "7285": 3.65053, - "7290": 3.6735, - "7295": 3.66357, - "7300": 3.62393, - "7305": 3.62784, - "7310": 3.66312, - "7315": 3.67632, - "7320": 3.65015, - "7325": 3.65453, - "7330": 3.62344, - "7335": 3.62574, - "7340": 3.64422, - "7345": 3.60533, - "7350": 3.65727, - "7355": 3.64352, - "7360": 3.61779, - "7365": 3.63578, - "7370": 3.6188, - "7375": 3.59366, - "7380": 3.64743, - "7385": 3.67218, - "7390": 3.65876, - "7395": 3.60688, - "7400": 3.65695, - "7405": 3.64945, - "7410": 3.66151, - "7415": 3.64439, - "7420": 3.63591, - "7425": 3.6844, - "7430": 3.63181, - "7435": 3.61154, - "7440": 3.62564, - "7445": 3.60843, - "7450": 3.57301, - "7455": 3.64772, - "7460": 3.63452, - "7465": 3.63169, - "7470": 3.63744, - "7475": 3.64264, - "7480": 3.61171, - "7485": 3.57567, - "7490": 3.57599, - "7495": 3.5863, - "7500": 3.61565, - "7505": 3.59614, - "7510": 3.55707, - "7515": 3.61683, - "7520": 3.60991, - "7525": 3.56658, - "7530": 3.61196, - "7535": 3.62507, - "7540": 3.61046, - "7545": 3.64639, - "7550": 3.65882, - "7555": 3.58595, - "7560": 3.60212, - "7565": 3.59782, - "7570": 3.60603, - "7575": 3.57351, - "7580": 3.62111, - "7585": 3.60137, - "7590": 3.6026, - "7595": 3.66318, - "7600": 3.6076, - "7605": 3.59626, - "7610": 3.58483, - "7615": 3.58478, - "7620": 3.56787, - "7625": 3.62193, - "7630": 3.60469, - "7635": 3.5928, - "7640": 3.59019, - "7645": 3.62279, - "7650": 3.6259, - "7655": 3.66371, - "7660": 3.5305, - "7665": 3.60545, - "7670": 3.59796, - "7675": 3.58201, - "7680": 3.57701, - "7685": 3.64556, - "7690": 3.59102, - "7695": 3.57063, - "7700": 3.63352, - "7705": 3.58816, - "7710": 3.62048, - "7715": 3.5764, - "7720": 3.65561, - 
"7725": 3.55706, - "7730": 3.57614, - "7735": 3.61006, - "7740": 3.58168, - "7745": 3.58454, - "7750": 3.57422, - "7755": 3.59202, - "7760": 3.56089, - "7765": 3.58551, - "7770": 3.60104, - "7775": 3.57103, - "7780": 3.55457, - "7785": 3.57713, - "7790": 3.57042, - "7795": 3.58792, - "7800": 3.57997, - "7805": 3.58361, - "7810": 3.60683, - "7815": 3.57773, - "7820": 3.57578, - "7825": 3.61835, - "7830": 3.59192, - "7835": 3.52632, - "7840": 3.6194, - "7845": 3.55538, - "7850": 3.51354, - "7855": 3.56599, - "7860": 3.54645, - "7865": 3.60369, - "7870": 3.54114, - "7875": 3.55695, - "7880": 3.572, - "7885": 3.56229, - "7890": 3.60585, - "7895": 3.59334, - "7900": 3.60641, - "7905": 3.56339, - "7910": 3.58203, - "7915": 3.58298, - "7920": 3.59012, - "7925": 3.5681, - "7930": 3.59927, - "7935": 3.56169, - "7940": 3.60948, - "7945": 3.62723, - "7950": 3.53708, - "7955": 3.54481, - "7960": 3.53124, - "7965": 3.51862, - "7970": 3.52486, - "7975": 3.55975, - "7980": 3.56722, - "7985": 3.54114, - "7990": 3.54399, - "7995": 3.5186, - "8000": 3.57756, - "8005": 3.54643, - "8010": 3.53705, - "8015": 3.53445, - "8020": 3.53111, - "8025": 3.51514, - "8030": 3.54148, - "8035": 3.53478, - "8040": 3.52163, - "8045": 3.57586, - "8050": 3.57789, - "8055": 3.54866, - "8060": 3.5712, - "8065": 3.54757, - "8070": 3.53654, - "8075": 3.52629, - "8080": 3.57467, - "8085": 3.52928, - "8090": 3.53424, - "8095": 3.56313, - "8100": 3.51543, - "8105": 3.54752, - "8110": 3.5453, - "8115": 3.51645, - "8120": 3.52703, - "8125": 3.56437, - "8130": 3.52567, - "8135": 3.53994, - "8140": 3.52104, - "8145": 3.50389, - "8150": 3.52394, - "8155": 3.51178, - "8160": 3.56129, - "8165": 3.54328, - "8170": 3.5116, - "8175": 3.5057, - "8180": 3.57245, - "8185": 3.54733, - "8190": 3.58207, - "8195": 3.55001, - "8200": 3.52156, - "8205": 3.52888, - "8210": 3.53558, - "8215": 3.55713, - "8220": 3.5201, - "8225": 3.51201, - "8230": 3.53756, - "8235": 3.55814, - "8240": 3.54052, - "8245": 3.53652, - "8250": 
3.5692, - "8255": 3.51844, - "8260": 3.52912, - "8265": 3.52072, - "8270": 3.52843, - "8275": 3.51526, - "8280": 3.50321, - "8285": 3.52669, - "8290": 3.5272, - "8295": 3.49645, - "8300": 3.51721, - "8305": 3.53958, - "8310": 3.5351, - "8315": 3.50396, - "8320": 3.53046, - "8325": 3.47885, - "8330": 3.44388, - "8335": 3.51457, - "8340": 3.54076, - "8345": 3.49873, - "8350": 3.51134, - "8355": 3.54342, - "8360": 3.51607, - "8365": 3.53716, - "8370": 3.53127, - "8375": 3.48696, - "8380": 3.4848, - "8385": 3.52879, - "8390": 3.49474, - "8395": 3.52721, - "8400": 3.49636, - "8405": 3.51685, - "8410": 3.57651, - "8415": 3.48228, - "8420": 3.45216, - "8425": 3.53401, - "8430": 3.53787, - "8435": 3.47534, - "8440": 3.55163, - "8445": 3.53658, - "8450": 3.50995, - "8455": 3.52875, - "8460": 3.53463, - "8465": 3.4708, - "8470": 3.4929, - "8475": 3.55004, - "8480": 3.47555, - "8485": 3.49487, - "8490": 3.48489, - "8495": 3.48023, - "8500": 3.52888, - "8505": 3.46749, - "8510": 3.54064, - "8515": 3.48982, - "8520": 3.49184, - "8525": 3.42254, - "8530": 3.50181, - "8535": 3.52351, - "8540": 3.47484, - "8545": 3.49944, - "8550": 3.46881, - "8555": 3.53517, - "8560": 3.5346, - "8565": 3.48792, - "8570": 3.48883, - "8575": 3.46414, - "8580": 3.50837, - "8585": 3.52994, - "8590": 3.51956, - "8595": 3.52409, - "8600": 3.50319, - "8605": 3.49079, - "8610": 3.49584, - "8615": 3.49483, - "8620": 3.46525, - "8625": 3.4875, - "8630": 3.49269, - "8635": 3.47742, - "8640": 3.46288, - "8645": 3.52844, - "8650": 3.45936, - "8655": 3.50294, - "8660": 3.51093, - "8665": 3.48996, - "8670": 3.50547, - "8675": 3.47414, - "8680": 3.4685, - "8685": 3.48029, - "8690": 3.51264, - "8695": 3.51367, - "8700": 3.48324, - "8705": 3.45351, - "8710": 3.50031, - "8715": 3.45042, - "8720": 3.52876, - "8725": 3.48819, - "8730": 3.47981, - "8735": 3.51018, - "8740": 3.46013, - "8745": 3.50108, - "8750": 3.50543, - "8755": 3.46564, - "8760": 3.48373, - "8765": 3.43955, - "8770": 3.50951, - "8775": 3.47313, - 
"8780": 3.45782, - "8785": 3.47628, - "8790": 3.4608, - "8795": 3.49675, - "8800": 3.46402, - "8805": 3.43267, - "8810": 3.45044, - "8815": 3.47281, - "8820": 3.43586, - "8825": 3.46906, - "8830": 3.44494, - "8835": 3.42402, - "8840": 3.4361, - "8845": 3.45772, - "8850": 3.48143, - "8855": 3.46505, - "8860": 3.53187, - "8865": 3.46882, - "8870": 3.44869, - "8875": 3.45286, - "8880": 3.45584, - "8885": 3.44986, - "8890": 3.47298, - "8895": 3.45131, - "8900": 3.47879, - "8905": 3.46796, - "8910": 3.45421, - "8915": 3.44293, - "8920": 3.43345, - "8925": 3.50917, - "8930": 3.49052, - "8935": 3.50073, - "8940": 3.47584, - "8945": 3.47848, - "8950": 3.45717, - "8955": 3.44615, - "8960": 3.43965, - "8965": 3.45818, - "8970": 3.47179, - "8975": 3.42177, - "8980": 3.42266, - "8985": 3.44671, - "8990": 3.50075, - "8995": 3.47255, - "9000": 3.41954, - "9005": 3.46563, - "9010": 3.51573, - "9015": 3.4185, - "9020": 3.43896, - "9025": 3.44768, - "9030": 3.4718, - "9035": 3.37943, - "9040": 3.45501, - "9045": 3.45466, - "9050": 3.49179, - "9055": 3.40312, - "9060": 3.49477, - "9065": 3.51349, - "9070": 3.44713, - "9075": 3.47746, - "9080": 3.47127, - "9085": 3.47459, - "9090": 3.46668, - "9095": 3.42167, - "9100": 3.4227, - "9105": 3.41261, - "9110": 3.45663, - "9115": 3.46481, - "9120": 3.51949, - "9125": 3.44245, - "9130": 3.43654, - "9135": 3.46008, - "9140": 3.47929, - "9145": 3.42408, - "9150": 3.44307, - "9155": 3.45089, - "9160": 3.44998, - "9165": 3.45651, - "9170": 3.47508, - "9175": 3.41133, - "9180": 3.45323, - "9185": 3.41086, - "9190": 3.46875, - "9195": 3.43315, - "9200": 3.44758, - "9205": 3.42373, - "9210": 3.45572, - "9215": 3.39585, - "9220": 3.42327, - "9225": 3.44665, - "9230": 3.37357, - "9235": 3.39456, - "9240": 3.42282, - "9245": 3.40683, - "9250": 3.40791, - "9255": 3.42077, - "9260": 3.39755, - "9265": 3.44216, - "9270": 3.40754, - "9275": 3.42864, - "9280": 3.44334, - "9285": 3.44087, - "9290": 3.45563, - "9295": 3.44456, - "9300": 3.39522, - "9305": 
3.42638, - "9310": 3.41593, - "9315": 3.38278, - "9320": 3.3797, - "9325": 3.42046, - "9330": 3.47853, - "9335": 3.38962, - "9340": 3.4706, - "9345": 3.46224, - "9350": 3.42735, - "9355": 3.39326, - "9360": 3.4165, - "9365": 3.41212, - "9370": 3.46155, - "9375": 3.42622, - "9380": 3.36413, - "9385": 3.43469, - "9390": 3.44403, - "9395": 3.45465, - "9400": 3.41582, - "9405": 3.40031, - "9410": 3.43744, - "9415": 3.42574, - "9420": 3.40295, - "9425": 3.42063, - "9430": 3.3935, - "9435": 3.41529, - "9440": 3.40125, - "9445": 3.39961, - "9450": 3.39469, - "9455": 3.4008, - "9460": 3.46489, - "9465": 3.46303, - "9470": 3.40478, - "9475": 3.45335, - "9480": 3.40789, - "9485": 3.3998, - "9490": 3.41154, - "9495": 3.44387, - "9500": 3.40535, - "9505": 3.37735, - "9510": 3.41645, - "9515": 3.41113, - "9520": 3.43045, - "9525": 3.40102, - "9530": 3.40027, - "9535": 3.42216 - } - }, - "iteration-time": { - "start_step": 1, - "end_step": 9535, - "step_interval": 5, - "values": { - "1": 241.22832, - "5": 11.6467, - "10": 11.59177, - "15": 11.54982, - "20": 11.50554, - "25": 11.48401, - "30": 11.47019, - "35": 11.4638, - "40": 11.44621, - "45": 11.45505, - "50": 11.48551, - "55": 11.47505, - "60": 11.46559, - "65": 11.69276, - "70": 11.51491, - "75": 11.58841, - "80": 11.59402, - "85": 11.55505, - "90": 11.57827, - "95": 11.6084, - "100": 11.72328, - "105": 11.84735, - "110": 11.81445, - "115": 12.01469, - "120": 12.27052, - "125": 12.40894, - "130": 12.32306, - "135": 12.6537, - "140": 12.87941, - "145": 12.87274, - "150": 13.17646, - "155": 13.42132, - "160": 13.29203, - "165": 13.33468, - "170": 13.38365, - "175": 13.29143, - "180": 13.37704, - "185": 13.17491, - "190": 13.2207, - "195": 13.0407, - "200": 13.03378, - "205": 12.93499, - "210": 12.93302, - "215": 12.83429, - "220": 12.77504, - "225": 12.71437, - "230": 12.67462, - "235": 12.7241, - "240": 12.78341, - "245": 12.61372, - "250": 12.60968, - "255": 12.49502, - "260": 12.38655, - "265": 12.35372, - "270": 12.32939, 
- "275": 12.25213, - "280": 12.23412, - "285": 12.25047, - "290": 12.1386, - "295": 12.11066, - "300": 12.11487, - "305": 12.08746, - "310": 12.06842, - "315": 12.13334, - "320": 12.12044, - "325": 12.01351, - "330": 11.97276, - "335": 11.951, - "340": 11.97582, - "345": 11.94178, - "350": 11.90942, - "355": 11.9474, - "360": 11.94231, - "365": 11.91539, - "370": 11.89051, - "375": 11.87871, - "380": 11.8539, - "385": 11.81422, - "390": 11.82072, - "395": 11.85516, - "400": 11.8322, - "405": 11.81286, - "410": 11.81008, - "415": 11.76854, - "420": 11.7721, - "425": 11.7287, - "430": 11.80281, - "435": 11.76948, - "440": 11.78237, - "445": 11.81223, - "450": 11.76024, - "455": 11.83905, - "460": 11.86797, - "465": 11.88193, - "470": 11.94544, - "475": 12.03403, - "480": 11.8718, - "485": 11.96463, - "490": 11.9543, - "495": 11.99738, - "500": 12.06608, - "505": 12.04813, - "510": 12.09706, - "515": 12.14335, - "520": 12.36581, - "525": 12.19115, - "530": 12.1887, - "535": 12.25354, - "540": 12.27902, - "545": 12.32347, - "550": 12.44366, - "555": 12.25807, - "560": 12.22369, - "565": 12.28956, - "570": 12.31572, - "575": 12.28835, - "580": 12.33571, - "585": 12.26567, - "590": 12.30079, - "595": 12.29151, - "600": 12.30023, - "605": 12.45501, - "610": 12.27373, - "615": 12.217, - "620": 12.22334, - "625": 12.21274, - "630": 12.21904, - "635": 12.20277, - "640": 12.25538, - "645": 12.19988, - "650": 12.14026, - "655": 12.14302, - "660": 12.14678, - "665": 12.13972, - "670": 12.11485, - "675": 12.0282, - "680": 12.01901, - "685": 11.98462, - "690": 11.98742, - "695": 11.95917, - "700": 11.92521, - "705": 18.38779, - "710": 11.92438, - "715": 11.8274, - "720": 11.90138, - "725": 11.84998, - "730": 11.83009, - "735": 11.89248, - "740": 11.82364, - "745": 11.91839, - "750": 11.9577, - "755": 11.85056, - "760": 11.90523, - "765": 11.9116, - "770": 11.83717, - "775": 12.05864, - "780": 11.84895, - "785": 11.84375, - "790": 11.86493, - "795": 11.85763, - "800": 11.94365, - 
"805": 11.86899, - "810": 11.86748, - "815": 11.86393, - "820": 11.87992, - "825": 11.85259, - "830": 11.86886, - "835": 11.8517, - "840": 11.86254, - "845": 11.89508, - "850": 11.85613, - "855": 11.87434, - "860": 11.90703, - "865": 11.83224, - "870": 11.88246, - "875": 11.9305, - "880": 11.96022, - "885": 11.81651, - "890": 12.06642, - "895": 11.92653, - "900": 11.86469, - "905": 12.01767, - "910": 11.89635, - "915": 11.8254, - "920": 11.86106, - "925": 11.88434, - "930": 11.97059, - "935": 12.03718, - "940": 11.87698, - "945": 11.88008, - "950": 12.02071, - "955": 11.84843, - "960": 244.37245, - "965": 12.32084, - "970": 11.86341, - "975": 12.01988, - "980": 11.92166, - "985": 11.85411, - "990": 11.87753, - "995": 11.84786, - "1000": 11.89892, - "1005": 11.99759, - "1010": 11.91045, - "1015": 11.87038, - "1020": 11.85674, - "1025": 11.85567, - "1030": 11.86674, - "1035": 11.92499, - "1040": 11.85969, - "1045": 12.04929, - "1050": 11.82341, - "1055": 11.83111, - "1060": 11.87567, - "1065": 11.84584, - "1070": 11.93603, - "1075": 11.87121, - "1080": 11.85935, - "1085": 11.88667, - "1090": 11.86058, - "1095": 11.86482, - "1100": 11.82375, - "1105": 11.86482, - "1110": 11.89668, - "1115": 11.94941, - "1120": 11.84941, - "1125": 11.94466, - "1130": 11.90846, - "1135": 11.8602, - "1140": 11.86926, - "1145": 11.90365, - "1150": 11.88788, - "1155": 11.81781, - "1160": 11.88464, - "1165": 11.85344, - "1170": 11.8865, - "1175": 11.93361, - "1180": 11.89647, - "1185": 11.9031, - "1190": 11.89287, - "1195": 11.88683, - "1200": 11.85927, - "1205": 11.92471, - "1210": 11.85592, - "1215": 17.4276, - "1220": 11.87359, - "1225": 11.9296, - "1230": 11.95025, - "1235": 11.90738, - "1240": 11.86621, - "1245": 11.98001, - "1250": 12.003, - "1255": 11.91396, - "1260": 11.92279, - "1265": 11.85195, - "1270": 11.87463, - "1275": 11.90307, - "1280": 11.84637, - "1285": 11.95883, - "1290": 11.88039, - "1295": 11.8399, - "1300": 11.81976, - "1305": 11.89766, - "1310": 11.91584, - "1315": 
12.12571, - "1320": 12.05556, - "1325": 11.84679, - "1330": 11.94985, - "1335": 11.94039, - "1340": 12.00572, - "1345": 11.98268, - "1350": 12.15927, - "1355": 12.04312, - "1360": 11.98816, - "1365": 11.95737, - "1370": 11.92395, - "1375": 11.89595, - "1380": 11.88635, - "1385": 11.96617, - "1390": 11.87421, - "1395": 12.02833, - "1400": 11.87415, - "1405": 11.85875, - "1410": 11.85419, - "1415": 11.8978, - "1420": 11.86309, - "1425": 11.87505, - "1430": 12.10339, - "1435": 11.88151, - "1440": 12.15068, - "1445": 11.98493, - "1450": 11.95438, - "1455": 12.03808, - "1460": 11.85293, - "1465": 11.93176, - "1470": 11.92246, - "1475": 11.90448, - "1480": 11.98959, - "1485": 11.93685, - "1490": 11.92389, - "1495": 11.95047, - "1500": 11.94526, - "1505": 11.9086, - "1510": 11.95225, - "1515": 11.87405, - "1520": 11.87975, - "1525": 11.88264, - "1530": 12.04989, - "1535": 12.02942, - "1540": 11.93089, - "1545": 11.89376, - "1550": 11.88596, - "1555": 11.95001, - "1560": 11.90239, - "1565": 11.89699, - "1570": 11.91441, - "1575": 11.87813, - "1580": 11.86939, - "1585": 11.8566, - "1590": 11.8665, - "1595": 11.90861, - "1600": 11.90425, - "1605": 11.82248, - "1610": 11.86531, - "1615": 11.8796, - "1620": 11.87587, - "1625": 11.88944, - "1630": 11.88839, - "1635": 11.8307, - "1640": 11.87082, - "1645": 11.84687, - "1650": 11.87887, - "1655": 11.85709, - "1660": 11.85167, - "1665": 11.90284, - "1670": 11.85205, - "1675": 12.00742, - "1680": 11.90754, - "1685": 11.97458, - "1690": 11.97016, - "1695": 11.9189, - "1700": 11.89709, - "1705": 11.88042, - "1710": 11.87879, - "1715": 12.06779, - "1720": 11.98631, - "1725": 12.01044, - "1730": 11.9924, - "1735": 11.87648, - "1740": 11.87455, - "1745": 11.93461, - "1750": 11.90235, - "1755": 11.97053, - "1760": 11.89545, - "1765": 11.8564, - "1770": 11.92635, - "1775": 11.91815, - "1780": 11.91235, - "1785": 11.85546, - "1790": 11.93087, - "1795": 11.91138, - "1800": 11.95901, - "1805": 12.0529, - "1810": 11.98858, - "1815": 12.13997, 
- "1820": 11.94798, - "1825": 11.97682, - "1830": 11.91244, - "1835": 11.94888, - "1840": 11.93666, - "1845": 11.87312, - "1850": 11.86327, - "1855": 11.94769, - "1860": 12.00187, - "1865": 12.06916, - "1870": 11.99528, - "1875": 11.89416, - "1880": 12.02292, - "1885": 12.04249, - "1890": 11.94094, - "1895": 11.93619, - "1900": 11.95301, - "1905": 11.85793, - "1910": 11.96264, - "1915": 11.92826, - "1920": 11.94216, - "1925": 12.01307, - "1930": 11.98891, - "1935": 11.95834, - "1940": 11.92143, - "1945": 11.98459, - "1950": 16.97099, - "1955": 11.89147, - "1960": 11.94643, - "1965": 11.92486, - "1970": 11.91542, - "1975": 13.09741, - "1980": 12.02148, - "1985": 11.92812, - "1990": 12.01102, - "1995": 11.94891, - "2000": 12.06741, - "2005": 11.94166, - "2010": 11.95871, - "2015": 12.00042, - "2020": 11.99101, - "2025": 11.95463, - "2030": 12.36755, - "2035": 11.96199, - "2040": 11.97863, - "2045": 12.01033, - "2050": 12.0643, - "2055": 11.96928, - "2060": 11.98383, - "2065": 11.92648, - "2070": 11.92379, - "2075": 11.97669, - "2080": 11.95508, - "2085": 11.94472, - "2090": 11.9663, - "2095": 11.93695, - "2100": 11.97178, - "2105": 11.98764, - "2110": 11.9516, - "2115": 11.9215, - "2120": 11.95207, - "2125": 11.95947, - "2130": 11.96722, - "2135": 11.97924, - "2140": 11.88777, - "2145": 11.95546, - "2150": 11.90266, - "2155": 11.97573, - "2160": 11.93275, - "2165": 11.98593, - "2170": 11.9842, - "2175": 12.00145, - "2180": 11.99219, - "2185": 11.96424, - "2190": 11.94313, - "2195": 11.93489, - "2200": 11.94356, - "2205": 12.00157, - "2210": 11.97153, - "2215": 11.9563, - "2220": 12.14117, - "2225": 11.97066, - "2230": 12.00037, - "2235": 11.95279, - "2240": 11.9544, - "2245": 11.97031, - "2250": 11.92229, - "2255": 11.98097, - "2260": 11.96529, - "2265": 11.98619, - "2270": 12.02117, - "2275": 11.94865, - "2280": 12.02569, - "2285": 11.98203, - "2290": 12.10479, - "2295": 11.95346, - "2300": 11.99961, - "2305": 11.96025, - "2310": 11.98746, - "2315": 11.95209, - 
"2320": 12.02644, - "2325": 11.95369, - "2330": 11.91985, - "2335": 11.93244, - "2340": 11.97061, - "2345": 11.90115, - "2350": 11.99136, - "2355": 12.0541, - "2360": 12.03728, - "2365": 11.95319, - "2370": 11.8917, - "2375": 11.94629, - "2380": 11.9087, - "2385": 11.91696, - "2390": 11.90123, - "2395": 11.87998, - "2400": 12.02954, - "2405": 11.97917, - "2410": 11.98456, - "2415": 11.9575, - "2420": 11.95917, - "2425": 11.95788, - "2430": 11.99944, - "2435": 12.00043, - "2440": 11.91339, - "2445": 11.97889, - "2450": 11.93997, - "2455": 11.91834, - "2460": 11.98321, - "2465": 11.94509, - "2470": 11.93387, - "2475": 11.9562, - "2480": 11.93148, - "2485": 11.94432, - "2490": 11.95477, - "2495": 11.94334, - "2500": 11.9284, - "2505": 11.93757, - "2510": 11.92289, - "2515": 11.97869, - "2520": 11.94858, - "2525": 11.96606, - "2530": 11.90894, - "2535": 11.95425, - "2540": 11.89136, - "2545": 11.94553, - "2550": 11.98026, - "2555": 11.93376, - "2560": 11.94866, - "2565": 11.92767, - "2570": 11.93583, - "2575": 11.97284, - "2580": 11.98911, - "2585": 11.95484, - "2590": 11.96399, - "2595": 11.96211, - "2600": 11.93906, - "2605": 11.9733, - "2610": 12.01872, - "2615": 11.99897, - "2620": 11.90926, - "2625": 11.93248, - "2630": 11.92842, - "2635": 11.94338, - "2640": 11.94678, - "2645": 11.95901, - "2650": 11.9296, - "2655": 12.02405, - "2660": 12.0166, - "2665": 12.01166, - "2670": 11.90595, - "2675": 11.98569, - "2680": 12.0118, - "2685": 11.92029, - "2690": 11.93111, - "2695": 12.00369, - "2700": 11.94818, - "2705": 11.99119, - "2710": 11.93978, - "2715": 11.9296, - "2720": 11.93044, - "2725": 11.94343, - "2730": 12.02248, - "2735": 11.95389, - "2740": 11.94611, - "2745": 11.92776, - "2750": 11.91647, - "2755": 11.9522, - "2760": 11.95012, - "2765": 11.96707, - "2770": 11.94892, - "2775": 11.9867, - "2780": 11.96897, - "2785": 11.97268, - "2790": 12.01936, - "2795": 11.97259, - "2800": 12.01028, - "2805": 11.94892, - "2810": 12.04828, - "2815": 11.93469, - "2820": 
11.94568, - "2825": 11.92529, - "2830": 11.97458, - "2835": 11.99475, - "2840": 11.94984, - "2845": 11.93356, - "2850": 12.05796, - "2855": 11.99065, - "2860": 11.96077, - "2865": 11.9377, - "2870": 11.97627, - "2875": 11.97986, - "2880": 11.97201, - "2885": 11.91879, - "2890": 11.93586, - "2895": 12.00661, - "2900": 11.94616, - "2905": 11.94376, - "2910": 11.94168, - "2915": 11.94867, - "2920": 11.99355, - "2925": 11.94779, - "2930": 11.97133, - "2935": 11.96256, - "2940": 11.97787, - "2945": 11.93759, - "2950": 11.91863, - "2955": 11.98973, - "2960": 12.00486, - "2965": 11.91623, - "2970": 11.94846, - "2975": 11.91534, - "2980": 11.97787, - "2985": 12.385, - "2990": 11.88498, - "2995": 11.92173, - "3000": 11.90561, - "3005": 11.86795, - "3010": 11.88075, - "3015": 11.87833, - "3020": 11.98777, - "3025": 11.90078, - "3030": 11.98251, - "3035": 11.92211, - "3040": 11.91067, - "3045": 12.04371, - "3050": 11.91886, - "3055": 11.952, - "3060": 11.90649, - "3065": 11.86917, - "3070": 11.86601, - "3075": 11.92435, - "3080": 11.98092, - "3085": 11.94809, - "3090": 12.20304, - "3095": 11.87329, - "3100": 11.92696, - "3105": 11.85799, - "3110": 11.84125, - "3115": 11.82558, - "3120": 11.87566, - "3125": 11.89426, - "3130": 11.85869, - "3135": 11.92893, - "3140": 11.97022, - "3145": 11.84939, - "3150": 11.9785, - "3155": 11.92499, - "3160": 11.8889, - "3165": 11.87938, - "3170": 11.95555, - "3175": 11.91883, - "3180": 11.85842, - "3185": 11.9325, - "3190": 11.86061, - "3195": 11.90479, - "3200": 11.85963, - "3205": 11.91214, - "3210": 11.9243, - "3215": 11.8472, - "3220": 11.86665, - "3225": 11.89836, - "3230": 11.86299, - "3235": 11.89396, - "3240": 11.87482, - "3245": 11.86774, - "3250": 11.86673, - "3255": 11.88133, - "3260": 11.9014, - "3265": 11.92289, - "3270": 11.98401, - "3275": 11.95198, - "3280": 11.87392, - "3285": 11.89268, - "3290": 11.88963, - "3295": 11.91043, - "3300": 11.89803, - "3305": 11.87011, - "3310": 11.84465, - "3315": 11.84015, - "3320": 11.88334, 
- "3325": 11.93368, - "3330": 11.83472, - "3335": 11.86862, - "3340": 11.87575, - "3345": 11.94875, - "3350": 11.93528, - "3355": 11.81967, - "3360": 11.95954, - "3365": 11.88024, - "3370": 11.88333, - "3375": 11.85751, - "3380": 11.88742, - "3385": 11.9179, - "3390": 11.83242, - "3395": 11.96084, - "3400": 11.88213, - "3405": 11.86112, - "3410": 11.8407, - "3415": 11.92255, - "3420": 11.91997, - "3425": 11.88372, - "3430": 11.8672, - "3435": 11.85235, - "3440": 11.84935, - "3445": 11.93228, - "3450": 11.85166, - "3455": 11.9026, - "3460": 11.99596, - "3465": 11.88838, - "3470": 11.90065, - "3475": 11.92033, - "3480": 11.87265, - "3485": 11.89235, - "3490": 11.89267, - "3495": 11.97544, - "3500": 11.92819, - "3505": 11.82459, - "3510": 11.90756, - "3515": 11.92021, - "3520": 11.88124, - "3525": 11.86983, - "3530": 11.90548, - "3535": 11.94666, - "3540": 11.93322, - "3545": 11.90904, - "3550": 11.85224, - "3555": 11.886, - "3560": 11.93583, - "3565": 11.87294, - "3570": 11.86107, - "3575": 11.83618, - "3580": 11.94649, - "3585": 11.8886, - "3590": 12.01796, - "3595": 11.86065, - "3600": 11.96008, - "3605": 11.94154, - "3610": 11.91928, - "3615": 11.88551, - "3620": 11.8865, - "3625": 11.86807, - "3630": 11.98152, - "3635": 11.87685, - "3640": 11.89995, - "3645": 11.86485, - "3650": 11.94291, - "3655": 11.86472, - "3660": 11.84946, - "3665": 11.90789, - "3670": 11.86396, - "3675": 12.07226, - "3680": 11.8654, - "3685": 11.90154, - "3690": 11.87282, - "3695": 11.84993, - "3700": 11.92847, - "3705": 11.85848, - "3710": 11.86691, - "3715": 11.93176, - "3720": 11.86996, - "3725": 11.92665, - "3730": 11.90876, - "3735": 11.83597, - "3740": 11.8819, - "3745": 11.90119, - "3750": 11.90765, - "3755": 11.89791, - "3760": 11.91124, - "3765": 11.95606, - "3770": 11.93789, - "3775": 11.87152, - "3780": 11.89754, - "3785": 11.8704, - "3790": 11.88079, - "3795": 11.89363, - "3800": 11.88641, - "3805": 11.87724, - "3810": 11.86303, - "3815": 11.96793, - "3820": 11.97071, - "3825": 
11.90678, - "3830": 11.84478, - "3835": 11.86339, - "3840": 11.84359, - "3845": 11.85381, - "3850": 11.89843, - "3855": 11.83659, - "3860": 11.8253, - "3865": 11.82796, - "3870": 11.93815, - "3875": 11.87584, - "3880": 11.85716, - "3885": 11.85848, - "3890": 11.84472, - "3895": 11.85001, - "3900": 11.90416, - "3905": 11.87723, - "3910": 11.90409, - "3915": 11.88375, - "3920": 11.9526, - "3925": 11.8796, - "3930": 11.92607, - "3935": 12.02111, - "3940": 11.89989, - "3945": 11.96829, - "3950": 11.92362, - "3955": 11.91298, - "3960": 11.93391, - "3965": 11.9977, - "3970": 11.91134, - "3975": 11.87698, - "3980": 11.84039, - "3985": 11.8296, - "3990": 11.8824, - "3995": 12.03103, - "4000": 12.53061, - "4005": 11.99032, - "4010": 11.94569, - "4015": 12.02459, - "4020": 12.05098, - "4025": 11.9408, - "4030": 11.9872, - "4035": 11.91882, - "4040": 11.91053, - "4045": 11.94764, - "4050": 11.96252, - "4055": 11.92924, - "4060": 11.95584, - "4065": 11.96477, - "4070": 11.95333, - "4075": 11.95009, - "4080": 11.94196, - "4085": 11.96679, - "4090": 12.09863, - "4095": 12.09521, - "4100": 11.99854, - "4105": 12.05345, - "4110": 11.99127, - "4115": 12.05731, - "4120": 11.95072, - "4125": 12.09249, - "4130": 12.04972, - "4135": 11.892, - "4140": 11.93048, - "4145": 11.92862, - "4150": 12.00088, - "4155": 11.95542, - "4160": 12.01499, - "4165": 11.90691, - "4170": 11.99204, - "4175": 12.02661, - "4180": 12.08762, - "4185": 11.93626, - "4190": 11.96513, - "4195": 11.9247, - "4200": 11.89449, - "4205": 11.95353, - "4210": 11.90984, - "4215": 11.92857, - "4220": 11.99809, - "4225": 12.01358, - "4230": 12.00065, - "4235": 11.95146, - "4240": 12.12674, - "4245": 11.99718, - "4250": 11.98808, - "4255": 11.95388, - "4260": 11.91437, - "4265": 11.97358, - "4270": 11.99013, - "4275": 11.95746, - "4280": 11.9273, - "4285": 11.92873, - "4290": 11.94103, - "4295": 11.93054, - "4300": 11.92986, - "4305": 12.11627, - "4310": 11.95471, - "4315": 11.96985, - "4320": 12.03911, - "4325": 12.01041, - 
"4330": 11.93084, - "4335": 11.95171, - "4340": 12.03209, - "4345": 11.94503, - "4350": 11.95426, - "4355": 12.08714, - "4360": 12.18212, - "4365": 11.94575, - "4370": 11.96598, - "4375": 12.00939, - "4380": 12.08808, - "4385": 11.9772, - "4390": 12.02704, - "4395": 12.01062, - "4400": 11.94619, - "4405": 11.98609, - "4410": 11.98025, - "4415": 11.99156, - "4420": 11.96913, - "4425": 12.02991, - "4430": 11.98417, - "4435": 12.07654, - "4440": 12.09429, - "4445": 11.9962, - "4450": 11.91032, - "4455": 11.99724, - "4460": 11.94549, - "4465": 11.92313, - "4470": 11.98709, - "4475": 11.9946, - "4480": 12.041, - "4485": 11.98684, - "4490": 12.00793, - "4495": 11.96519, - "4500": 11.91768, - "4505": 11.93855, - "4510": 11.96344, - "4515": 11.93266, - "4520": 11.99772, - "4525": 12.00265, - "4530": 12.00144, - "4535": 11.93099, - "4540": 11.9976, - "4545": 12.04415, - "4550": 11.92104, - "4555": 11.97762, - "4560": 12.05513, - "4565": 12.08413, - "4570": 12.00561, - "4575": 12.03402, - "4580": 12.07435, - "4585": 11.91157, - "4590": 11.93266, - "4595": 12.00575, - "4600": 11.98764, - "4605": 12.07608, - "4610": 11.98608, - "4615": 12.23058, - "4620": 11.96992, - "4625": 11.98931, - "4630": 11.92725, - "4635": 11.94909, - "4640": 11.94336, - "4645": 11.95955, - "4650": 11.99978, - "4655": 11.95199, - "4660": 11.97643, - "4665": 12.03686, - "4670": 12.0499, - "4675": 11.98439, - "4680": 12.00394, - "4685": 11.97515, - "4690": 11.95102, - "4695": 12.07552, - "4700": 11.9222, - "4705": 11.97387, - "4710": 11.99203, - "4715": 11.93004, - "4720": 11.97237, - "4725": 12.00277, - "4730": 12.00835, - "4735": 11.97435, - "4740": 11.98233, - "4745": 11.92423, - "4750": 11.95154, - "4755": 12.02084, - "4760": 11.94378, - "4765": 11.95313, - "4770": 11.92338, - "4775": 11.92352, - "4780": 12.00277, - "4785": 11.94768, - "4790": 11.97296, - "4795": 11.98757, - "4800": 12.26361, - "4805": 11.90736, - "4810": 11.9844, - "4815": 12.04212, - "4820": 11.98762, - "4825": 12.89959, - "4830": 
11.9442, - "4835": 12.35106, - "4840": 11.93828, - "4845": 11.92418, - "4850": 11.96443, - "4855": 12.03431, - "4860": 12.04422, - "4865": 11.9646, - "4870": 11.91857, - "4875": 11.95672, - "4880": 11.9198, - "4885": 11.96783, - "4890": 11.94953, - "4895": 11.96692, - "4900": 12.04475, - "4905": 12.05877, - "4910": 12.15039, - "4915": 12.15039, - "4920": 11.95008, - "4925": 11.96843, - "4930": 11.958, - "4935": 11.98531, - "4940": 11.90874, - "4945": 11.95752, - "4950": 12.01284, - "4955": 11.97799, - "4960": 11.99989, - "4965": 11.9277, - "4970": 12.06095, - "4975": 11.95713, - "4980": 12.02719, - "4985": 11.96446, - "4990": 11.92043, - "4995": 11.99522, - "5000": 12.0792, - "5005": 11.95462, - "5010": 18.30939, - "5015": 12.57034, - "5020": 12.13652, - "5025": 11.95064, - "5030": 11.93538, - "5035": 12.01779, - "5040": 11.8639, - "5045": 11.89312, - "5050": 11.93054, - "5055": 11.89904, - "5060": 11.88635, - "5065": 11.89505, - "5070": 11.95957, - "5075": 11.96591, - "5080": 11.85594, - "5085": 11.87343, - "5090": 11.89162, - "5095": 11.9231, - "5100": 11.9213, - "5105": 11.9793, - "5110": 11.92942, - "5115": 11.87025, - "5120": 11.84167, - "5125": 11.92967, - "5130": 11.90523, - "5135": 11.8727, - "5140": 11.95822, - "5145": 11.97795, - "5150": 11.90614, - "5155": 11.88276, - "5160": 11.94188, - "5165": 11.91373, - "5170": 12.01192, - "5175": 11.85511, - "5180": 11.84375, - "5185": 11.88965, - "5190": 11.88542, - "5195": 11.85346, - "5200": 11.94188, - "5205": 11.92082, - "5210": 11.8821, - "5215": 11.92239, - "5220": 11.90608, - "5225": 11.8947, - "5230": 11.88619, - "5235": 11.8948, - "5240": 11.89599, - "5245": 11.88662, - "5250": 11.95415, - "5255": 11.96527, - "5260": 11.89009, - "5265": 11.87997, - "5270": 11.94016, - "5275": 11.89138, - "5280": 11.90447, - "5285": 11.86453, - "5290": 11.90845, - "5295": 11.89373, - "5300": 11.96084, - "5305": 12.00505, - "5310": 11.87874, - "5315": 11.94047, - "5320": 11.90115, - "5325": 11.8657, - "5330": 11.98456, - 
"5335": 11.89142, - "5340": 11.94056, - "5345": 11.88326, - "5350": 12.02941, - "5355": 11.94937, - "5360": 11.84158, - "5365": 11.85236, - "5370": 11.89414, - "5375": 11.92681, - "5380": 11.89983, - "5385": 11.93247, - "5390": 11.88545, - "5395": 11.85963, - "5400": 11.87187, - "5405": 11.92558, - "5410": 11.94364, - "5415": 11.9087, - "5420": 11.86332, - "5425": 11.92767, - "5430": 11.87425, - "5435": 11.91049, - "5440": 11.87699, - "5445": 11.93171, - "5450": 11.90161, - "5455": 11.921, - "5460": 11.88038, - "5465": 11.91315, - "5470": 11.89728, - "5475": 11.95689, - "5480": 11.98965, - "5485": 11.91576, - "5490": 11.89757, - "5495": 11.93064, - "5500": 11.88252, - "5505": 11.96073, - "5510": 11.86654, - "5515": 11.87886, - "5520": 11.90936, - "5525": 12.03373, - "5530": 11.90318, - "5535": 11.92154, - "5540": 11.90086, - "5545": 11.89022, - "5550": 11.90225, - "5555": 11.83513, - "5560": 11.91062, - "5565": 11.87125, - "5570": 11.87145, - "5575": 11.86357, - "5580": 11.91841, - "5585": 11.92436, - "5590": 11.9023, - "5595": 11.86709, - "5600": 11.91375, - "5605": 11.90872, - "5610": 11.8916, - "5615": 11.95578, - "5620": 11.89294, - "5625": 11.90784, - "5630": 11.92391, - "5635": 11.89956, - "5640": 11.89869, - "5645": 11.91776, - "5650": 11.9431, - "5655": 11.89517, - "5660": 11.88968, - "5665": 11.89529, - "5670": 11.91051, - "5675": 11.91888, - "5680": 11.90991, - "5685": 11.93985, - "5690": 11.90708, - "5695": 11.8876, - "5700": 11.95923, - "5705": 11.93355, - "5710": 11.87364, - "5715": 11.9268, - "5720": 11.98226, - "5725": 11.87678, - "5730": 11.83368, - "5735": 11.89468, - "5740": 11.90674, - "5745": 11.88476, - "5750": 11.86646, - "5755": 11.88929, - "5760": 11.85649, - "5765": 11.85565, - "5770": 11.93646, - "5775": 11.90704, - "5780": 12.04897, - "5785": 11.91885, - "5790": 11.90414, - "5795": 11.92795, - "5800": 11.9484, - "5805": 11.9947, - "5810": 11.88562, - "5815": 11.89893, - "5820": 11.86069, - "5825": 11.85602, - "5830": 11.90577, - "5835": 
11.90369, - "5840": 11.95291, - "5845": 11.93547, - "5850": 11.89776, - "5855": 11.89365, - "5860": 11.88809, - "5865": 11.89502, - "5870": 11.90093, - "5875": 11.89463, - "5880": 11.85877, - "5885": 11.91775, - "5890": 11.9362, - "5895": 11.90238, - "5900": 11.89416, - "5905": 11.9161, - "5910": 11.91617, - "5915": 11.89704, - "5920": 11.86193, - "5925": 11.94942, - "5930": 11.85147, - "5935": 11.87033, - "5940": 11.9311, - "5945": 11.96348, - "5950": 11.96932, - "5955": 11.90137, - "5960": 11.87563, - "5965": 11.86128, - "5970": 11.99512, - "5975": 11.92846, - "5980": 11.83738, - "5985": 11.88075, - "5990": 11.89265, - "5995": 11.92537, - "6000": 11.88009, - "6005": 11.9523, - "6010": 11.93509, - "6015": 11.89766, - "6020": 11.88045, - "6025": 11.87641, - "6030": 246.60413, - "6035": 12.33879, - "6040": 11.91607, - "6045": 11.95709, - "6050": 11.93381, - "6055": 11.91355, - "6060": 11.91286, - "6065": 11.97819, - "6070": 11.93373, - "6075": 11.85049, - "6080": 11.96747, - "6085": 11.93318, - "6090": 11.93239, - "6095": 11.8622, - "6100": 11.88525, - "6105": 11.97899, - "6110": 11.91577, - "6115": 11.92755, - "6120": 11.92296, - "6125": 11.99725, - "6130": 11.97753, - "6135": 11.92108, - "6140": 11.91607, - "6145": 11.9071, - "6150": 11.92499, - "6155": 11.91611, - "6160": 12.01604, - "6165": 11.89838, - "6170": 11.90254, - "6175": 11.96493, - "6180": 11.84452, - "6185": 11.91052, - "6190": 11.8712, - "6195": 11.90582, - "6200": 11.90605, - "6205": 11.98397, - "6210": 11.92035, - "6215": 11.96579, - "6220": 11.99275, - "6225": 11.88749, - "6230": 11.89369, - "6235": 11.95748, - "6240": 11.93057, - "6245": 11.94912, - "6250": 11.9372, - "6255": 11.90439, - "6260": 11.92527, - "6265": 11.95201, - "6270": 11.9095, - "6275": 11.97821, - "6280": 11.94458, - "6285": 11.90287, - "6290": 11.89278, - "6295": 11.96073, - "6300": 11.90554, - "6305": 11.88653, - "6310": 11.8962, - "6315": 11.93036, - "6320": 11.95396, - "6325": 11.94894, - "6330": 12.04569, - "6335": 
11.88055, - "6340": 11.91066, - "6345": 11.89024, - "6350": 11.89994, - "6355": 11.92221, - "6360": 11.92333, - "6365": 11.91761, - "6370": 11.97313, - "6375": 11.90689, - "6380": 12.08922, - "6385": 11.94942, - "6390": 11.91702, - "6395": 11.90139, - "6400": 11.89012, - "6405": 11.9541, - "6410": 12.00044, - "6415": 11.89967, - "6420": 11.86695, - "6425": 11.87294, - "6430": 11.89524, - "6435": 11.94881, - "6440": 11.91361, - "6445": 11.91243, - "6450": 11.90246, - "6455": 11.88301, - "6460": 11.94133, - "6465": 11.95353, - "6470": 11.93545, - "6475": 11.91767, - "6480": 11.904, - "6485": 11.97366, - "6490": 11.9268, - "6495": 11.92497, - "6500": 12.05293, - "6505": 11.83715, - "6510": 11.86732, - "6515": 11.90038, - "6520": 11.86776, - "6525": 11.86971, - "6530": 11.85789, - "6535": 11.88616, - "6540": 11.85825, - "6545": 11.82803, - "6550": 11.89596, - "6555": 11.89246, - "6560": 11.87827, - "6565": 11.87369, - "6570": 11.88103, - "6575": 11.86696, - "6580": 11.90165, - "6585": 11.85113, - "6590": 11.85101, - "6595": 11.80896, - "6600": 11.90596, - "6605": 11.87406, - "6610": 11.8658, - "6615": 11.86475, - "6620": 11.88848, - "6625": 11.85675, - "6630": 11.84722, - "6635": 11.83752, - "6640": 11.8855, - "6645": 11.91332, - "6650": 11.86288, - "6655": 11.89588, - "6660": 11.8071, - "6665": 11.84093, - "6670": 11.88653, - "6675": 11.88047, - "6680": 11.87018, - "6685": 11.8411, - "6690": 11.82244, - "6695": 11.86596, - "6700": 11.85423, - "6705": 11.86228, - "6710": 11.86517, - "6715": 11.87189, - "6720": 11.84138, - "6725": 11.88097, - "6730": 11.90906, - "6735": 11.91578, - "6740": 11.88058, - "6745": 11.88169, - "6750": 12.03575, - "6755": 11.84511, - "6760": 11.84038, - "6765": 11.83499, - "6770": 11.87927, - "6775": 11.81349, - "6780": 13.01048, - "6785": 11.81032, - "6790": 11.93614, - "6795": 11.97801, - "6800": 11.86, - "6805": 11.83039, - "6810": 11.8441, - "6815": 11.89187, - "6820": 11.87841, - "6825": 11.86012, - "6830": 11.83442, - "6835": 11.85081, - 
"6840": 11.83799, - "6845": 11.82691, - "6850": 11.89092, - "6855": 11.82022, - "6860": 11.8279, - "6865": 11.79814, - "6870": 11.83217, - "6875": 11.90136, - "6880": 11.85295, - "6885": 11.84058, - "6890": 11.84482, - "6895": 11.82768, - "6900": 11.88337, - "6905": 11.84656, - "6910": 11.90272, - "6915": 11.8005, - "6920": 11.93804, - "6925": 12.00166, - "6930": 11.88293, - "6935": 11.9479, - "6940": 11.85228, - "6945": 11.86242, - "6950": 11.83582, - "6955": 11.81523, - "6960": 11.75894, - "6965": 11.81699, - "6970": 11.85282, - "6975": 11.84727, - "6980": 11.84729, - "6985": 12.01189, - "6990": 11.86887, - "6995": 11.88713, - "7000": 11.85612, - "7005": 11.86648, - "7010": 11.8888, - "7015": 11.84573, - "7020": 11.77395, - "7025": 11.85096, - "7030": 11.86323, - "7035": 11.84315, - "7040": 11.82293, - "7045": 11.81241, - "7050": 11.85808, - "7055": 11.86593, - "7060": 11.87475, - "7065": 11.90707, - "7070": 11.9358, - "7075": 11.84297, - "7080": 11.80853, - "7085": 11.88178, - "7090": 11.87836, - "7095": 11.85532, - "7100": 11.89414, - "7105": 11.85379, - "7110": 11.89642, - "7115": 11.85858, - "7120": 11.90327, - "7125": 11.89711, - "7130": 11.89177, - "7135": 11.88659, - "7140": 11.85757, - "7145": 11.87756, - "7150": 11.88577, - "7155": 11.86153, - "7160": 11.92297, - "7165": 11.88396, - "7170": 11.85778, - "7175": 11.91483, - "7180": 11.86232, - "7185": 11.87476, - "7190": 11.8982, - "7195": 11.88516, - "7200": 11.88158, - "7205": 11.88444, - "7210": 11.89206, - "7215": 11.87279, - "7220": 11.90742, - "7225": 11.85079, - "7230": 11.8483, - "7235": 11.90312, - "7240": 11.87181, - "7245": 11.91535, - "7250": 11.87908, - "7255": 11.92293, - "7260": 11.84549, - "7265": 11.8901, - "7270": 11.84322, - "7275": 11.848, - "7280": 11.8967, - "7285": 11.89986, - "7290": 11.95382, - "7295": 11.90753, - "7300": 11.86218, - "7305": 11.85436, - "7310": 11.85753, - "7315": 11.9134, - "7320": 11.90034, - "7325": 11.83407, - "7330": 11.85974, - "7335": 11.90032, - "7340": 
11.88835, - "7345": 11.88443, - "7350": 11.85147, - "7355": 11.86003, - "7360": 11.88911, - "7365": 11.88721, - "7370": 11.94597, - "7375": 11.88507, - "7380": 11.8675, - "7385": 11.88615, - "7390": 11.85493, - "7395": 11.9078, - "7400": 11.89976, - "7405": 11.94755, - "7410": 11.86216, - "7415": 11.81832, - "7420": 11.89699, - "7425": 11.90201, - "7430": 11.88324, - "7435": 11.84242, - "7440": 11.89387, - "7445": 11.85554, - "7450": 11.927, - "7455": 11.89196, - "7460": 11.93241, - "7465": 11.89671, - "7470": 11.8633, - "7475": 11.85785, - "7480": 11.86619, - "7485": 11.90047, - "7490": 11.93453, - "7495": 11.89595, - "7500": 11.92255, - "7505": 11.86705, - "7510": 11.86492, - "7515": 11.83778, - "7520": 12.43308, - "7525": 11.94046, - "7530": 12.11911, - "7535": 11.95645, - "7540": 12.01144, - "7545": 11.94459, - "7550": 12.00989, - "7555": 11.95308, - "7560": 12.02894, - "7565": 12.00926, - "7570": 11.88032, - "7575": 11.94986, - "7580": 11.94673, - "7585": 11.92777, - "7590": 11.96311, - "7595": 11.90291, - "7600": 11.96776, - "7605": 11.91009, - "7610": 11.98945, - "7615": 11.943, - "7620": 11.97203, - "7625": 11.87696, - "7630": 11.92313, - "7635": 11.9056, - "7640": 11.89922, - "7645": 11.93063, - "7650": 11.89735, - "7655": 11.93078, - "7660": 11.95494, - "7665": 11.91011, - "7670": 11.97093, - "7675": 11.97514, - "7680": 11.93177, - "7685": 11.8992, - "7690": 11.94571, - "7695": 11.92277, - "7700": 11.94906, - "7705": 11.92727, - "7710": 11.93604, - "7715": 11.92305, - "7720": 11.93766, - "7725": 11.95622, - "7730": 11.90603, - "7735": 11.91132, - "7740": 11.97695, - "7745": 11.96601, - "7750": 11.88967, - "7755": 11.93644, - "7760": 11.96688, - "7765": 11.92672, - "7770": 23.39259, - "7775": 23.06567, - "7780": 11.93112, - "7785": 11.93477, - "7790": 11.94106, - "7795": 11.94556, - "7800": 12.0002, - "7805": 11.97342, - "7810": 11.95163, - "7815": 11.96208, - "7820": 11.96513, - "7825": 11.93368, - "7830": 11.91708, - "7835": 11.89017, - "7840": 11.94549, 
- "7845": 11.96002, - "7850": 11.95829, - "7855": 11.92186, - "7860": 11.93832, - "7865": 11.889, - "7870": 11.96191, - "7875": 12.05703, - "7880": 11.97288, - "7885": 11.91666, - "7890": 11.93728, - "7895": 11.96047, - "7900": 11.9818, - "7905": 11.92242, - "7910": 11.97684, - "7915": 11.91154, - "7920": 11.96828, - "7925": 11.94506, - "7930": 11.93465, - "7935": 11.90216, - "7940": 11.91383, - "7945": 11.91481, - "7950": 11.96693, - "7955": 11.94446, - "7960": 11.92358, - "7965": 11.94155, - "7970": 11.95822, - "7975": 12.03469, - "7980": 11.94102, - "7985": 11.94681, - "7990": 11.92459, - "7995": 11.92763, - "8000": 11.96299, - "8005": 11.9788, - "8010": 11.96826, - "8015": 12.02982, - "8020": 11.94329, - "8025": 11.98105, - "8030": 12.01501, - "8035": 11.96502, - "8040": 11.97586, - "8045": 11.96948, - "8050": 11.92611, - "8055": 11.93414, - "8060": 11.93961, - "8065": 11.9262, - "8070": 11.9178, - "8075": 11.90325, - "8080": 11.93833, - "8085": 11.97936, - "8090": 11.99724, - "8095": 11.94796, - "8100": 11.9625, - "8105": 11.94798, - "8110": 11.92353, - "8115": 11.96357, - "8120": 11.92451, - "8125": 11.89352, - "8130": 11.97563, - "8135": 11.97236, - "8140": 11.9723, - "8145": 11.92641, - "8150": 11.89834, - "8155": 11.94876, - "8160": 11.95465, - "8165": 11.95874, - "8170": 11.93402, - "8175": 11.96745, - "8180": 11.91172, - "8185": 11.91331, - "8190": 11.95504, - "8195": 11.94346, - "8200": 11.95192, - "8205": 11.9973, - "8210": 11.95023, - "8215": 12.03521, - "8220": 11.96486, - "8225": 11.95464, - "8230": 11.96151, - "8235": 11.95994, - "8240": 11.97909, - "8245": 11.92928, - "8250": 11.92518, - "8255": 11.94881, - "8260": 11.907, - "8265": 11.93185, - "8270": 11.9211, - "8275": 11.86366, - "8280": 12.00914, - "8285": 11.97086, - "8290": 11.98208, - "8295": 11.92309, - "8300": 11.94129, - "8305": 11.99302, - "8310": 11.97601, - "8315": 11.88862, - "8320": 11.96454, - "8325": 11.89961, - "8330": 11.99534, - "8335": 11.91687, - "8340": 11.96466, - "8345": 
11.93152, - "8350": 11.94368, - "8355": 11.92235, - "8360": 11.99578, - "8365": 11.90045, - "8370": 11.91744, - "8375": 11.92667, - "8380": 11.90428, - "8385": 11.94828, - "8390": 11.93507, - "8395": 11.9473, - "8400": 11.94267, - "8405": 11.93414, - "8410": 11.90959, - "8415": 11.92941, - "8420": 11.91201, - "8425": 11.91625, - "8430": 11.9332, - "8435": 11.99456, - "8440": 11.8869, - "8445": 11.90729, - "8450": 11.93362, - "8455": 11.96619, - "8460": 12.01359, - "8465": 11.9429, - "8470": 11.99594, - "8475": 11.95465, - "8480": 11.92489, - "8485": 11.92415, - "8490": 11.97388, - "8495": 11.89913, - "8500": 11.95945, - "8505": 11.91567, - "8510": 11.91482, - "8515": 11.93548, - "8520": 11.95743, - "8525": 11.94743, - "8530": 12.42097, - "8535": 11.9272, - "8540": 12.09436, - "8545": 12.04967, - "8550": 11.9651, - "8555": 12.03857, - "8560": 11.97265, - "8565": 11.91082, - "8570": 11.95406, - "8575": 11.94802, - "8580": 11.9942, - "8585": 11.96288, - "8590": 11.95701, - "8595": 11.97786, - "8600": 11.89715, - "8605": 11.93644, - "8610": 11.98611, - "8615": 11.91557, - "8620": 11.92076, - "8625": 11.96113, - "8630": 11.99266, - "8635": 11.93916, - "8640": 12.02781, - "8645": 11.99006, - "8650": 11.91164, - "8655": 11.91924, - "8660": 11.95194, - "8665": 12.00021, - "8670": 11.90972, - "8675": 11.96086, - "8680": 11.95175, - "8685": 11.95495, - "8690": 12.00198, - "8695": 12.07659, - "8700": 11.96371, - "8705": 11.91845, - "8710": 11.97745, - "8715": 11.93805, - "8720": 11.9173, - "8725": 11.91035, - "8730": 12.01393, - "8735": 11.98447, - "8740": 11.97475, - "8745": 11.96291, - "8750": 11.9361, - "8755": 11.96838, - "8760": 11.93695, - "8765": 12.00162, - "8770": 11.92599, - "8775": 12.0012, - "8780": 12.03738, - "8785": 11.94909, - "8790": 11.90577, - "8795": 11.97012, - "8800": 11.93035, - "8805": 11.99893, - "8810": 11.94421, - "8815": 11.98191, - "8820": 11.99062, - "8825": 11.92267, - "8830": 11.95194, - "8835": 11.937, - "8840": 11.97075, - "8845": 11.95007, - 
"8850": 12.02522, - "8855": 11.94712, - "8860": 11.96728, - "8865": 11.89285, - "8870": 11.94189, - "8875": 11.92065, - "8880": 11.98822, - "8885": 11.98285, - "8890": 11.99582, - "8895": 11.96596, - "8900": 11.94354, - "8905": 11.95473, - "8910": 11.99259, - "8915": 11.96618, - "8920": 11.93587, - "8925": 11.99413, - "8930": 12.00638, - "8935": 11.93, - "8940": 11.95031, - "8945": 11.91928, - "8950": 11.9941, - "8955": 11.94031, - "8960": 11.96914, - "8965": 11.95062, - "8970": 11.95268, - "8975": 12.03161, - "8980": 11.97245, - "8985": 12.01027, - "8990": 11.9446, - "8995": 11.96843, - "9000": 11.9429, - "9005": 11.94091, - "9010": 11.93667, - "9015": 11.95344, - "9020": 11.93207, - "9025": 11.91998, - "9030": 11.92651, - "9035": 11.97131, - "9040": 11.92008, - "9045": 11.9777, - "9050": 11.93287, - "9055": 11.96682, - "9060": 11.982, - "9065": 11.9763, - "9070": 11.92703, - "9075": 11.95149, - "9080": 11.94863, - "9085": 11.92217, - "9090": 11.92326, - "9095": 11.9586, - "9100": 11.93403, - "9105": 11.97708, - "9110": 11.97248, - "9115": 11.91899, - "9120": 11.98175, - "9125": 12.0043, - "9130": 11.98361, - "9135": 11.95811, - "9140": 11.89116, - "9145": 11.92833, - "9150": 11.96999, - "9155": 11.95682, - "9160": 11.93898, - "9165": 11.98676, - "9170": 11.96776, - "9175": 11.91735, - "9180": 11.96488, - "9185": 11.93801, - "9190": 11.93829, - "9195": 11.96444, - "9200": 11.91924, - "9205": 11.99554, - "9210": 11.91977, - "9215": 11.99739, - "9220": 11.92053, - "9225": 11.93702, - "9230": 11.95815, - "9235": 12.05346, - "9240": 11.9596, - "9245": 11.97173, - "9250": 11.94092, - "9255": 11.94632, - "9260": 12.00354, - "9265": 11.96854, - "9270": 11.91621, - "9275": 11.94709, - "9280": 11.93375, - "9285": 11.92465, - "9290": 11.93047, - "9295": 11.93184, - "9300": 11.95538, - "9305": 11.96102, - "9310": 11.93874, - "9315": 11.94123, - "9320": 11.95854, - "9325": 11.98961, - "9330": 11.87394, - "9335": 11.97986, - "9340": 12.02583, - "9345": 11.94202, - "9350": 
12.00113, - "9355": 11.97405, - "9360": 11.96746, - "9365": 11.96018, - "9370": 11.9475, - "9375": 11.94327, - "9380": 11.92135, - "9385": 12.01574, - "9390": 11.95494, - "9395": 11.93529, - "9400": 11.96463, - "9405": 11.9807, - "9410": 11.92926, - "9415": 11.95919, - "9420": 11.94796, - "9425": 11.94261, - "9430": 11.94968, - "9435": 11.9655, - "9440": 11.94016, - "9445": 11.98541, - "9450": 11.94602, - "9455": 11.96365, - "9460": 11.9884, - "9465": 11.93962, - "9470": 11.93471, - "9475": 11.91073, - "9480": 11.92557, - "9485": 11.93537, - "9490": 11.97267, - "9495": 11.93521, - "9500": 11.92542, - "9505": 12.00627, - "9510": 11.9749, - "9515": 11.97511, - "9520": 11.88493, - "9525": 11.91739, - "9530": 11.92418, - "9535": 11.97024 - } - } -} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etc1cp1_release_sm/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etc1cp1_release_sm/golden_values_dev_dgx_h100.json deleted file mode 100644 index f486950e5a2..00000000000 --- a/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etc1cp1_release_sm/golden_values_dev_dgx_h100.json +++ /dev/null @@ -1,11492 +0,0 @@ -{ - "lm loss": { - "start_step": 1, - "end_step": 9535, - "step_interval": 5, - "values": { - "1": 13.89756, - "5": 13.89155, - "10": 13.85814, - "15": 13.84947, - "20": 13.74128, - "25": 13.71269, - "30": 13.39136, - "35": 13.32418, - "40": 13.23329, - "45": 13.12045, - "50": 12.53632, - "55": 12.35058, - "60": 12.17187, - "65": 12.01029, - "70": 11.83519, - "75": 11.55823, - "80": 11.30557, - "85": 11.11711, - "90": 10.96045, - "95": 10.79835, - "100": 10.58719, - "105": 10.45871, - "110": 10.23985, - "115": 10.03197, - "120": 9.88087, - "125": 9.74001, - "130": 9.64895, - "135": 9.58316, - "140": 9.34895, - "145": 9.3363, - "150": 9.17736, - "155": 9.11162, - "160": 9.02957, - "165": 8.91504, - "170": 8.86399, - 
"175": 8.82531, - "180": 8.68067, - "185": 8.72019, - "190": 8.59287, - "195": 8.59803, - "200": 8.48665, - "205": 8.39681, - "210": 8.35424, - "215": 8.40636, - "220": 8.27837, - "225": 8.29496, - "230": 8.27773, - "235": 8.20463, - "240": 8.15385, - "245": 8.1344, - "250": 8.06891, - "255": 8.08354, - "260": 7.97761, - "265": 7.96264, - "270": 7.91745, - "275": 7.9055, - "280": 7.89502, - "285": 7.91233, - "290": 7.858, - "295": 7.84326, - "300": 7.73922, - "305": 7.73479, - "310": 7.6998, - "315": 7.6959, - "320": 7.68835, - "325": 7.60857, - "330": 7.59888, - "335": 7.57833, - "340": 7.62257, - "345": 7.51187, - "350": 7.5063, - "355": 7.43406, - "360": 7.53414, - "365": 7.45759, - "370": 7.49186, - "375": 7.43607, - "380": 7.41292, - "385": 7.41117, - "390": 7.42986, - "395": 7.36781, - "400": 7.30747, - "405": 7.31834, - "410": 7.30943, - "415": 7.29421, - "420": 7.2965, - "425": 7.26158, - "430": 7.20979, - "435": 7.22197, - "440": 7.18512, - "445": 7.1687, - "450": 7.12181, - "455": 7.14062, - "460": 7.11041, - "465": 7.10497, - "470": 7.07645, - "475": 7.09742, - "480": 6.97587, - "485": 7.03312, - "490": 6.99478, - "495": 6.9692, - "500": 6.91435, - "505": 6.94713, - "510": 6.92309, - "515": 6.88853, - "520": 6.88024, - "525": 6.87529, - "530": 6.88311, - "535": 6.8642, - "540": 6.78769, - "545": 6.8252, - "550": 6.84568, - "555": 6.86869, - "560": 6.81372, - "565": 6.74969, - "570": 6.76579, - "575": 6.77872, - "580": 6.69766, - "585": 6.71359, - "590": 6.65449, - "595": 6.64792, - "600": 6.67016, - "605": 6.65924, - "610": 6.63641, - "615": 6.68438, - "620": 6.60355, - "625": 6.57203, - "630": 6.56964, - "635": 6.60732, - "640": 6.59738, - "645": 6.5815, - "650": 6.62582, - "655": 6.62475, - "660": 6.53171, - "665": 6.52224, - "670": 6.47146, - "675": 6.57058, - "680": 6.53989, - "685": 6.49695, - "690": 6.47037, - "695": 6.43685, - "700": 6.43121, - "705": 6.4313, - "710": 6.46058, - "715": 6.46842, - "720": 6.35254, - "725": 6.40344, - "730": 6.39123, 
- "735": 6.41174, - "740": 6.34886, - "745": 6.31567, - "750": 6.37227, - "755": 6.29068, - "760": 6.30783, - "765": 6.32016, - "770": 6.31539, - "775": 6.3051, - "780": 6.27484, - "785": 6.28635, - "790": 6.25066, - "795": 6.24498, - "800": 6.22595, - "805": 6.30241, - "810": 6.16125, - "815": 6.18921, - "820": 6.19984, - "825": 6.20878, - "830": 6.21184, - "835": 6.16547, - "840": 6.13918, - "845": 6.18907, - "850": 6.14544, - "855": 6.14245, - "860": 6.12573, - "865": 6.14471, - "870": 6.103, - "875": 6.14755, - "880": 6.09503, - "885": 6.08625, - "890": 6.14906, - "895": 6.03612, - "900": 6.06033, - "905": 6.07119, - "910": 6.04765, - "915": 6.02795, - "920": 6.01922, - "925": 6.00762, - "930": 6.04202, - "935": 6.03448, - "940": 5.96552, - "945": 6.00691, - "950": 6.02802, - "955": 5.9757, - "960": 5.9732, - "965": 5.8947, - "970": 5.93848, - "975": 5.94046, - "980": 5.91694, - "985": 5.91057, - "990": 5.96163, - "995": 5.87028, - "1000": 5.89819, - "1005": 5.85552, - "1010": 5.89001, - "1015": 5.91011, - "1020": 5.82121, - "1025": 5.81525, - "1030": 5.82852, - "1035": 5.91121, - "1040": 5.83477, - "1045": 5.80641, - "1050": 5.84029, - "1055": 5.82471, - "1060": 5.77657, - "1065": 5.75965, - "1070": 5.80228, - "1075": 5.78852, - "1080": 5.77993, - "1085": 5.79347, - "1090": 5.7642, - "1095": 5.77727, - "1100": 5.73679, - "1105": 5.71252, - "1110": 5.76864, - "1115": 5.69994, - "1120": 5.64073, - "1125": 5.65212, - "1130": 5.71653, - "1135": 5.67194, - "1140": 5.66144, - "1145": 5.65572, - "1150": 5.68319, - "1155": 5.64543, - "1160": 5.63371, - "1165": 5.67226, - "1170": 5.65589, - "1175": 5.62136, - "1180": 5.63006, - "1185": 5.6181, - "1190": 5.60413, - "1195": 5.59825, - "1200": 5.54202, - "1205": 5.65572, - "1210": 5.51312, - "1215": 5.55359, - "1220": 5.63431, - "1225": 5.51403, - "1230": 5.56754, - "1235": 5.521, - "1240": 5.55808, - "1245": 5.52886, - "1250": 5.51046, - "1255": 5.50279, - "1260": 5.50208, - "1265": 5.47964, - "1270": 5.44537, - "1275": 
5.52448, - "1280": 5.45447, - "1285": 5.4682, - "1290": 5.43648, - "1295": 5.46181, - "1300": 5.46016, - "1305": 5.43278, - "1310": 5.38271, - "1315": 5.44073, - "1320": 5.42393, - "1325": 5.3568, - "1330": 5.41966, - "1335": 5.39498, - "1340": 5.44678, - "1345": 5.4046, - "1350": 5.3745, - "1355": 5.36722, - "1360": 5.37555, - "1365": 5.38819, - "1370": 5.31687, - "1375": 5.3257, - "1380": 5.37435, - "1385": 5.33822, - "1390": 5.32907, - "1395": 5.35996, - "1400": 5.34708, - "1405": 5.32768, - "1410": 5.30321, - "1415": 5.26874, - "1420": 5.31115, - "1425": 5.3045, - "1430": 5.33954, - "1435": 5.24914, - "1440": 5.27894, - "1445": 5.31118, - "1450": 5.28087, - "1455": 5.30455, - "1460": 5.26455, - "1465": 5.26355, - "1470": 5.29615, - "1475": 5.27116, - "1480": 5.26692, - "1485": 5.21939, - "1490": 5.21283, - "1495": 5.23155, - "1500": 5.23275, - "1505": 5.20436, - "1510": 5.22447, - "1515": 5.15502, - "1520": 5.1852, - "1525": 5.15413, - "1530": 5.17452, - "1535": 5.16098, - "1540": 5.16276, - "1545": 5.19593, - "1550": 5.1989, - "1555": 5.18478, - "1560": 5.1253, - "1565": 5.15973, - "1570": 5.17281, - "1575": 5.1468, - "1580": 5.16002, - "1585": 5.14495, - "1590": 5.12815, - "1595": 5.09691, - "1600": 5.17173, - "1605": 5.09626, - "1610": 5.10506, - "1615": 5.09978, - "1620": 5.1145, - "1625": 5.10983, - "1630": 5.08211, - "1635": 5.12902, - "1640": 5.09565, - "1645": 5.08916, - "1650": 5.08067, - "1655": 5.06625, - "1660": 5.05546, - "1665": 5.04609, - "1670": 5.06711, - "1675": 5.06871, - "1680": 5.00775, - "1685": 5.01672, - "1690": 4.99799, - "1695": 5.00065, - "1700": 5.03983, - "1705": 5.01824, - "1710": 5.00629, - "1715": 4.97587, - "1720": 4.97437, - "1725": 4.9984, - "1730": 4.95014, - "1735": 5.02541, - "1740": 4.95266, - "1745": 4.97461, - "1750": 4.95639, - "1755": 4.97133, - "1760": 4.98489, - "1765": 4.93728, - "1770": 4.93343, - "1775": 4.9432, - "1780": 4.96314, - "1785": 4.91574, - "1790": 4.93944, - "1795": 4.93848, - "1800": 4.88725, - 
"1805": 4.87771, - "1810": 4.8976, - "1815": 4.89801, - "1820": 4.8872, - "1825": 4.89371, - "1830": 4.8786, - "1835": 4.87542, - "1840": 4.87209, - "1845": 4.85811, - "1850": 4.83484, - "1855": 4.89133, - "1860": 4.84322, - "1865": 4.85108, - "1870": 4.82648, - "1875": 4.83877, - "1880": 4.89485, - "1885": 4.84392, - "1890": 4.8281, - "1895": 4.77339, - "1900": 4.81423, - "1905": 4.81232, - "1910": 4.82991, - "1915": 4.79768, - "1920": 4.78308, - "1925": 4.79277, - "1930": 4.76544, - "1935": 4.7941, - "1940": 4.75875, - "1945": 4.80214, - "1950": 4.83843, - "1955": 4.77731, - "1960": 4.76768, - "1965": 4.72596, - "1970": 4.73388, - "1975": 4.7973, - "1980": 4.73036, - "1985": 4.74162, - "1990": 4.78353, - "1995": 4.74959, - "2000": 4.76948, - "2005": 4.80113, - "2010": 4.70951, - "2015": 4.69715, - "2020": 4.71284, - "2025": 4.75821, - "2030": 4.68831, - "2035": 4.71528, - "2040": 4.67772, - "2045": 4.76255, - "2050": 4.74404, - "2055": 4.7077, - "2060": 4.70614, - "2065": 4.66526, - "2070": 4.67653, - "2075": 4.69507, - "2080": 4.66174, - "2085": 4.69911, - "2090": 4.61739, - "2095": 4.64746, - "2100": 4.61666, - "2105": 4.64633, - "2110": 4.64123, - "2115": 4.65336, - "2120": 4.64559, - "2125": 4.61059, - "2130": 4.61466, - "2135": 4.62745, - "2140": 4.6232, - "2145": 4.58124, - "2150": 4.60983, - "2155": 4.57956, - "2160": 4.60382, - "2165": 4.58415, - "2170": 4.61387, - "2175": 4.60275, - "2180": 4.59531, - "2185": 4.60788, - "2190": 4.58246, - "2195": 4.55672, - "2200": 4.55346, - "2205": 4.56383, - "2210": 4.6146, - "2215": 4.64276, - "2220": 4.59912, - "2225": 4.57263, - "2230": 4.56854, - "2235": 4.61797, - "2240": 4.51401, - "2245": 4.5176, - "2250": 4.52905, - "2255": 4.54117, - "2260": 4.48536, - "2265": 4.56489, - "2270": 4.49655, - "2275": 4.55547, - "2280": 4.51075, - "2285": 4.53333, - "2290": 4.52269, - "2295": 4.52707, - "2300": 4.53228, - "2305": 4.49287, - "2310": 4.53148, - "2315": 4.46329, - "2320": 4.51121, - "2325": 4.49336, - "2330": 
4.49351, - "2335": 4.47787, - "2340": 4.48626, - "2345": 4.52525, - "2350": 4.4674, - "2355": 4.47173, - "2360": 4.44099, - "2365": 4.44682, - "2370": 4.44716, - "2375": 4.44199, - "2380": 4.39487, - "2385": 4.43475, - "2390": 4.43071, - "2395": 4.46719, - "2400": 4.42074, - "2405": 4.40081, - "2410": 4.44955, - "2415": 4.42055, - "2420": 4.4293, - "2425": 4.39783, - "2430": 4.42084, - "2435": 4.40291, - "2440": 4.39501, - "2445": 4.40808, - "2450": 4.38239, - "2455": 4.4178, - "2460": 4.36606, - "2465": 4.41327, - "2470": 4.40023, - "2475": 4.41776, - "2480": 4.34092, - "2485": 4.37423, - "2490": 4.37838, - "2495": 4.35662, - "2500": 4.36528, - "2505": 4.37219, - "2510": 4.41251, - "2515": 4.40356, - "2520": 4.34516, - "2525": 4.36214, - "2530": 4.36786, - "2535": 4.36686, - "2540": 4.36548, - "2545": 4.37687, - "2550": 4.30337, - "2555": 4.37244, - "2560": 4.35158, - "2565": 4.30393, - "2570": 4.33393, - "2575": 4.30697, - "2580": 4.30582, - "2585": 4.29358, - "2590": 4.31272, - "2595": 4.28154, - "2600": 4.29867, - "2605": 4.31115, - "2610": 4.32106, - "2615": 4.27768, - "2620": 4.26935, - "2625": 4.30437, - "2630": 4.22434, - "2635": 4.30369, - "2640": 4.30012, - "2645": 4.2581, - "2650": 4.28639, - "2655": 4.26647, - "2660": 4.21474, - "2665": 4.30436, - "2670": 4.26382, - "2675": 4.2306, - "2680": 4.25227, - "2685": 4.25736, - "2690": 4.22986, - "2695": 4.28379, - "2700": 4.19098, - "2705": 4.23853, - "2710": 4.25092, - "2715": 4.23481, - "2720": 4.24356, - "2725": 4.2225, - "2730": 4.22941, - "2735": 4.22363, - "2740": 4.20346, - "2745": 4.18765, - "2750": 4.21101, - "2755": 4.22237, - "2760": 4.22902, - "2765": 4.18298, - "2770": 4.23755, - "2775": 4.17706, - "2780": 4.21186, - "2785": 4.19469, - "2790": 4.21736, - "2795": 4.18988, - "2800": 4.1159, - "2805": 4.16613, - "2810": 4.17076, - "2815": 4.15389, - "2820": 4.1969, - "2825": 4.19241, - "2830": 4.16864, - "2835": 4.17046, - "2840": 4.16148, - "2845": 4.14967, - "2850": 4.16619, - "2855": 4.11805, - 
"2860": 4.14572, - "2865": 4.17023, - "2870": 4.14096, - "2875": 4.1596, - "2880": 4.08582, - "2885": 4.14242, - "2890": 4.11503, - "2895": 4.15452, - "2900": 4.09735, - "2905": 4.11101, - "2910": 4.10798, - "2915": 4.14914, - "2920": 4.12546, - "2925": 4.10099, - "2930": 4.08522, - "2935": 4.07896, - "2940": 4.09225, - "2945": 4.06113, - "2950": 4.03479, - "2955": 4.03763, - "2960": 4.04955, - "2965": 4.0643, - "2970": 4.08593, - "2975": 4.0941, - "2980": 4.03102, - "2985": 4.07394, - "2990": 4.08923, - "2995": 4.03231, - "3000": 4.0436, - "3005": 4.02568, - "3010": 4.06747, - "3015": 4.02305, - "3020": 4.03992, - "3025": 4.02491, - "3030": 4.0567, - "3035": 4.04059, - "3040": 4.0544, - "3045": 4.04677, - "3050": 4.017, - "3055": 4.00507, - "3060": 3.9904, - "3065": 4.02281, - "3070": 4.03826, - "3075": 3.97211, - "3080": 4.0011, - "3085": 4.00548, - "3090": 4.00887, - "3095": 4.02745, - "3100": 4.01465, - "3105": 3.99035, - "3110": 3.99124, - "3115": 3.92509, - "3120": 4.00505, - "3125": 3.94183, - "3130": 3.96987, - "3135": 3.96132, - "3140": 3.95209, - "3145": 3.93524, - "3150": 3.96949, - "3155": 3.96213, - "3160": 3.96255, - "3165": 3.96146, - "3170": 3.96456, - "3175": 3.93165, - "3180": 3.93784, - "3185": 3.90234, - "3190": 3.92455, - "3195": 3.9116, - "3200": 3.89013, - "3205": 3.92029, - "3210": 3.89711, - "3215": 3.90569, - "3220": 3.89706, - "3225": 3.91097, - "3230": 3.89895, - "3235": 3.91122, - "3240": 3.88912, - "3245": 3.88902, - "3250": 3.84407, - "3255": 3.89259, - "3260": 3.88283, - "3265": 3.92603, - "3270": 3.9052, - "3275": 3.85915, - "3280": 3.88232, - "3285": 3.86652, - "3290": 3.86681, - "3295": 3.83806, - "3300": 3.85349, - "3305": 3.86048, - "3310": 3.85872, - "3315": 3.89673, - "3320": 3.85179, - "3325": 3.84353, - "3330": 3.82539, - "3335": 3.86213, - "3340": 3.81824, - "3345": 3.83129, - "3350": 3.85901, - "3355": 3.8452, - "3360": 3.83241, - "3365": 3.83682, - "3370": 3.82265, - "3375": 3.85232, - "3380": 3.79563, - "3385": 3.81353, 
- "3390": 3.79143, - "3395": 3.86888, - "3400": 3.83997, - "3405": 3.86197, - "3410": 3.77529, - "3415": 3.72916, - "3420": 3.80048, - "3425": 3.81237, - "3430": 3.84497, - "3435": 3.80796, - "3440": 3.8267, - "3445": 3.7742, - "3450": 3.78787, - "3455": 3.80217, - "3460": 3.78265, - "3465": 3.75891, - "3470": 3.77341, - "3475": 3.77638, - "3480": 3.77988, - "3485": 3.80588, - "3490": 3.76958, - "3495": 3.80315, - "3500": 3.77047, - "3505": 3.77239, - "3510": 3.75092, - "3515": 3.80896, - "3520": 3.79879, - "3525": 3.76372, - "3530": 3.75322, - "3535": 3.76209, - "3540": 3.81796, - "3545": 3.72915, - "3550": 3.79201, - "3555": 3.72604, - "3560": 3.78622, - "3565": 3.7451, - "3570": 3.74254, - "3575": 3.71868, - "3580": 3.77066, - "3585": 3.76174, - "3590": 3.68853, - "3595": 3.76509, - "3600": 3.71336, - "3605": 3.71948, - "3610": 3.70916, - "3615": 3.74868, - "3620": 3.7837, - "3625": 3.71964, - "3630": 3.76519, - "3635": 3.68617, - "3640": 3.7093, - "3645": 3.74263, - "3650": 3.69638, - "3655": 3.72074, - "3660": 3.72832, - "3665": 3.74694, - "3670": 3.71178, - "3675": 3.71065, - "3680": 3.72416, - "3685": 3.67473, - "3690": 3.6936, - "3695": 3.68528, - "3700": 3.70814, - "3705": 3.67651, - "3710": 3.68493, - "3715": 3.6842, - "3720": 3.66563, - "3725": 3.64716, - "3730": 3.64883, - "3735": 3.68782, - "3740": 3.6732, - "3745": 3.66354, - "3750": 3.6757, - "3755": 3.66351, - "3760": 3.67285, - "3765": 3.66004, - "3770": 3.6516, - "3775": 3.63831, - "3780": 3.62453, - "3785": 3.6765, - "3790": 3.60163, - "3795": 3.64291, - "3800": 3.63275, - "3805": 3.62032, - "3810": 3.59475, - "3815": 3.63585, - "3820": 3.64099, - "3825": 3.6535, - "3830": 3.63864, - "3835": 3.59938, - "3840": 3.67685, - "3845": 3.65895, - "3850": 3.60064, - "3855": 3.60428, - "3860": 3.65711, - "3865": 3.60867, - "3870": 3.6721, - "3875": 3.58596, - "3880": 3.58212, - "3885": 3.60502, - "3890": 3.60969, - "3895": 3.5558, - "3900": 3.61685, - "3905": 3.59135, - "3910": 3.5772, - "3915": 3.5862, - 
"3920": 3.57131, - "3925": 3.56751, - "3930": 3.58005, - "3935": 3.5821, - "3940": 3.57511, - "3945": 3.56965, - "3950": 3.61887, - "3955": 3.57531, - "3960": 3.60735, - "3965": 3.58853, - "3970": 3.56735, - "3975": 3.56709, - "3980": 3.5304, - "3985": 3.60527, - "3990": 3.58124, - "3995": 3.60753, - "4000": 3.55811, - "4005": 3.54162, - "4010": 3.58376, - "4015": 3.58398, - "4020": 3.58355, - "4025": 3.57409, - "4030": 3.62855, - "4035": 3.57033, - "4040": 3.5882, - "4045": 3.60161, - "4050": 3.57522, - "4055": 3.57403, - "4060": 3.5888, - "4065": 3.58382, - "4070": 3.51488, - "4075": 3.55887, - "4080": 3.53108, - "4085": 3.54596, - "4090": 3.54584, - "4095": 3.53161, - "4100": 3.55106, - "4105": 3.53794, - "4110": 3.51736, - "4115": 3.56348, - "4120": 3.49648, - "4125": 3.49769, - "4130": 3.55149, - "4135": 3.54373, - "4140": 3.49112, - "4145": 3.51351, - "4150": 3.55497, - "4155": 3.48797, - "4160": 3.54539, - "4165": 3.56451, - "4170": 3.50424, - "4175": 3.50239, - "4180": 3.4998, - "4185": 3.5138, - "4190": 3.5011, - "4195": 3.50044, - "4200": 3.49424, - "4205": 3.53032, - "4210": 3.51921, - "4215": 3.52292, - "4220": 3.53088, - "4225": 3.50168, - "4230": 3.49756, - "4235": 3.52008, - "4240": 3.49249, - "4245": 3.49542, - "4250": 3.48848, - "4255": 3.50707, - "4260": 3.4676, - "4265": 3.48819, - "4270": 3.50473, - "4275": 3.53933, - "4280": 3.48997, - "4285": 3.50947, - "4290": 3.48405, - "4295": 3.48692, - "4300": 3.52631, - "4305": 3.48704, - "4310": 3.51358, - "4315": 3.50638, - "4320": 3.50379, - "4325": 3.51699, - "4330": 3.45992, - "4335": 3.49232, - "4340": 3.50354, - "4345": 3.43189, - "4350": 3.44845, - "4355": 3.52327, - "4360": 3.48083, - "4365": 3.47079, - "4370": 3.47624, - "4375": 3.44129, - "4380": 3.44296, - "4385": 3.42527, - "4390": 3.49048, - "4395": 3.47699, - "4400": 3.47442, - "4405": 3.41723, - "4410": 3.48335, - "4415": 3.44899, - "4420": 3.44113, - "4425": 3.47273, - "4430": 3.44742, - "4435": 3.49082, - "4440": 3.48522, - "4445": 
3.43744, - "4450": 3.3974, - "4455": 3.4624, - "4460": 3.43415, - "4465": 3.45284, - "4470": 3.42199, - "4475": 3.45352, - "4480": 3.44375, - "4485": 3.43643, - "4490": 3.43453, - "4495": 3.38677, - "4500": 3.45384, - "4505": 3.43515, - "4510": 3.44292, - "4515": 3.40605, - "4520": 3.43888, - "4525": 3.40731, - "4530": 3.44131, - "4535": 3.3963, - "4540": 3.42067, - "4545": 3.43217, - "4550": 3.47418, - "4555": 3.39854, - "4560": 3.42732, - "4565": 3.37837, - "4570": 3.41702, - "4575": 3.41117, - "4580": 3.45362, - "4585": 3.42636, - "4590": 3.42388, - "4595": 3.39853, - "4600": 3.39686, - "4605": 3.42144, - "4610": 3.41286, - "4615": 3.45309, - "4620": 3.39526, - "4625": 3.42534, - "4630": 3.4127, - "4635": 3.39195, - "4640": 3.4264, - "4645": 3.41975, - "4650": 3.43542, - "4655": 3.40687, - "4660": 3.39737, - "4665": 3.41231, - "4670": 3.446, - "4675": 3.40423, - "4680": 3.42886, - "4685": 3.42464, - "4690": 3.39897, - "4695": 3.38, - "4700": 3.3729, - "4705": 3.35029, - "4710": 3.40571, - "4715": 3.39222, - "4720": 3.38774, - "4725": 3.35968, - "4730": 3.39519, - "4735": 3.32069, - "4740": 3.36458, - "4745": 3.40698, - "4750": 3.36053, - "4755": 3.39053, - "4760": 3.41421, - "4765": 3.36022, - "4770": 3.36502, - "4775": 3.36135, - "4780": 3.37362, - "4785": 3.374, - "4790": 3.41163, - "4795": 3.39334, - "4800": 3.34583, - "4805": 3.41139, - "4810": 3.35086, - "4815": 3.38903, - "4820": 3.34814, - "4825": 3.40406, - "4830": 3.38314, - "4835": 3.3693, - "4840": 3.38086, - "4845": 3.32726, - "4850": 3.39372, - "4855": 3.39679, - "4860": 3.32727, - "4865": 3.36392, - "4870": 3.34896, - "4875": 3.39123, - "4880": 3.39974, - "4885": 3.35153, - "4890": 3.36191, - "4895": 3.35318, - "4900": 3.32971, - "4905": 3.33008, - "4910": 3.32861, - "4915": 3.37524, - "4920": 3.35807, - "4925": 3.31242, - "4930": 3.34376, - "4935": 3.3273, - "4940": 3.28784, - "4945": 3.36034, - "4950": 3.29629, - "4955": 3.40365, - "4960": 3.3479, - "4965": 3.34204, - "4970": 3.33369, - "4975": 
3.34388, - "4980": 3.36573, - "4985": 3.35352, - "4990": 3.33542, - "4995": 3.3795, - "5000": 3.30893, - "5005": 3.35715, - "5010": 3.36146, - "5015": 3.30923, - "5020": 3.28653, - "5025": 3.31605, - "5030": 3.32648, - "5035": 3.32963, - "5040": 3.30481, - "5045": 3.34994, - "5050": 3.30693, - "5055": 3.32632, - "5060": 3.28843, - "5065": 3.33396, - "5070": 3.33431, - "5075": 3.34337, - "5080": 3.31868, - "5085": 3.34518, - "5090": 3.32323, - "5095": 3.29022, - "5100": 3.32026, - "5105": 3.32744, - "5110": 3.3329, - "5115": 3.3038, - "5120": 3.34196, - "5125": 3.3184, - "5130": 3.31738, - "5135": 3.30105, - "5140": 3.3111, - "5145": 3.31125, - "5150": 3.32063, - "5155": 3.31567, - "5160": 3.31039, - "5165": 3.34534, - "5170": 3.23105, - "5175": 3.31877, - "5180": 3.28445, - "5185": 3.30691, - "5190": 3.32611, - "5195": 3.30561, - "5200": 3.31019, - "5205": 3.34654, - "5210": 3.28506, - "5215": 3.2874, - "5220": 3.28219, - "5225": 3.28677, - "5230": 3.32011, - "5235": 3.27975, - "5240": 3.27349, - "5245": 3.29646, - "5250": 3.3023, - "5255": 3.28615, - "5260": 3.31039, - "5265": 3.27007, - "5270": 3.25412, - "5275": 3.25534, - "5280": 3.28407, - "5285": 3.30874, - "5290": 3.2589, - "5295": 3.27448, - "5300": 3.27858, - "5305": 3.26656, - "5310": 3.32809, - "5315": 3.25873, - "5320": 3.30633, - "5325": 3.3111, - "5330": 3.27899, - "5335": 3.28833, - "5340": 3.23016, - "5345": 3.28336, - "5350": 3.28737, - "5355": 3.28737, - "5360": 3.23407, - "5365": 3.25011, - "5370": 3.28855, - "5375": 3.26985, - "5380": 3.24418, - "5385": 3.28394, - "5390": 3.28221, - "5395": 3.20448, - "5400": 3.30114, - "5405": 3.21525, - "5410": 3.29188, - "5415": 3.22284, - "5420": 3.25707, - "5425": 3.23689, - "5430": 3.24779, - "5435": 3.2811, - "5440": 3.21236, - "5445": 3.24176, - "5450": 3.24576, - "5455": 3.22991, - "5460": 3.25196, - "5465": 3.29692, - "5470": 3.27194, - "5475": 3.20136, - "5480": 3.28214, - "5485": 3.24325, - "5490": 3.26633, - "5495": 3.27183, - "5500": 3.22718, - 
"5505": 3.23914, - "5510": 3.28342, - "5515": 3.27035, - "5520": 3.23742, - "5525": 3.28473, - "5530": 3.22923, - "5535": 3.26258, - "5540": 3.25366, - "5545": 3.26198, - "5550": 3.24962, - "5555": 3.22875, - "5560": 3.22306, - "5565": 3.26845, - "5570": 3.22989, - "5575": 3.26435, - "5580": 3.23553, - "5585": 3.18594, - "5590": 3.24664, - "5595": 3.2105, - "5600": 3.25488, - "5605": 3.17461, - "5610": 3.2604, - "5615": 3.25606, - "5620": 3.2609, - "5625": 3.25214, - "5630": 3.24091, - "5635": 3.21924, - "5640": 3.24377, - "5645": 3.20743, - "5650": 3.2076, - "5655": 3.20542, - "5660": 3.20971, - "5665": 3.21069, - "5670": 3.20056, - "5675": 3.22863, - "5680": 3.19922, - "5685": 3.20573, - "5690": 3.2077, - "5695": 3.24414, - "5700": 3.19628, - "5705": 3.18515, - "5710": 3.17855, - "5715": 3.28582, - "5720": 3.2496, - "5725": 3.2002, - "5730": 3.24085, - "5735": 3.22905, - "5740": 3.22477, - "5745": 3.20281, - "5750": 3.23329, - "5755": 3.23832, - "5760": 3.22288, - "5765": 3.22651, - "5770": 3.25303, - "5775": 3.19712, - "5780": 3.21565, - "5785": 3.21756, - "5790": 3.22715, - "5795": 3.22463, - "5800": 3.16888, - "5805": 3.18332, - "5810": 3.22432, - "5815": 3.20302, - "5820": 3.16241, - "5825": 3.20754, - "5830": 3.1647, - "5835": 3.17395, - "5840": 3.20628, - "5845": 3.217, - "5850": 3.21594, - "5855": 3.15148, - "5860": 3.17119, - "5865": 3.20009, - "5870": 3.16136, - "5875": 3.20014, - "5880": 3.19456, - "5885": 3.19488, - "5890": 3.21776, - "5895": 3.23301, - "5900": 3.1895, - "5905": 3.21986, - "5910": 3.20185, - "5915": 3.17464, - "5920": 3.1915, - "5925": 3.15681, - "5930": 3.19135, - "5935": 3.19128, - "5940": 3.2051, - "5945": 3.21968, - "5950": 3.20213, - "5955": 3.16275, - "5960": 3.22598, - "5965": 3.17666, - "5970": 3.21828, - "5975": 3.18539, - "5980": 3.25556, - "5985": 3.14035, - "5990": 3.2373, - "5995": 3.15341, - "6000": 3.17562, - "6005": 3.15642, - "6010": 3.15958, - "6015": 3.16383, - "6020": 3.17057, - "6025": 3.20846, - "6030": 3.14683, - 
"6035": 3.20108, - "6040": 3.18034, - "6045": 3.19784, - "6050": 3.19841, - "6055": 3.17123, - "6060": 3.18513, - "6065": 3.20946, - "6070": 3.16514, - "6075": 3.13204, - "6080": 3.19182, - "6085": 3.15022, - "6090": 3.18799, - "6095": 3.18454, - "6100": 3.13968, - "6105": 3.18911, - "6110": 3.13194, - "6115": 3.18032, - "6120": 3.17268, - "6125": 3.17817, - "6130": 3.16826, - "6135": 3.16641, - "6140": 3.16491, - "6145": 3.14203, - "6150": 3.17849, - "6155": 3.14973, - "6160": 3.12836, - "6165": 3.15943, - "6170": 3.14366, - "6175": 3.14619, - "6180": 3.14564, - "6185": 3.18694, - "6190": 3.15491, - "6195": 3.12582, - "6200": 3.15218, - "6205": 3.14598, - "6210": 3.10092, - "6215": 3.15518, - "6220": 3.1544, - "6225": 3.17142, - "6230": 3.10668, - "6235": 3.14063, - "6240": 3.08394, - "6245": 3.18223, - "6250": 3.14309, - "6255": 3.15773, - "6260": 3.14125, - "6265": 3.15597, - "6270": 3.10065, - "6275": 3.12382, - "6280": 3.13503, - "6285": 3.11829, - "6290": 3.14415, - "6295": 3.15298, - "6300": 3.15403, - "6305": 3.21086, - "6310": 3.11266, - "6315": 3.10982, - "6320": 3.16047, - "6325": 3.10246, - "6330": 3.16954, - "6335": 3.15391, - "6340": 3.10904, - "6345": 3.16578, - "6350": 3.11808, - "6355": 3.11742, - "6360": 3.1108, - "6365": 3.14775, - "6370": 3.16278, - "6375": 3.1337, - "6380": 3.15125, - "6385": 3.17081, - "6390": 3.12597, - "6395": 3.10466, - "6400": 3.10591, - "6405": 3.18617, - "6410": 3.17298, - "6415": 3.12537, - "6420": 3.17096, - "6425": 3.17458, - "6430": 3.16659, - "6435": 3.12451, - "6440": 3.13606, - "6445": 3.15196, - "6450": 3.09161, - "6455": 3.08666, - "6460": 3.13082, - "6465": 3.16786, - "6470": 3.13951, - "6475": 3.13285, - "6480": 3.15191, - "6485": 3.11206, - "6490": 3.0797, - "6495": 3.16564, - "6500": 3.14177, - "6505": 3.08566, - "6510": 3.14483, - "6515": 3.16369, - "6520": 3.09044, - "6525": 3.14867, - "6530": 3.10896, - "6535": 3.12403, - "6540": 3.18005, - "6545": 3.11404, - "6550": 3.11103, - "6555": 3.10947, - "6560": 
3.0737, - "6565": 3.07934, - "6570": 3.10438, - "6575": 3.05844, - "6580": 3.17411, - "6585": 3.10694, - "6590": 3.0877, - "6595": 3.10332, - "6600": 3.1032, - "6605": 3.08625, - "6610": 3.08405, - "6615": 3.1316, - "6620": 3.076, - "6625": 3.09705, - "6630": 3.09309, - "6635": 3.12933, - "6640": 3.08864, - "6645": 3.10948, - "6650": 3.1378, - "6655": 3.07416, - "6660": 3.11313, - "6665": 3.12487, - "6670": 3.08048, - "6675": 3.10457, - "6680": 3.10673, - "6685": 3.14077, - "6690": 3.11651, - "6695": 3.12176, - "6700": 3.1127, - "6705": 3.09107, - "6710": 3.10728, - "6715": 3.05842, - "6720": 3.13504, - "6725": 3.12621, - "6730": 3.1099, - "6735": 3.10898, - "6740": 3.11731, - "6745": 3.0901, - "6750": 3.10983, - "6755": 3.06749, - "6760": 3.06624, - "6765": 3.08509, - "6770": 3.07057, - "6775": 3.10523, - "6780": 3.07455, - "6785": 3.07959, - "6790": 3.10472, - "6795": 3.07166, - "6800": 3.09692, - "6805": 3.08719, - "6810": 3.10858, - "6815": 3.04354, - "6820": 3.07401, - "6825": 3.10257, - "6830": 3.08637, - "6835": 3.06002, - "6840": 3.0654, - "6845": 3.11054, - "6850": 3.08009, - "6855": 3.11065, - "6860": 3.06305, - "6865": 3.10876, - "6870": 3.07538, - "6875": 3.07578, - "6880": 3.08642, - "6885": 3.05135, - "6890": 3.0749, - "6895": 3.05299, - "6900": 3.05973, - "6905": 3.07506, - "6910": 3.09159, - "6915": 3.11333, - "6920": 3.06615, - "6925": 3.08379, - "6930": 3.06742, - "6935": 3.02485, - "6940": 3.06623, - "6945": 3.05639, - "6950": 3.07964, - "6955": 3.05853, - "6960": 3.05554, - "6965": 3.09907, - "6970": 3.03589, - "6975": 3.1075, - "6980": 3.06776, - "6985": 3.06784, - "6990": 3.11146, - "6995": 3.09126, - "7000": 3.02783, - "7005": 3.09757, - "7010": 3.0779, - "7015": 3.07385, - "7020": 3.10018, - "7025": 3.08417, - "7030": 3.08746, - "7035": 3.04096, - "7040": 3.01984, - "7045": 3.07968, - "7050": 3.09817, - "7055": 3.03816, - "7060": 3.09848, - "7065": 3.11109, - "7070": 3.05748, - "7075": 3.06319, - "7080": 3.11208, - "7085": 3.03557, - "7090": 
3.05692, - "7095": 3.04652, - "7100": 3.07149, - "7105": 3.02035, - "7110": 3.0623, - "7115": 3.03547, - "7120": 3.07999, - "7125": 3.03377, - "7130": 3.04883, - "7135": 3.05627, - "7140": 3.06014, - "7145": 3.0691, - "7150": 3.02375, - "7155": 3.08612, - "7160": 3.0047, - "7165": 3.0418, - "7170": 3.07701, - "7175": 3.03661, - "7180": 3.07042, - "7185": 3.09125, - "7190": 3.05302, - "7195": 3.06058, - "7200": 3.06039, - "7205": 3.04153, - "7210": 3.08703, - "7215": 3.06723, - "7220": 3.08798, - "7225": 3.06993, - "7230": 3.07403, - "7235": 3.05435, - "7240": 3.05017, - "7245": 3.07131, - "7250": 3.01274, - "7255": 3.03229, - "7260": 3.06928, - "7265": 3.00261, - "7270": 3.04138, - "7275": 3.04223, - "7280": 3.04181, - "7285": 3.05407, - "7290": 3.07344, - "7295": 3.06537, - "7300": 3.02809, - "7305": 3.02877, - "7310": 3.04926, - "7315": 3.07646, - "7320": 3.05669, - "7325": 3.06149, - "7330": 3.02592, - "7335": 3.02733, - "7340": 3.06004, - "7345": 3.0091, - "7350": 3.06031, - "7355": 3.04495, - "7360": 3.03923, - "7365": 3.03845, - "7370": 3.03136, - "7375": 2.9999, - "7380": 3.06202, - "7385": 3.07693, - "7390": 3.06411, - "7395": 3.02221, - "7400": 3.07516, - "7405": 3.04382, - "7410": 3.06023, - "7415": 3.05228, - "7420": 3.03261, - "7425": 3.08586, - "7430": 3.0272, - "7435": 3.01757, - "7440": 3.0377, - "7445": 3.01394, - "7450": 2.99482, - "7455": 3.04735, - "7460": 3.04105, - "7465": 3.04977, - "7470": 3.05673, - "7475": 3.06741, - "7480": 3.02749, - "7485": 2.98653, - "7490": 2.98973, - "7495": 2.99863, - "7500": 3.02945, - "7505": 3.0059, - "7510": 2.97871, - "7515": 3.02404, - "7520": 3.01697, - "7525": 2.98295, - "7530": 3.02636, - "7535": 3.04423, - "7540": 3.02494, - "7545": 3.0588, - "7550": 3.06534, - "7555": 3.00732, - "7560": 3.01283, - "7565": 3.00874, - "7570": 3.03442, - "7575": 2.97962, - "7580": 3.03034, - "7585": 3.01793, - "7590": 3.01504, - "7595": 3.07403, - "7600": 3.03015, - "7605": 3.02144, - "7610": 3.00533, - "7615": 2.99602, - 
"7620": 2.99265, - "7625": 3.03762, - "7630": 3.02026, - "7635": 3.01854, - "7640": 3.01712, - "7645": 3.04845, - "7650": 3.04439, - "7655": 3.08975, - "7660": 2.96325, - "7665": 3.02969, - "7670": 3.01245, - "7675": 3.00305, - "7680": 2.9998, - "7685": 3.07016, - "7690": 3.01368, - "7695": 2.99671, - "7700": 3.05056, - "7705": 3.01282, - "7710": 3.05828, - "7715": 2.99725, - "7720": 3.08276, - "7725": 2.98411, - "7730": 2.99881, - "7735": 3.02714, - "7740": 3.00979, - "7745": 3.00319, - "7750": 3.01, - "7755": 3.01954, - "7760": 2.98571, - "7765": 3.00397, - "7770": 3.02732, - "7775": 2.98978, - "7780": 2.97862, - "7785": 3.01472, - "7790": 2.99842, - "7795": 3.02413, - "7800": 3.00827, - "7805": 3.01176, - "7810": 3.03082, - "7815": 3.00244, - "7820": 3.0019, - "7825": 3.03231, - "7830": 3.03143, - "7835": 2.96605, - "7840": 3.04336, - "7845": 2.97937, - "7850": 2.93977, - "7855": 2.98529, - "7860": 2.98344, - "7865": 3.02956, - "7870": 2.9691, - "7875": 2.98838, - "7880": 3.00349, - "7885": 2.9968, - "7890": 3.03811, - "7895": 3.02857, - "7900": 3.03097, - "7905": 2.99876, - "7910": 3.0088, - "7915": 3.02527, - "7920": 3.01259, - "7925": 2.99646, - "7930": 3.02866, - "7935": 2.98913, - "7940": 3.03573, - "7945": 3.0501, - "7950": 2.96381, - "7955": 2.98711, - "7960": 2.96943, - "7965": 2.94566, - "7970": 2.9655, - "7975": 2.99544, - "7980": 3.00887, - "7985": 2.97698, - "7990": 2.97506, - "7995": 2.96124, - "8000": 3.02098, - "8005": 2.9801, - "8010": 2.97649, - "8015": 2.96466, - "8020": 2.97779, - "8025": 2.95601, - "8030": 2.97562, - "8035": 2.97196, - "8040": 2.95703, - "8045": 3.01604, - "8050": 3.01297, - "8055": 2.97453, - "8060": 3.00494, - "8065": 2.98862, - "8070": 2.96753, - "8075": 2.97734, - "8080": 3.01019, - "8085": 2.96754, - "8090": 2.98003, - "8095": 3.00216, - "8100": 2.95105, - "8105": 2.99247, - "8110": 2.98157, - "8115": 2.95999, - "8120": 2.97249, - "8125": 2.99946, - "8130": 2.97003, - "8135": 2.98766, - "8140": 2.96736, - "8145": 
2.95939, - "8150": 2.98009, - "8155": 2.95146, - "8160": 2.997, - "8165": 2.9913, - "8170": 2.95554, - "8175": 2.95554, - "8180": 3.01376, - "8185": 2.98624, - "8190": 3.02032, - "8195": 2.99613, - "8200": 2.96412, - "8205": 2.97566, - "8210": 2.9781, - "8215": 2.99017, - "8220": 2.971, - "8225": 2.96329, - "8230": 2.99505, - "8235": 3.00306, - "8240": 2.97419, - "8245": 2.9738, - "8250": 3.00958, - "8255": 2.96716, - "8260": 2.97331, - "8265": 2.95555, - "8270": 2.97514, - "8275": 2.96718, - "8280": 2.94092, - "8285": 2.97838, - "8290": 2.96734, - "8295": 2.95246, - "8300": 2.96504, - "8305": 2.97504, - "8310": 2.97996, - "8315": 2.95732, - "8320": 2.97776, - "8325": 2.929, - "8330": 2.89908, - "8335": 2.96646, - "8340": 2.99201, - "8345": 2.94463, - "8350": 2.95886, - "8355": 2.98631, - "8360": 2.96643, - "8365": 2.98326, - "8370": 2.99094, - "8375": 2.93854, - "8380": 2.94099, - "8385": 2.97126, - "8390": 2.9453, - "8395": 2.97523, - "8400": 2.95927, - "8405": 2.97418, - "8410": 3.03057, - "8415": 2.93533, - "8420": 2.91801, - "8425": 2.97564, - "8430": 2.97808, - "8435": 2.93124, - "8440": 3.01239, - "8445": 2.99121, - "8450": 2.96616, - "8455": 2.97106, - "8460": 2.97975, - "8465": 2.92562, - "8470": 2.94697, - "8475": 2.99054, - "8480": 2.93097, - "8485": 2.93977, - "8490": 2.948, - "8495": 2.93336, - "8500": 2.96904, - "8505": 2.92233, - "8510": 3.00332, - "8515": 2.94052, - "8520": 2.95755, - "8525": 2.88522, - "8530": 2.95834, - "8535": 2.97603, - "8540": 2.93194, - "8545": 2.95741, - "8550": 2.92307, - "8555": 2.98961, - "8560": 2.99424, - "8565": 2.9514, - "8570": 2.94707, - "8575": 2.93509, - "8580": 2.9669, - "8585": 2.976, - "8590": 2.97659, - "8595": 2.97731, - "8600": 2.94787, - "8605": 2.94545, - "8610": 2.95479, - "8615": 2.96032, - "8620": 2.92346, - "8625": 2.94581, - "8630": 2.95087, - "8635": 2.94522, - "8640": 2.92578, - "8645": 2.98133, - "8650": 2.92232, - "8655": 2.96592, - "8660": 2.97073, - "8665": 2.95471, - "8670": 2.96657, - "8675": 
2.93996, - "8680": 2.93576, - "8685": 2.94815, - "8690": 2.96442, - "8695": 2.97067, - "8700": 2.94799, - "8705": 2.91745, - "8710": 2.96979, - "8715": 2.91522, - "8720": 2.97447, - "8725": 2.94876, - "8730": 2.94256, - "8735": 2.97158, - "8740": 2.92587, - "8745": 2.96492, - "8750": 2.96628, - "8755": 2.93098, - "8760": 2.94924, - "8765": 2.91354, - "8770": 2.96822, - "8775": 2.94219, - "8780": 2.92859, - "8785": 2.94726, - "8790": 2.92803, - "8795": 2.96489, - "8800": 2.92662, - "8805": 2.90115, - "8810": 2.93145, - "8815": 2.93283, - "8820": 2.90387, - "8825": 2.92443, - "8830": 2.91245, - "8835": 2.89847, - "8840": 2.91518, - "8845": 2.92785, - "8850": 2.95695, - "8855": 2.92839, - "8860": 2.98878, - "8865": 2.93356, - "8870": 2.90865, - "8875": 2.92162, - "8880": 2.9295, - "8885": 2.9207, - "8890": 2.9404, - "8895": 2.92179, - "8900": 2.94464, - "8905": 2.93594, - "8910": 2.91993, - "8915": 2.90336, - "8920": 2.91127, - "8925": 2.97428, - "8930": 2.96209, - "8935": 2.97189, - "8940": 2.94882, - "8945": 2.94789, - "8950": 2.9328, - "8955": 2.91679, - "8960": 2.89858, - "8965": 2.92721, - "8970": 2.94082, - "8975": 2.90449, - "8980": 2.89797, - "8985": 2.92102, - "8990": 2.9662, - "8995": 2.9373, - "9000": 2.89467, - "9005": 2.9399, - "9010": 2.97901, - "9015": 2.90311, - "9020": 2.90423, - "9025": 2.92238, - "9030": 2.94518, - "9035": 2.85736, - "9040": 2.93491, - "9045": 2.92378, - "9050": 2.96087, - "9055": 2.88884, - "9060": 2.95609, - "9065": 2.98682, - "9070": 2.92665, - "9075": 2.94254, - "9080": 2.93301, - "9085": 2.9439, - "9090": 2.93648, - "9095": 2.89849, - "9100": 2.90017, - "9105": 2.89, - "9110": 2.93211, - "9115": 2.93981, - "9120": 2.97397, - "9125": 2.91648, - "9130": 2.92277, - "9135": 2.94086, - "9140": 2.94695, - "9145": 2.89447, - "9150": 2.92217, - "9155": 2.93169, - "9160": 2.93686, - "9165": 2.92557, - "9170": 2.9498, - "9175": 2.88716, - "9180": 2.93307, - "9185": 2.8947, - "9190": 2.94894, - "9195": 2.91222, - "9200": 2.93251, - 
"9205": 2.88702, - "9210": 2.93304, - "9215": 2.87965, - "9220": 2.90288, - "9225": 2.93315, - "9230": 2.86569, - "9235": 2.87842, - "9240": 2.89576, - "9245": 2.88279, - "9250": 2.88136, - "9255": 2.91192, - "9260": 2.87817, - "9265": 2.92175, - "9270": 2.89613, - "9275": 2.91313, - "9280": 2.91939, - "9285": 2.91903, - "9290": 2.93047, - "9295": 2.92844, - "9300": 2.87877, - "9305": 2.90909, - "9310": 2.89871, - "9315": 2.86609, - "9320": 2.86065, - "9325": 2.90436, - "9330": 2.95511, - "9335": 2.87572, - "9340": 2.93845, - "9345": 2.94693, - "9350": 2.9134, - "9355": 2.87737, - "9360": 2.89674, - "9365": 2.8823, - "9370": 2.93386, - "9375": 2.91236, - "9380": 2.86428, - "9385": 2.91358, - "9390": 2.92324, - "9395": 2.92024, - "9400": 2.89599, - "9405": 2.89197, - "9410": 2.9185, - "9415": 2.91775, - "9420": 2.89381, - "9425": 2.89983, - "9430": 2.87833, - "9435": 2.90417, - "9440": 2.89629, - "9445": 2.88366, - "9450": 2.89069, - "9455": 2.88969, - "9460": 2.94442, - "9465": 2.94721, - "9470": 2.88553, - "9475": 2.94033, - "9480": 2.88982, - "9485": 2.87815, - "9490": 2.89723, - "9495": 2.9225, - "9500": 2.89514, - "9505": 2.86794, - "9510": 2.894, - "9515": 2.90369, - "9520": 2.91102, - "9525": 2.89095, - "9530": 2.88696, - "9535": 2.91216 - } - }, - "num-zeros": { - "start_step": 1, - "end_step": 9535, - "step_interval": 5, - "values": { - "1": 1021640256.0, - "5": 1024063424.0, - "10": 1014250560.0, - "15": 1024077504.0, - "20": 1022486144.0, - "25": 1041373312.0, - "30": 1028112896.0, - "35": 1035625088.0, - "40": 1026328384.0, - "45": 1022350080.0, - "50": 1030098560.0, - "55": 1028966144.0, - "60": 1036320640.0, - "65": 1034679168.0, - "70": 1029374848.0, - "75": 1028745088.0, - "80": 1047575040.0, - "85": 1029448064.0, - "90": 1020467392.0, - "95": 1028310016.0, - "100": 1040961344.0, - "105": 1039436544.0, - "110": 1026879104.0, - "115": 1052312832.0, - "120": 1018863104.0, - "125": 1045372160.0, - "130": 1034330368.0, - "135": 1016615680.0, - "140": 
1038582272.0, - "145": 1020688640.0, - "150": 1039788096.0, - "155": 1032796928.0, - "160": 1020952640.0, - "165": 1032424512.0, - "170": 1017396096.0, - "175": 1033427072.0, - "180": 1036119424.0, - "185": 1030573760.0, - "190": 1035673984.0, - "195": 1034555520.0, - "200": 1040973824.0, - "205": 1048500352.0, - "210": 1054481024.0, - "215": 1025159552.0, - "220": 1044962496.0, - "225": 1038076416.0, - "230": 1026222720.0, - "235": 1051134976.0, - "240": 1029276416.0, - "245": 1031397824.0, - "250": 1027879616.0, - "255": 1016929792.0, - "260": 1045008896.0, - "265": 1021330688.0, - "270": 1030964864.0, - "275": 1036911744.0, - "280": 1031743488.0, - "285": 1015014016.0, - "290": 1018756352.0, - "295": 1017237504.0, - "300": 1034761152.0, - "305": 1032166144.0, - "310": 1035583104.0, - "315": 1012734272.0, - "320": 1008275072.0, - "325": 1042741760.0, - "330": 1042870656.0, - "335": 1033508480.0, - "340": 1014464512.0, - "345": 1042618880.0, - "350": 1031852736.0, - "355": 1050844800.0, - "360": 1030258432.0, - "365": 1034595648.0, - "370": 1019436032.0, - "375": 1022144832.0, - "380": 1021326592.0, - "385": 1025589504.0, - "390": 1023195072.0, - "395": 1019653952.0, - "400": 1033520512.0, - "405": 1023880192.0, - "410": 1017910016.0, - "415": 1024288000.0, - "420": 1020624256.0, - "425": 1025854848.0, - "430": 1033854336.0, - "435": 1028182400.0, - "440": 1022090752.0, - "445": 1036768256.0, - "450": 1024997376.0, - "455": 1013852096.0, - "460": 1022093824.0, - "465": 1041431552.0, - "470": 1029038016.0, - "475": 1010065792.0, - "480": 1047607616.0, - "485": 1029724928.0, - "490": 1044668160.0, - "495": 1025229952.0, - "500": 1037464960.0, - "505": 1032181376.0, - "510": 1042853056.0, - "515": 1026159744.0, - "520": 1013409792.0, - "525": 1035147520.0, - "530": 1016375552.0, - "535": 1040113024.0, - "540": 1035052352.0, - "545": 1032113664.0, - "550": 1018673408.0, - "555": 1008638656.0, - "560": 1011927680.0, - "565": 1041824320.0, - "570": 1034942208.0, - 
"575": 1010199040.0, - "580": 1032210496.0, - "585": 1041262144.0, - "590": 1038867968.0, - "595": 1035743104.0, - "600": 1023772736.0, - "605": 1032294272.0, - "610": 1037748672.0, - "615": 1005974784.0, - "620": 1040407424.0, - "625": 1045209216.0, - "630": 1034414464.0, - "635": 1028523008.0, - "640": 1022644928.0, - "645": 1035876032.0, - "650": 1009255680.0, - "655": 997757696.0, - "660": 1029710464.0, - "665": 1025532608.0, - "670": 1048812288.0, - "675": 1025202688.0, - "680": 1019340032.0, - "685": 1027832512.0, - "690": 1029230080.0, - "695": 1040024576.0, - "700": 1042031680.0, - "705": 1034382976.0, - "710": 1020441792.0, - "715": 1031472128.0, - "720": 1040274560.0, - "725": 1023279936.0, - "730": 1022792704.0, - "735": 1025085696.0, - "740": 1038382656.0, - "745": 1045205504.0, - "750": 1013180928.0, - "755": 1031644032.0, - "760": 1032783552.0, - "765": 1027135936.0, - "770": 1023967232.0, - "775": 1025895168.0, - "780": 1038166464.0, - "785": 1025486400.0, - "790": 1040810624.0, - "795": 1032531200.0, - "800": 1039592768.0, - "805": 1024318016.0, - "810": 1034725632.0, - "815": 1036000448.0, - "820": 1035671552.0, - "825": 1051375360.0, - "830": 1035406784.0, - "835": 1022547776.0, - "840": 1036875648.0, - "845": 1025700352.0, - "850": 1048529920.0, - "855": 1014986432.0, - "860": 1033098624.0, - "865": 1031543040.0, - "870": 1040902912.0, - "875": 1023938304.0, - "880": 1028395904.0, - "885": 1054406656.0, - "890": 1019537152.0, - "895": 1045189824.0, - "900": 1031772928.0, - "905": 1020970688.0, - "910": 1031386112.0, - "915": 1032926912.0, - "920": 1038459392.0, - "925": 1026754560.0, - "930": 1025378752.0, - "935": 1031126464.0, - "940": 1057933568.0, - "945": 1029823104.0, - "950": 1014412480.0, - "955": 1032173696.0, - "960": 1026152064.0, - "965": 1062678976.0, - "970": 1030096128.0, - "975": 1036903680.0, - "980": 1027049216.0, - "985": 1030676736.0, - "990": 1020676864.0, - "995": 1042301760.0, - "1000": 1036831616.0, - "1005": 1050206080.0, 
- "1010": 1023801984.0, - "1015": 1020539008.0, - "1020": 1042587392.0, - "1025": 1037943808.0, - "1030": 1049210048.0, - "1035": 1012483456.0, - "1040": 1023092032.0, - "1045": 1039520768.0, - "1050": 1026825728.0, - "1055": 1034861184.0, - "1060": 1046128704.0, - "1065": 1036804096.0, - "1070": 1019994880.0, - "1075": 1025341696.0, - "1080": 1014979200.0, - "1085": 1030007744.0, - "1090": 1029062016.0, - "1095": 1020309888.0, - "1100": 1039835008.0, - "1105": 1048600064.0, - "1110": 1020704448.0, - "1115": 1024782720.0, - "1120": 1061896576.0, - "1125": 1043311616.0, - "1130": 1031219456.0, - "1135": 1041360512.0, - "1140": 1021486272.0, - "1145": 1051696128.0, - "1150": 1035590400.0, - "1155": 1029590528.0, - "1160": 1042564800.0, - "1165": 1026810496.0, - "1170": 1018001408.0, - "1175": 1033684032.0, - "1180": 1035633536.0, - "1185": 1023928960.0, - "1190": 1033160320.0, - "1195": 1024228608.0, - "1200": 1039116544.0, - "1205": 1031740800.0, - "1210": 1053250560.0, - "1215": 1024617600.0, - "1220": 1009041280.0, - "1225": 1036679680.0, - "1230": 1041257984.0, - "1235": 1053974912.0, - "1240": 1030356224.0, - "1245": 1017684864.0, - "1250": 1022772992.0, - "1255": 1033439104.0, - "1260": 1034284736.0, - "1265": 1034003840.0, - "1270": 1037323264.0, - "1275": 1029345792.0, - "1280": 1046489856.0, - "1285": 1028285120.0, - "1290": 1036578176.0, - "1295": 1032421696.0, - "1300": 1033065728.0, - "1305": 1030027008.0, - "1310": 1051262976.0, - "1315": 1035373184.0, - "1320": 1028263936.0, - "1325": 1049972736.0, - "1330": 1030133376.0, - "1335": 1031164800.0, - "1340": 1012758912.0, - "1345": 1044639232.0, - "1350": 1034957312.0, - "1355": 1033623744.0, - "1360": 1036683392.0, - "1365": 1038588672.0, - "1370": 1039851904.0, - "1375": 1034117632.0, - "1380": 1022886656.0, - "1385": 1018084096.0, - "1390": 1049054400.0, - "1395": 1034868352.0, - "1400": 1034998144.0, - "1405": 1034131456.0, - "1410": 1036368256.0, - "1415": 1043577600.0, - "1420": 1026111104.0, - 
"1425": 1033320320.0, - "1430": 1012808128.0, - "1435": 1038394880.0, - "1440": 1020971904.0, - "1445": 1032459904.0, - "1450": 1014039296.0, - "1455": 1011673984.0, - "1460": 1043275904.0, - "1465": 1014361600.0, - "1470": 1020655360.0, - "1475": 1030231296.0, - "1480": 1029370496.0, - "1485": 1022997696.0, - "1490": 1026783360.0, - "1495": 1021815744.0, - "1500": 1027177088.0, - "1505": 1034882880.0, - "1510": 1014397120.0, - "1515": 1042136832.0, - "1520": 1025792640.0, - "1525": 1036335872.0, - "1530": 1039948992.0, - "1535": 1047640192.0, - "1540": 1043539840.0, - "1545": 1034043520.0, - "1550": 1016108736.0, - "1555": 1015573504.0, - "1560": 1055021824.0, - "1565": 1015593728.0, - "1570": 1018243840.0, - "1575": 1032515456.0, - "1580": 1012984768.0, - "1585": 1025327680.0, - "1590": 1034127360.0, - "1595": 1057393664.0, - "1600": 1026867584.0, - "1605": 1019994624.0, - "1610": 1031268736.0, - "1615": 1035274880.0, - "1620": 1018016000.0, - "1625": 1028272512.0, - "1630": 1027205376.0, - "1635": 1023799040.0, - "1640": 1034120832.0, - "1645": 1021814528.0, - "1650": 1015262080.0, - "1655": 1018280064.0, - "1660": 1047982976.0, - "1665": 1027060352.0, - "1670": 1048219904.0, - "1675": 1021102912.0, - "1680": 1043288320.0, - "1685": 1052719360.0, - "1690": 1026724032.0, - "1695": 1040385280.0, - "1700": 1018036352.0, - "1705": 1020480640.0, - "1710": 1021024448.0, - "1715": 1026932992.0, - "1720": 1028350208.0, - "1725": 1034363136.0, - "1730": 1013692352.0, - "1735": 1018429696.0, - "1740": 1057257024.0, - "1745": 1029261952.0, - "1750": 1024357888.0, - "1755": 1029970112.0, - "1760": 1022192512.0, - "1765": 1040477056.0, - "1770": 1029669760.0, - "1775": 1046196864.0, - "1780": 1021955712.0, - "1785": 1035109376.0, - "1790": 1028263808.0, - "1795": 1031023616.0, - "1800": 1028300480.0, - "1805": 1025669248.0, - "1810": 1021556096.0, - "1815": 1033440256.0, - "1820": 1034885888.0, - "1825": 1020208448.0, - "1830": 1013885632.0, - "1835": 1031382272.0, - "1840": 
1040391040.0, - "1845": 1034828800.0, - "1850": 1014480064.0, - "1855": 1019418816.0, - "1860": 1019569536.0, - "1865": 1035942400.0, - "1870": 1026242368.0, - "1875": 1031525248.0, - "1880": 1011590784.0, - "1885": 1041065536.0, - "1890": 1035000704.0, - "1895": 1028959488.0, - "1900": 1033997568.0, - "1905": 1027123776.0, - "1910": 1029217792.0, - "1915": 1030492864.0, - "1920": 1042920384.0, - "1925": 1038419392.0, - "1930": 1019304512.0, - "1935": 1032535936.0, - "1940": 1027806336.0, - "1945": 1034205056.0, - "1950": 1006036224.0, - "1955": 1032577600.0, - "1960": 1015720256.0, - "1965": 1029088512.0, - "1970": 1021554176.0, - "1975": 1034048000.0, - "1980": 1029366912.0, - "1985": 1027784960.0, - "1990": 1020947840.0, - "1995": 1010422912.0, - "2000": 1039617152.0, - "2005": 1001486208.0, - "2010": 1020422912.0, - "2015": 1032034048.0, - "2020": 1036298624.0, - "2025": 1037172352.0, - "2030": 1029770752.0, - "2035": 1040333312.0, - "2040": 1030112768.0, - "2045": 1032700800.0, - "2050": 1008016064.0, - "2055": 1045723840.0, - "2060": 1028142400.0, - "2065": 1038799488.0, - "2070": 1045645184.0, - "2075": 1035237952.0, - "2080": 1022882304.0, - "2085": 1024815424.0, - "2090": 1034363392.0, - "2095": 1005220672.0, - "2100": 1034644096.0, - "2105": 1035581312.0, - "2110": 1030685952.0, - "2115": 1029798528.0, - "2120": 1018846080.0, - "2125": 1021863168.0, - "2130": 1026638080.0, - "2135": 1053279488.0, - "2140": 1017060608.0, - "2145": 1019635072.0, - "2150": 1037130752.0, - "2155": 1033302784.0, - "2160": 1049035776.0, - "2165": 1039682816.0, - "2170": 1020308096.0, - "2175": 1027338752.0, - "2180": 1041703168.0, - "2185": 1028895360.0, - "2190": 1029309888.0, - "2195": 1028944768.0, - "2200": 1039639680.0, - "2205": 1036972288.0, - "2210": 1031740544.0, - "2215": 1021404480.0, - "2220": 1020910848.0, - "2225": 1033403072.0, - "2230": 1014201856.0, - "2235": 1029395968.0, - "2240": 1029885184.0, - "2245": 1026005824.0, - "2250": 1046268800.0, - "2255": 
1032951936.0, - "2260": 1047494592.0, - "2265": 1023721088.0, - "2270": 1022566144.0, - "2275": 1028537600.0, - "2280": 1034973568.0, - "2285": 1031819968.0, - "2290": 1038650048.0, - "2295": 1028816000.0, - "2300": 1034450496.0, - "2305": 1032314496.0, - "2310": 1013586496.0, - "2315": 1048182656.0, - "2320": 1035210368.0, - "2325": 1046966016.0, - "2330": 1014696192.0, - "2335": 1027382272.0, - "2340": 1036736512.0, - "2345": 1020186944.0, - "2350": 1031017728.0, - "2355": 1037474240.0, - "2360": 1032608128.0, - "2365": 1028041856.0, - "2370": 1021004224.0, - "2375": 1022912000.0, - "2380": 1048556224.0, - "2385": 1044140736.0, - "2390": 1021986816.0, - "2395": 1020595584.0, - "2400": 1026930816.0, - "2405": 1038387200.0, - "2410": 1045395200.0, - "2415": 1048454656.0, - "2420": 1032227712.0, - "2425": 1029562176.0, - "2430": 1030386176.0, - "2435": 1029217856.0, - "2440": 1029168000.0, - "2445": 1033132160.0, - "2450": 1038557824.0, - "2455": 1034721536.0, - "2460": 1039984192.0, - "2465": 1032500992.0, - "2470": 1024143872.0, - "2475": 1016539520.0, - "2480": 1023613248.0, - "2485": 1021030592.0, - "2490": 1035920448.0, - "2495": 1032967360.0, - "2500": 1028107008.0, - "2505": 1015385600.0, - "2510": 1030967104.0, - "2515": 1025700096.0, - "2520": 1033326208.0, - "2525": 1029692800.0, - "2530": 1023986560.0, - "2535": 1071069696.0, - "2540": 1024537984.0, - "2545": 1033798784.0, - "2550": 1029448064.0, - "2555": 1029183488.0, - "2560": 1018115072.0, - "2565": 1031598528.0, - "2570": 1022847232.0, - "2575": 1026503104.0, - "2580": 1038622592.0, - "2585": 1025899456.0, - "2590": 1026100800.0, - "2595": 1046623104.0, - "2600": 1031103360.0, - "2605": 1001910656.0, - "2610": 1028423360.0, - "2615": 1025564544.0, - "2620": 1038651392.0, - "2625": 1026996352.0, - "2630": 1036831424.0, - "2635": 1021198400.0, - "2640": 1021865856.0, - "2645": 1039153408.0, - "2650": 1025943488.0, - "2655": 1013255808.0, - "2660": 1032645248.0, - "2665": 1035218048.0, - "2670": 
1036437632.0, - "2675": 1039296064.0, - "2680": 1041661696.0, - "2685": 1034565504.0, - "2690": 1058871168.0, - "2695": 1019879552.0, - "2700": 1062626816.0, - "2705": 1035376320.0, - "2710": 1019542400.0, - "2715": 1031885824.0, - "2720": 1016403200.0, - "2725": 1040594688.0, - "2730": 1019586688.0, - "2735": 1030889856.0, - "2740": 1029290752.0, - "2745": 1040687744.0, - "2750": 1023880448.0, - "2755": 1011865664.0, - "2760": 1027684864.0, - "2765": 1030882240.0, - "2770": 1033119872.0, - "2775": 1026332352.0, - "2780": 1033684224.0, - "2785": 1024589888.0, - "2790": 1033734272.0, - "2795": 1045949184.0, - "2800": 1040286016.0, - "2805": 1019944192.0, - "2810": 1031449600.0, - "2815": 1030932736.0, - "2820": 1037855616.0, - "2825": 1041684096.0, - "2830": 1030459904.0, - "2835": 1013508352.0, - "2840": 1031449600.0, - "2845": 1030129920.0, - "2850": 1026617600.0, - "2855": 1024705280.0, - "2860": 1031700096.0, - "2865": 1027428800.0, - "2870": 1026690048.0, - "2875": 1012777024.0, - "2880": 1038301568.0, - "2885": 1017901184.0, - "2890": 1044200064.0, - "2895": 1036459136.0, - "2900": 1030652928.0, - "2905": 1035957376.0, - "2910": 1038718272.0, - "2915": 1039385408.0, - "2920": 1034781248.0, - "2925": 1043267840.0, - "2930": 1038229696.0, - "2935": 1021222144.0, - "2940": 1042307456.0, - "2945": 1045232384.0, - "2950": 1047525952.0, - "2955": 1034172928.0, - "2960": 1020891904.0, - "2965": 1027307840.0, - "2970": 1038796288.0, - "2975": 1034007296.0, - "2980": 1049590400.0, - "2985": 1034846016.0, - "2990": 1026008576.0, - "2995": 1034919296.0, - "3000": 1039017856.0, - "3005": 1038158848.0, - "3010": 1010907712.0, - "3015": 1044976064.0, - "3020": 1034050688.0, - "3025": 1037763840.0, - "3030": 1027722816.0, - "3035": 1041821056.0, - "3040": 1035311872.0, - "3045": 1027255296.0, - "3050": 1029708032.0, - "3055": 1028029568.0, - "3060": 1049976960.0, - "3065": 1024067200.0, - "3070": 1011545728.0, - "3075": 1042846272.0, - "3080": 1036094912.0, - "3085": 
1030387456.0, - "3090": 1035262976.0, - "3095": 1013803008.0, - "3100": 1030144896.0, - "3105": 1017609088.0, - "3110": 1033370816.0, - "3115": 1023737728.0, - "3120": 1024877504.0, - "3125": 1046537216.0, - "3130": 1024676160.0, - "3135": 1025722496.0, - "3140": 1043778176.0, - "3145": 1044372672.0, - "3150": 1016483328.0, - "3155": 1042487936.0, - "3160": 1026834688.0, - "3165": 1031199360.0, - "3170": 1024332800.0, - "3175": 1024368640.0, - "3180": 1018204288.0, - "3185": 1034352512.0, - "3190": 1019221888.0, - "3195": 1028425408.0, - "3200": 1036080640.0, - "3205": 1016076160.0, - "3210": 1034109312.0, - "3215": 1031349312.0, - "3220": 1040833664.0, - "3225": 1022835008.0, - "3230": 1033255744.0, - "3235": 1019975488.0, - "3240": 1038131840.0, - "3245": 1031643136.0, - "3250": 1022390656.0, - "3255": 1032876672.0, - "3260": 1037751616.0, - "3265": 1021622656.0, - "3270": 1031242880.0, - "3275": 1038461184.0, - "3280": 1023236992.0, - "3285": 1031615424.0, - "3290": 1045247616.0, - "3295": 1043177536.0, - "3300": 1035084224.0, - "3305": 1042662400.0, - "3310": 1058092096.0, - "3315": 1024282880.0, - "3320": 1046015296.0, - "3325": 1023179008.0, - "3330": 1048037248.0, - "3335": 1036690560.0, - "3340": 1042123392.0, - "3345": 1030897920.0, - "3350": 1020621696.0, - "3355": 1025960576.0, - "3360": 1030305344.0, - "3365": 1031171520.0, - "3370": 1036454144.0, - "3375": 1023472384.0, - "3380": 1032383744.0, - "3385": 1038081536.0, - "3390": 1052811072.0, - "3395": 1012090496.0, - "3400": 1019209600.0, - "3405": 1021780224.0, - "3410": 1028433728.0, - "3415": 1058222400.0, - "3420": 1033492480.0, - "3425": 1029580352.0, - "3430": 1021150976.0, - "3435": 1034991872.0, - "3440": 1017961600.0, - "3445": 1025537280.0, - "3450": 1032254336.0, - "3455": 1036261312.0, - "3460": 1052071808.0, - "3465": 1027114240.0, - "3470": 1043729536.0, - "3475": 1033265792.0, - "3480": 1026619776.0, - "3485": 1029215232.0, - "3490": 1041041408.0, - "3495": 1019252224.0, - "3500": 
1032059904.0, - "3505": 1025753728.0, - "3510": 1044367616.0, - "3515": 1013817280.0, - "3520": 1021846400.0, - "3525": 1032175552.0, - "3530": 1029789056.0, - "3535": 1034568704.0, - "3540": 1017731456.0, - "3545": 1035658880.0, - "3550": 1024535296.0, - "3555": 1035866112.0, - "3560": 1029737600.0, - "3565": 1028900160.0, - "3570": 1046029888.0, - "3575": 1039186304.0, - "3580": 1010838336.0, - "3585": 1031737728.0, - "3590": 1041450688.0, - "3595": 1037636800.0, - "3600": 1032763584.0, - "3605": 1045822272.0, - "3610": 1039235200.0, - "3615": 1036870144.0, - "3620": 1026929664.0, - "3625": 1033931136.0, - "3630": 1017582464.0, - "3635": 1026629056.0, - "3640": 1039529088.0, - "3645": 1022655872.0, - "3650": 1036842624.0, - "3655": 1023990144.0, - "3660": 1014987456.0, - "3665": 1026118784.0, - "3670": 1041672448.0, - "3675": 1033250304.0, - "3680": 1015353984.0, - "3685": 1029122304.0, - "3690": 1026204416.0, - "3695": 1043800832.0, - "3700": 1028613504.0, - "3705": 1049485312.0, - "3710": 1027180672.0, - "3715": 1016134912.0, - "3720": 1040818560.0, - "3725": 1032763776.0, - "3730": 1030920960.0, - "3735": 1019008640.0, - "3740": 1023825600.0, - "3745": 1046289152.0, - "3750": 1034462336.0, - "3755": 1032090048.0, - "3760": 1019366912.0, - "3765": 1031916736.0, - "3770": 1026677120.0, - "3775": 1035708288.0, - "3780": 1030671104.0, - "3785": 1027208128.0, - "3790": 1019584064.0, - "3795": 1030306048.0, - "3800": 1035614976.0, - "3805": 1035423360.0, - "3810": 1033294144.0, - "3815": 1033988608.0, - "3820": 1041105792.0, - "3825": 1024534976.0, - "3830": 1037630528.0, - "3835": 1040347968.0, - "3840": 1023445888.0, - "3845": 1048466688.0, - "3850": 1052489280.0, - "3855": 1028907264.0, - "3860": 1019532672.0, - "3865": 1035487744.0, - "3870": 1028491712.0, - "3875": 1041164800.0, - "3880": 1048854912.0, - "3885": 1027725248.0, - "3890": 1027487616.0, - "3895": 1034190592.0, - "3900": 1027645312.0, - "3905": 1027976128.0, - "3910": 1041572480.0, - "3915": 
1043995392.0, - "3920": 1041063424.0, - "3925": 1030836160.0, - "3930": 1027072896.0, - "3935": 1033782016.0, - "3940": 1042275712.0, - "3945": 1036248064.0, - "3950": 1021430976.0, - "3955": 1036304128.0, - "3960": 1024184192.0, - "3965": 1027065856.0, - "3970": 1015984640.0, - "3975": 1041421632.0, - "3980": 1032455488.0, - "3985": 1037680640.0, - "3990": 1038684992.0, - "3995": 1023654528.0, - "4000": 1054410240.0, - "4005": 1029983424.0, - "4010": 1025138112.0, - "4015": 1030978560.0, - "4020": 1018472448.0, - "4025": 1027124352.0, - "4030": 1010306816.0, - "4035": 1038641088.0, - "4040": 1022256640.0, - "4045": 1025038208.0, - "4050": 1032348800.0, - "4055": 1022420864.0, - "4060": 1024520768.0, - "4065": 1032871168.0, - "4070": 1027791232.0, - "4075": 1025596928.0, - "4080": 1029366656.0, - "4085": 1020823552.0, - "4090": 1033322496.0, - "4095": 1024142656.0, - "4100": 1040948864.0, - "4105": 1027266496.0, - "4110": 1038791424.0, - "4115": 1023497088.0, - "4120": 1038943168.0, - "4125": 1048274176.0, - "4130": 1021490752.0, - "4135": 1034570880.0, - "4140": 1034613824.0, - "4145": 1044447232.0, - "4150": 1000353664.0, - "4155": 1028363392.0, - "4160": 1024242624.0, - "4165": 1033688704.0, - "4170": 1018888000.0, - "4175": 1026492608.0, - "4180": 1045409024.0, - "4185": 1033631616.0, - "4190": 1029574592.0, - "4195": 1038777984.0, - "4200": 1025102336.0, - "4205": 1019074816.0, - "4210": 1029560704.0, - "4215": 1032269184.0, - "4220": 1026242048.0, - "4225": 1031925888.0, - "4230": 1030269824.0, - "4235": 1027603328.0, - "4240": 1031480832.0, - "4245": 1028765056.0, - "4250": 1026987008.0, - "4255": 1021240064.0, - "4260": 1042082432.0, - "4265": 1025411200.0, - "4270": 1030169984.0, - "4275": 1012472448.0, - "4280": 1044505600.0, - "4285": 1019898304.0, - "4290": 1033058560.0, - "4295": 1033596032.0, - "4300": 1031638912.0, - "4305": 1023847936.0, - "4310": 1021568512.0, - "4315": 1047221504.0, - "4320": 1026520576.0, - "4325": 1005865600.0, - "4330": 
1037666688.0, - "4335": 1022006464.0, - "4340": 1029009920.0, - "4345": 1033474496.0, - "4350": 1036886144.0, - "4355": 1026808832.0, - "4360": 1022938240.0, - "4365": 1028779648.0, - "4370": 1029624704.0, - "4375": 1042196864.0, - "4380": 1016100096.0, - "4385": 1045551296.0, - "4390": 1026270848.0, - "4395": 1029796416.0, - "4400": 1047365760.0, - "4405": 1029297344.0, - "4410": 1033424256.0, - "4415": 1028298304.0, - "4420": 1028148928.0, - "4425": 1033575552.0, - "4430": 1031374592.0, - "4435": 1028571136.0, - "4440": 1033123328.0, - "4445": 1028293504.0, - "4450": 1052210944.0, - "4455": 1026286080.0, - "4460": 1034885888.0, - "4465": 1031725696.0, - "4470": 1035446528.0, - "4475": 1036971712.0, - "4480": 1025117824.0, - "4485": 1034104960.0, - "4490": 1024630912.0, - "4495": 1047974912.0, - "4500": 1024707840.0, - "4505": 1038850048.0, - "4510": 1043723776.0, - "4515": 1044276736.0, - "4520": 1036872320.0, - "4525": 1058073536.0, - "4530": 1030973568.0, - "4535": 1032592256.0, - "4540": 1036428160.0, - "4545": 1025726400.0, - "4550": 1021749312.0, - "4555": 1037546112.0, - "4560": 1020099200.0, - "4565": 1036055296.0, - "4570": 1020501120.0, - "4575": 1050412608.0, - "4580": 1010437888.0, - "4585": 1022960768.0, - "4590": 1039710272.0, - "4595": 1023274880.0, - "4600": 1042477824.0, - "4605": 1039746688.0, - "4610": 1046104192.0, - "4615": 1017999744.0, - "4620": 1044734592.0, - "4625": 1030479104.0, - "4630": 1027260800.0, - "4635": 1026995200.0, - "4640": 1034901248.0, - "4645": 1036420352.0, - "4650": 1033711488.0, - "4655": 1035461056.0, - "4660": 1035324800.0, - "4665": 1020265664.0, - "4670": 1020057344.0, - "4675": 1054848768.0, - "4680": 1024895872.0, - "4685": 1027820160.0, - "4690": 1034449664.0, - "4695": 1039151744.0, - "4700": 1038865024.0, - "4705": 1027655808.0, - "4710": 1020522560.0, - "4715": 1031825536.0, - "4720": 1030300416.0, - "4725": 1030298368.0, - "4730": 1044096704.0, - "4735": 1046133376.0, - "4740": 1036178112.0, - "4745": 
1039043840.0, - "4750": 1031790528.0, - "4755": 1047723392.0, - "4760": 1026178176.0, - "4765": 1034695040.0, - "4770": 1036521856.0, - "4775": 1029375168.0, - "4780": 1028543488.0, - "4785": 1028414976.0, - "4790": 1019620224.0, - "4795": 1033060160.0, - "4800": 1051866880.0, - "4805": 1015414400.0, - "4810": 1029454336.0, - "4815": 1009572096.0, - "4820": 1041051200.0, - "4825": 1026708608.0, - "4830": 1020450816.0, - "4835": 1051307840.0, - "4840": 1019456512.0, - "4845": 1032315008.0, - "4850": 1036794496.0, - "4855": 1031052736.0, - "4860": 1033131776.0, - "4865": 1032064384.0, - "4870": 1049832576.0, - "4875": 1025110528.0, - "4880": 1048476160.0, - "4885": 1016853056.0, - "4890": 1037317312.0, - "4895": 1024323136.0, - "4900": 1043374208.0, - "4905": 1033397120.0, - "4910": 1032830272.0, - "4915": 1016889856.0, - "4920": 1022294784.0, - "4925": 1034965888.0, - "4930": 1034630016.0, - "4935": 1025885312.0, - "4940": 1048398272.0, - "4945": 1025248576.0, - "4950": 1024208768.0, - "4955": 1007485952.0, - "4960": 1040213824.0, - "4965": 1018775296.0, - "4970": 1014274688.0, - "4975": 1038025472.0, - "4980": 1020917888.0, - "4985": 1029045888.0, - "4990": 1028394816.0, - "4995": 1032020480.0, - "5000": 1039791104.0, - "5005": 1024351552.0, - "5010": 1029147968.0, - "5015": 1021807296.0, - "5020": 1023506944.0, - "5025": 1037603456.0, - "5030": 1041947136.0, - "5035": 1047130304.0, - "5040": 1060956096.0, - "5045": 1032108544.0, - "5050": 1029534336.0, - "5055": 1024552192.0, - "5060": 1035282304.0, - "5065": 1021205504.0, - "5070": 1035756288.0, - "5075": 1015771264.0, - "5080": 1027040064.0, - "5085": 1021792192.0, - "5090": 1034973568.0, - "5095": 1015499712.0, - "5100": 1032257600.0, - "5105": 1017981568.0, - "5110": 1019586304.0, - "5115": 1036063936.0, - "5120": 1032695040.0, - "5125": 1019076992.0, - "5130": 1033404672.0, - "5135": 1041203072.0, - "5140": 1026258752.0, - "5145": 1033705856.0, - "5150": 1022043520.0, - "5155": 1032265664.0, - "5160": 
1039625984.0, - "5165": 1031576448.0, - "5170": 1035555328.0, - "5175": 1026116224.0, - "5180": 1030316032.0, - "5185": 1024495680.0, - "5190": 1019492608.0, - "5195": 1035626496.0, - "5200": 1016905344.0, - "5205": 1013435648.0, - "5210": 1049395456.0, - "5215": 1030833280.0, - "5220": 1025276800.0, - "5225": 1035239936.0, - "5230": 1025930624.0, - "5235": 1025120000.0, - "5240": 1046308224.0, - "5245": 1022740608.0, - "5250": 1027062336.0, - "5255": 1023887360.0, - "5260": 1033821440.0, - "5265": 1045733696.0, - "5270": 1052500480.0, - "5275": 1033018112.0, - "5280": 1030073920.0, - "5285": 1025212608.0, - "5290": 1026575616.0, - "5295": 1032653440.0, - "5300": 1024367872.0, - "5305": 1029634368.0, - "5310": 1033197312.0, - "5315": 1032988992.0, - "5320": 1019521664.0, - "5325": 1022718336.0, - "5330": 1021335168.0, - "5335": 1039275776.0, - "5340": 1037219648.0, - "5345": 1039188096.0, - "5350": 1023701888.0, - "5355": 1029935872.0, - "5360": 1047046080.0, - "5365": 1037426432.0, - "5370": 1024381568.0, - "5375": 1042070656.0, - "5380": 1020368384.0, - "5385": 1021765696.0, - "5390": 1035133184.0, - "5395": 1049653568.0, - "5400": 1026015744.0, - "5405": 1036453120.0, - "5410": 1027635776.0, - "5415": 1042285824.0, - "5420": 1039941888.0, - "5425": 1028381184.0, - "5430": 1043799808.0, - "5435": 1032653312.0, - "5440": 1033384448.0, - "5445": 1034144640.0, - "5450": 1025299328.0, - "5455": 1034079424.0, - "5460": 1026812416.0, - "5465": 1027399552.0, - "5470": 1028969216.0, - "5475": 1037233920.0, - "5480": 1023830272.0, - "5485": 1019186752.0, - "5490": 1030891520.0, - "5495": 1029399424.0, - "5500": 1032681216.0, - "5505": 1018275200.0, - "5510": 1023987648.0, - "5515": 1025156032.0, - "5520": 1039527296.0, - "5525": 1018024576.0, - "5530": 1037663936.0, - "5535": 1031599232.0, - "5540": 1027564544.0, - "5545": 1033212160.0, - "5550": 1032115968.0, - "5555": 1044802304.0, - "5560": 1028511232.0, - "5565": 1029686016.0, - "5570": 1042027776.0, - "5575": 
1025379392.0, - "5580": 1023716736.0, - "5585": 1044093696.0, - "5590": 1041319936.0, - "5595": 1031549824.0, - "5600": 1023400320.0, - "5605": 1040115456.0, - "5610": 1034087552.0, - "5615": 1021042816.0, - "5620": 1031004800.0, - "5625": 1030188544.0, - "5630": 1023502080.0, - "5635": 1026684096.0, - "5640": 1034589120.0, - "5645": 1018655744.0, - "5650": 1052378752.0, - "5655": 1048933504.0, - "5660": 1050077696.0, - "5665": 1033958144.0, - "5670": 1033750016.0, - "5675": 1025392640.0, - "5680": 1039378304.0, - "5685": 1033056576.0, - "5690": 1031464576.0, - "5695": 1021946368.0, - "5700": 1038065664.0, - "5705": 1043684736.0, - "5710": 1057231616.0, - "5715": 1014462848.0, - "5720": 1021258816.0, - "5725": 1041822272.0, - "5730": 1039454912.0, - "5735": 1025128576.0, - "5740": 1026045440.0, - "5745": 1036990208.0, - "5750": 1044552256.0, - "5755": 1011860416.0, - "5760": 1028389568.0, - "5765": 1028245504.0, - "5770": 1021530368.0, - "5775": 1051210240.0, - "5780": 1034984512.0, - "5785": 1037513920.0, - "5790": 1016957184.0, - "5795": 1027873536.0, - "5800": 1029780736.0, - "5805": 1050694912.0, - "5810": 1018478336.0, - "5815": 1036123520.0, - "5820": 1048408704.0, - "5825": 1030977920.0, - "5830": 1031572096.0, - "5835": 1034045440.0, - "5840": 1039843776.0, - "5845": 1021746048.0, - "5850": 1029807744.0, - "5855": 1038789376.0, - "5860": 1031436288.0, - "5865": 1026397568.0, - "5870": 1029861824.0, - "5875": 1032841856.0, - "5880": 1032675968.0, - "5885": 1024576128.0, - "5890": 1026798976.0, - "5895": 1015796160.0, - "5900": 1049707008.0, - "5905": 1025653248.0, - "5910": 1019150720.0, - "5915": 1042739136.0, - "5920": 1028047232.0, - "5925": 1034016448.0, - "5930": 1030963328.0, - "5935": 1038102784.0, - "5940": 1019172864.0, - "5945": 1025130112.0, - "5950": 1035530240.0, - "5955": 1050437184.0, - "5960": 1024548736.0, - "5965": 1029923712.0, - "5970": 1016427776.0, - "5975": 1036682752.0, - "5980": 1024118464.0, - "5985": 1035386624.0, - "5990": 
1010550784.0, - "5995": 1047019200.0, - "6000": 1021245568.0, - "6005": 1040460416.0, - "6010": 1025358720.0, - "6015": 1050179072.0, - "6020": 1039514496.0, - "6025": 1030254592.0, - "6030": 1025931968.0, - "6035": 1021745408.0, - "6040": 1034117056.0, - "6045": 1028282112.0, - "6050": 1020112320.0, - "6055": 1040397056.0, - "6060": 1026347008.0, - "6065": 1022198400.0, - "6070": 1040668416.0, - "6075": 1046037440.0, - "6080": 1038583168.0, - "6085": 1041485568.0, - "6090": 1037205888.0, - "6095": 1036282880.0, - "6100": 1030454720.0, - "6105": 1019216640.0, - "6110": 1035357824.0, - "6115": 1019452544.0, - "6120": 1032188800.0, - "6125": 1020922624.0, - "6130": 1012013952.0, - "6135": 1038733824.0, - "6140": 1041736896.0, - "6145": 1041917056.0, - "6150": 1018958208.0, - "6155": 1024649344.0, - "6160": 1047972160.0, - "6165": 1050408832.0, - "6170": 1032505344.0, - "6175": 1045793664.0, - "6180": 1040067072.0, - "6185": 1029710464.0, - "6190": 1023293760.0, - "6195": 1050897728.0, - "6200": 1035035776.0, - "6205": 1036275584.0, - "6210": 1039772736.0, - "6215": 1033200256.0, - "6220": 1026162432.0, - "6225": 1036741120.0, - "6230": 1025144192.0, - "6235": 1019352832.0, - "6240": 1057104384.0, - "6245": 1018413952.0, - "6250": 1035337344.0, - "6255": 1025380992.0, - "6260": 1034863744.0, - "6265": 1027703424.0, - "6270": 1042116480.0, - "6275": 1037659008.0, - "6280": 1018270208.0, - "6285": 1032642304.0, - "6290": 1038598592.0, - "6295": 1031803456.0, - "6300": 1034635200.0, - "6305": 1011066624.0, - "6310": 1039458624.0, - "6315": 1030054272.0, - "6320": 1030534208.0, - "6325": 1038642496.0, - "6330": 1033908800.0, - "6335": 1032297856.0, - "6340": 1033544448.0, - "6345": 1031036416.0, - "6350": 1037451264.0, - "6355": 1028075968.0, - "6360": 1043313408.0, - "6365": 1025223808.0, - "6370": 1033939200.0, - "6375": 1036038720.0, - "6380": 1029108096.0, - "6385": 1025395072.0, - "6390": 1025517952.0, - "6395": 1048611584.0, - "6400": 1040734976.0, - "6405": 
1024247936.0, - "6410": 1017489280.0, - "6415": 1042827072.0, - "6420": 1025202432.0, - "6425": 1027164928.0, - "6430": 1040568256.0, - "6435": 1022908800.0, - "6440": 1047994624.0, - "6445": 1036089088.0, - "6450": 1048532224.0, - "6455": 1037272320.0, - "6460": 1036750912.0, - "6465": 1033652032.0, - "6470": 1018135232.0, - "6475": 1034691648.0, - "6480": 1028994048.0, - "6485": 1033258880.0, - "6490": 1035638656.0, - "6495": 1024470016.0, - "6500": 1020572096.0, - "6505": 1059327104.0, - "6510": 1020472576.0, - "6515": 1018688064.0, - "6520": 1051470592.0, - "6525": 1035544512.0, - "6530": 1027897216.0, - "6535": 1022722240.0, - "6540": 1023273984.0, - "6545": 1033173120.0, - "6550": 1029488512.0, - "6555": 1029575296.0, - "6560": 1056438784.0, - "6565": 1054295040.0, - "6570": 1032319040.0, - "6575": 1041208320.0, - "6580": 1028134400.0, - "6585": 1036504832.0, - "6590": 1042456192.0, - "6595": 1038568832.0, - "6600": 1031388096.0, - "6605": 1045715456.0, - "6610": 1034713472.0, - "6615": 1015576448.0, - "6620": 1039115136.0, - "6625": 1054654208.0, - "6630": 1043092928.0, - "6635": 1032226304.0, - "6640": 1016738496.0, - "6645": 1016178816.0, - "6650": 1034692672.0, - "6655": 1031753472.0, - "6660": 1041401920.0, - "6665": 1024657984.0, - "6670": 1023820032.0, - "6675": 1038306176.0, - "6680": 1025624064.0, - "6685": 1045394048.0, - "6690": 1046390720.0, - "6695": 1027754368.0, - "6700": 1033473920.0, - "6705": 1038857152.0, - "6710": 1047485888.0, - "6715": 1043229440.0, - "6720": 1022995456.0, - "6725": 1018910144.0, - "6730": 1027525504.0, - "6735": 1016937856.0, - "6740": 1027238016.0, - "6745": 1030263680.0, - "6750": 1006373760.0, - "6755": 1034765056.0, - "6760": 1040735296.0, - "6765": 1023827008.0, - "6770": 1036441344.0, - "6775": 1019627712.0, - "6780": 1043723904.0, - "6785": 1037409280.0, - "6790": 1029403072.0, - "6795": 1026349440.0, - "6800": 1036628224.0, - "6805": 1024579712.0, - "6810": 1042340544.0, - "6815": 1035274112.0, - "6820": 
1022594880.0, - "6825": 1034793344.0, - "6830": 1029862400.0, - "6835": 1041609600.0, - "6840": 1042283776.0, - "6845": 1018954624.0, - "6850": 1032171136.0, - "6855": 1034434752.0, - "6860": 1042054848.0, - "6865": 1021813568.0, - "6870": 1037015424.0, - "6875": 1030379968.0, - "6880": 1029360768.0, - "6885": 1030435968.0, - "6890": 1039890432.0, - "6895": 1027267712.0, - "6900": 1035174016.0, - "6905": 1043975424.0, - "6910": 1019763072.0, - "6915": 1017476608.0, - "6920": 1017184256.0, - "6925": 1030650688.0, - "6930": 1036672384.0, - "6935": 1042835712.0, - "6940": 1040313216.0, - "6945": 1044196992.0, - "6950": 1040513472.0, - "6955": 1036112704.0, - "6960": 1036436224.0, - "6965": 1019161024.0, - "6970": 1034729088.0, - "6975": 1019134464.0, - "6980": 1028436160.0, - "6985": 1023240128.0, - "6990": 1026994688.0, - "6995": 1027547520.0, - "7000": 1058819840.0, - "7005": 1013737856.0, - "7010": 1028959488.0, - "7015": 1037288768.0, - "7020": 1011880576.0, - "7025": 1017313280.0, - "7030": 1028301440.0, - "7035": 1035955392.0, - "7040": 1042966016.0, - "7045": 1028185856.0, - "7050": 1017979584.0, - "7055": 1035088000.0, - "7060": 1051802624.0, - "7065": 1007664640.0, - "7070": 1035819008.0, - "7075": 1031039552.0, - "7080": 1026143296.0, - "7085": 1044906432.0, - "7090": 1046261760.0, - "7095": 1043760512.0, - "7100": 1035089024.0, - "7105": 1049143296.0, - "7110": 1010962944.0, - "7115": 1033869504.0, - "7120": 1031267456.0, - "7125": 1037496832.0, - "7130": 1024881856.0, - "7135": 1031991808.0, - "7140": 1019090176.0, - "7145": 1033081088.0, - "7150": 1037554112.0, - "7155": 1015729728.0, - "7160": 1024724608.0, - "7165": 1030895808.0, - "7170": 1037367808.0, - "7175": 1028816896.0, - "7180": 1037633280.0, - "7185": 1016174080.0, - "7190": 1019808128.0, - "7195": 1040915392.0, - "7200": 1041375360.0, - "7205": 1026538240.0, - "7210": 1022638720.0, - "7215": 1041890560.0, - "7220": 1017742720.0, - "7225": 1027296640.0, - "7230": 1030200448.0, - "7235": 
1035726848.0, - "7240": 1037854848.0, - "7245": 1023971008.0, - "7250": 1044708096.0, - "7255": 1031900480.0, - "7260": 1030128256.0, - "7265": 1036887104.0, - "7270": 1050097152.0, - "7275": 1029225216.0, - "7280": 1020231808.0, - "7285": 1029842048.0, - "7290": 1017219328.0, - "7295": 1029139584.0, - "7300": 1031533824.0, - "7305": 1027298176.0, - "7310": 1029089664.0, - "7315": 1022782272.0, - "7320": 1036458176.0, - "7325": 1036851840.0, - "7330": 1021706496.0, - "7335": 1030715904.0, - "7340": 1039382976.0, - "7345": 1040177664.0, - "7350": 1034973568.0, - "7355": 1033656320.0, - "7360": 1031254912.0, - "7365": 1048742016.0, - "7370": 1027298304.0, - "7375": 1041854848.0, - "7380": 1016725760.0, - "7385": 1017578368.0, - "7390": 1017234944.0, - "7395": 1046793600.0, - "7400": 1048441216.0, - "7405": 1013394304.0, - "7410": 1017386368.0, - "7415": 1017815360.0, - "7420": 1028043008.0, - "7425": 1012840576.0, - "7430": 1034042368.0, - "7435": 1032530432.0, - "7440": 1002692928.0, - "7445": 1034451200.0, - "7450": 1039304832.0, - "7455": 1019027008.0, - "7460": 1014740928.0, - "7465": 1027204736.0, - "7470": 1030422784.0, - "7475": 1033792064.0, - "7480": 1043317376.0, - "7485": 1038215168.0, - "7490": 1049000960.0, - "7495": 1028982720.0, - "7500": 1027426816.0, - "7505": 1028695936.0, - "7510": 1048886528.0, - "7515": 1035648704.0, - "7520": 1017198848.0, - "7525": 1036572736.0, - "7530": 1029261952.0, - "7535": 1027190144.0, - "7540": 1028338048.0, - "7545": 1025986304.0, - "7550": 1023025856.0, - "7555": 1033025344.0, - "7560": 1031404672.0, - "7565": 1022710528.0, - "7570": 1037591552.0, - "7575": 1022603136.0, - "7580": 1018123584.0, - "7585": 1033054208.0, - "7590": 1010993280.0, - "7595": 1018260352.0, - "7600": 1049904448.0, - "7605": 1037361216.0, - "7610": 1040415744.0, - "7615": 1035247488.0, - "7620": 1024230912.0, - "7625": 1020317184.0, - "7630": 1034939584.0, - "7635": 1043224192.0, - "7640": 1033491520.0, - "7645": 1034444608.0, - "7650": 
1039804800.0, - "7655": 1031240576.0, - "7660": 1056628096.0, - "7665": 1031076096.0, - "7670": 1033685120.0, - "7675": 1030681600.0, - "7680": 1035398720.0, - "7685": 1018661760.0, - "7690": 1031921024.0, - "7695": 1025858880.0, - "7700": 1017715200.0, - "7705": 1036531200.0, - "7710": 1029893248.0, - "7715": 1053230656.0, - "7720": 1019514240.0, - "7725": 1042193216.0, - "7730": 1035620992.0, - "7735": 1020726144.0, - "7740": 1045576128.0, - "7745": 1026932992.0, - "7750": 1048550208.0, - "7755": 1022539264.0, - "7760": 1049532032.0, - "7765": 1029370176.0, - "7770": 1018375296.0, - "7775": 1021364672.0, - "7780": 1039770624.0, - "7785": 1039914112.0, - "7790": 1030516992.0, - "7795": 1039353728.0, - "7800": 1028187904.0, - "7805": 1027635776.0, - "7810": 1020970368.0, - "7815": 1035878400.0, - "7820": 1017666240.0, - "7825": 1018067392.0, - "7830": 1035104128.0, - "7835": 1044507648.0, - "7840": 1027836224.0, - "7845": 1032101504.0, - "7850": 1034609408.0, - "7855": 1025464832.0, - "7860": 1059051648.0, - "7865": 1016626240.0, - "7870": 1033729408.0, - "7875": 1044185600.0, - "7880": 1029084352.0, - "7885": 1040308288.0, - "7890": 1029556480.0, - "7895": 1032947008.0, - "7900": 1021409216.0, - "7905": 1020955904.0, - "7910": 1008993856.0, - "7915": 1023120768.0, - "7920": 1023070976.0, - "7925": 1030094080.0, - "7930": 1020712704.0, - "7935": 1019443776.0, - "7940": 1017809152.0, - "7945": 1014447552.0, - "7950": 1026303616.0, - "7955": 1034518272.0, - "7960": 1056026304.0, - "7965": 1031047872.0, - "7970": 1030417152.0, - "7975": 1022189888.0, - "7980": 1034474624.0, - "7985": 1047305024.0, - "7990": 1032066176.0, - "7995": 1044264704.0, - "8000": 1028876672.0, - "8005": 1028045440.0, - "8010": 1050665408.0, - "8015": 1019758976.0, - "8020": 1043297408.0, - "8025": 1039018560.0, - "8030": 1030868800.0, - "8035": 1045304192.0, - "8040": 1026310784.0, - "8045": 1024970368.0, - "8050": 1018405632.0, - "8055": 1033736960.0, - "8060": 1012986816.0, - "8065": 
1022016640.0, - "8070": 1034776064.0, - "8075": 1042759616.0, - "8080": 1027758784.0, - "8085": 1037205376.0, - "8090": 1007008256.0, - "8095": 1030374528.0, - "8100": 1030726016.0, - "8105": 1027794944.0, - "8110": 1031557248.0, - "8115": 1037685248.0, - "8120": 1037692992.0, - "8125": 1031097472.0, - "8130": 1028627072.0, - "8135": 1029680256.0, - "8140": 1049904256.0, - "8145": 1043463552.0, - "8150": 1040087424.0, - "8155": 1046780288.0, - "8160": 1010199040.0, - "8165": 1031657728.0, - "8170": 1024483264.0, - "8175": 1035019648.0, - "8180": 1024460544.0, - "8185": 1021960448.0, - "8190": 1037125504.0, - "8195": 1022368384.0, - "8200": 1035635968.0, - "8205": 1026482496.0, - "8210": 1023888000.0, - "8215": 1014276416.0, - "8220": 1026756224.0, - "8225": 1028540160.0, - "8230": 1027163072.0, - "8235": 1037914048.0, - "8240": 1025909376.0, - "8245": 1024676608.0, - "8250": 1041635840.0, - "8255": 1031908224.0, - "8260": 1032424512.0, - "8265": 1023164800.0, - "8270": 1040172544.0, - "8275": 1038050688.0, - "8280": 1041849216.0, - "8285": 1038804352.0, - "8290": 1024074880.0, - "8295": 1028403648.0, - "8300": 1039341440.0, - "8305": 1012104192.0, - "8310": 1021882048.0, - "8315": 1027307200.0, - "8320": 1021636992.0, - "8325": 1048572160.0, - "8330": 1041039616.0, - "8335": 1037964928.0, - "8340": 1033019136.0, - "8345": 1043864192.0, - "8350": 1037713792.0, - "8355": 1029686400.0, - "8360": 1040667776.0, - "8365": 1027450304.0, - "8370": 1037742848.0, - "8375": 1041986944.0, - "8380": 1037628416.0, - "8385": 1023436160.0, - "8390": 1026068224.0, - "8395": 1028913408.0, - "8400": 1046530560.0, - "8405": 1040179456.0, - "8410": 1034252672.0, - "8415": 1040258688.0, - "8420": 1054730752.0, - "8425": 1031514880.0, - "8430": 1030295680.0, - "8435": 1045707200.0, - "8440": 1026310784.0, - "8445": 1029027392.0, - "8450": 1034201920.0, - "8455": 1031794688.0, - "8460": 1016828032.0, - "8465": 1035163648.0, - "8470": 1035185152.0, - "8475": 1024712960.0, - "8480": 
1035901184.0, - "8485": 1028948480.0, - "8490": 1023079168.0, - "8495": 1037393280.0, - "8500": 1025960064.0, - "8505": 1042724992.0, - "8510": 1028167936.0, - "8515": 1038101056.0, - "8520": 1023107328.0, - "8525": 1037987328.0, - "8530": 1027572800.0, - "8535": 1041656128.0, - "8540": 1033880960.0, - "8545": 1015116160.0, - "8550": 1040188160.0, - "8555": 1016340672.0, - "8560": 1019330048.0, - "8565": 1021410112.0, - "8570": 1032032320.0, - "8575": 1031880128.0, - "8580": 1016011264.0, - "8585": 1030017408.0, - "8590": 1031637248.0, - "8595": 1017776128.0, - "8600": 1002393216.0, - "8605": 1030238336.0, - "8610": 1017532288.0, - "8615": 1023989248.0, - "8620": 1047205696.0, - "8625": 1034231552.0, - "8630": 1030921280.0, - "8635": 1051992512.0, - "8640": 1041134208.0, - "8645": 1024870720.0, - "8650": 1025595392.0, - "8655": 1036904832.0, - "8660": 1031171200.0, - "8665": 1032904640.0, - "8670": 1037400576.0, - "8675": 1029157248.0, - "8680": 1031264704.0, - "8685": 1041197568.0, - "8690": 1035035392.0, - "8695": 1008508416.0, - "8700": 1027459072.0, - "8705": 1051504896.0, - "8710": 1041678016.0, - "8715": 1034152256.0, - "8720": 1017596544.0, - "8725": 1025187456.0, - "8730": 1036610816.0, - "8735": 1014829568.0, - "8740": 1036081536.0, - "8745": 1021252416.0, - "8750": 1027866496.0, - "8755": 1020742272.0, - "8760": 1036899712.0, - "8765": 1058672448.0, - "8770": 1020462464.0, - "8775": 1031773056.0, - "8780": 1030892544.0, - "8785": 1032117504.0, - "8790": 1041034112.0, - "8795": 1019523968.0, - "8800": 1038245632.0, - "8805": 1035106752.0, - "8810": 1043257088.0, - "8815": 1026490496.0, - "8820": 1027666944.0, - "8825": 1043464064.0, - "8830": 1027480192.0, - "8835": 1038812928.0, - "8840": 1034490752.0, - "8845": 1033909760.0, - "8850": 1030491008.0, - "8855": 1042524992.0, - "8860": 1013002880.0, - "8865": 1038368128.0, - "8870": 1025187456.0, - "8875": 1012981760.0, - "8880": 1028376704.0, - "8885": 1046461056.0, - "8890": 1038603840.0, - "8895": 
1037909504.0, - "8900": 1027294848.0, - "8905": 1032792064.0, - "8910": 1029795264.0, - "8915": 1030003968.0, - "8920": 1030339968.0, - "8925": 1028569984.0, - "8930": 1031637376.0, - "8935": 1022951424.0, - "8940": 1019847872.0, - "8945": 1031909248.0, - "8950": 1039951744.0, - "8955": 1041902720.0, - "8960": 1026878464.0, - "8965": 1022083968.0, - "8970": 1029559424.0, - "8975": 1038934400.0, - "8980": 1033860160.0, - "8985": 1030649472.0, - "8990": 1025014144.0, - "8995": 1013963648.0, - "9000": 1035286400.0, - "9005": 1028649280.0, - "9010": 1011913280.0, - "9015": 1038912128.0, - "9020": 1030153856.0, - "9025": 1024685056.0, - "9030": 1025861888.0, - "9035": 1054309248.0, - "9040": 1027293952.0, - "9045": 1036583040.0, - "9050": 1020929664.0, - "9055": 1043212800.0, - "9060": 1023159104.0, - "9065": 1023387520.0, - "9070": 1039364480.0, - "9075": 1026728320.0, - "9080": 1018873408.0, - "9085": 1015439104.0, - "9090": 1043764736.0, - "9095": 1014020224.0, - "9100": 1031975296.0, - "9105": 1026514304.0, - "9110": 1029229568.0, - "9115": 1024866432.0, - "9120": 999986240.0, - "9125": 1032842752.0, - "9130": 1038534336.0, - "9135": 1031037696.0, - "9140": 1025502208.0, - "9145": 1030405248.0, - "9150": 1029416576.0, - "9155": 1038268928.0, - "9160": 1046043904.0, - "9165": 1017948992.0, - "9170": 1040955520.0, - "9175": 1031287552.0, - "9180": 1037830656.0, - "9185": 1040684416.0, - "9190": 1028985728.0, - "9195": 1034312320.0, - "9200": 1035551872.0, - "9205": 1029847040.0, - "9210": 1026535872.0, - "9215": 1030520448.0, - "9220": 1025732224.0, - "9225": 1048001408.0, - "9230": 1041601792.0, - "9235": 1027775104.0, - "9240": 1025245760.0, - "9245": 1036211584.0, - "9250": 1041192384.0, - "9255": 1020063872.0, - "9260": 1035337984.0, - "9265": 1023102208.0, - "9270": 1038332928.0, - "9275": 1036053568.0, - "9280": 1026541504.0, - "9285": 1014285184.0, - "9290": 1018866304.0, - "9295": 1026915264.0, - "9300": 1037085888.0, - "9305": 1045435392.0, - "9310": 
1033242944.0, - "9315": 1039043840.0, - "9320": 1048495488.0, - "9325": 1023059840.0, - "9330": 1031724672.0, - "9335": 1035673472.0, - "9340": 1013719296.0, - "9345": 1022572032.0, - "9350": 1026585600.0, - "9355": 1034807104.0, - "9360": 1029839552.0, - "9365": 1019863296.0, - "9370": 1006904320.0, - "9375": 1036232960.0, - "9380": 1049012736.0, - "9385": 1015905344.0, - "9390": 1029208704.0, - "9395": 1008931968.0, - "9400": 1026893568.0, - "9405": 1027653312.0, - "9410": 1040913280.0, - "9415": 1035128576.0, - "9420": 1030792640.0, - "9425": 1027581056.0, - "9430": 1032727360.0, - "9435": 1031796288.0, - "9440": 1051730048.0, - "9445": 1019626752.0, - "9450": 1044505152.0, - "9455": 1035773696.0, - "9460": 1013828224.0, - "9465": 1023403904.0, - "9470": 1023576832.0, - "9475": 1039164416.0, - "9480": 1029597056.0, - "9485": 1032075200.0, - "9490": 1020994560.0, - "9495": 1021375616.0, - "9500": 1035594304.0, - "9505": 1034478464.0, - "9510": 1014286592.0, - "9515": 1031309312.0, - "9520": 1026563904.0, - "9525": 1035853184.0, - "9530": 1031624448.0, - "9535": 1025926720.0 - } - }, - "mem-allocated-bytes": { - "start_step": 1, - "end_step": 9535, - "step_interval": 5, - "values": { - "1": 33307314176.0, - "5": 33307424768.0, - "10": 33307447296.0, - "15": 33307439104.0, - "20": 33307533312.0, - "25": 33307473920.0, - "30": 33307504640.0, - "35": 33307639808.0, - "40": 33307637760.0, - "45": 33307568128.0, - "50": 33307418624.0, - "55": 33307326464.0, - "60": 33307346944.0, - "65": 33307490304.0, - "70": 33307312128.0, - "75": 33307308032.0, - "80": 33307404288.0, - "85": 33307314176.0, - "90": 33307285504.0, - "95": 33307392000.0, - "100": 33307260928.0, - "105": 33307129856.0, - "110": 33307037696.0, - "115": 33306703872.0, - "120": 33307355136.0, - "125": 33306873856.0, - "130": 33307017216.0, - "135": 33307305984.0, - "140": 33307004928.0, - "145": 33307121664.0, - "150": 33307312128.0, - "155": 33307176960.0, - "160": 33307103232.0, - "165": 33307174912.0, - 
"170": 33307832320.0, - "175": 33307199488.0, - "180": 33307355136.0, - "185": 33307355136.0, - "190": 33307131904.0, - "195": 33307256832.0, - "200": 33307326464.0, - "205": 33307492352.0, - "210": 33307500544.0, - "215": 33307086848.0, - "220": 33306857472.0, - "225": 33306933248.0, - "230": 33307092992.0, - "235": 33307183104.0, - "240": 33307303936.0, - "245": 33307426816.0, - "250": 33307308032.0, - "255": 33307295744.0, - "260": 33306767360.0, - "265": 33307461632.0, - "270": 33307467776.0, - "275": 33307469824.0, - "280": 33307254784.0, - "285": 33307947008.0, - "290": 33307191296.0, - "295": 33308014592.0, - "300": 33307856896.0, - "305": 33308340224.0, - "310": 33307815936.0, - "315": 33307181056.0, - "320": 33307512832.0, - "325": 33307488256.0, - "330": 33307977728.0, - "335": 33307947008.0, - "340": 33308606464.0, - "345": 33308037120.0, - "350": 33307693056.0, - "355": 33308000256.0, - "360": 33307348992.0, - "365": 33307451392.0, - "370": 33308000256.0, - "375": 33307283456.0, - "380": 33307570176.0, - "385": 33307860992.0, - "390": 33307416576.0, - "395": 33307031552.0, - "400": 33307246592.0, - "405": 33307676672.0, - "410": 33306935296.0, - "415": 33307752448.0, - "420": 33307529216.0, - "425": 33307314176.0, - "430": 33306988544.0, - "435": 33307455488.0, - "440": 33307369472.0, - "445": 33307709440.0, - "450": 33307588608.0, - "455": 33306963968.0, - "460": 33307193344.0, - "465": 33306845184.0, - "470": 33307766784.0, - "475": 33306464256.0, - "480": 33307566080.0, - "485": 33307682816.0, - "490": 33307389952.0, - "495": 33307179008.0, - "500": 33307969536.0, - "505": 33307629568.0, - "510": 33308192768.0, - "515": 33307279360.0, - "520": 33306544128.0, - "525": 33307265024.0, - "530": 33307025408.0, - "535": 33307648000.0, - "540": 33307582464.0, - "545": 33307297792.0, - "550": 33307396096.0, - "555": 33307301888.0, - "560": 33307899904.0, - "565": 33307379712.0, - "570": 33307553792.0, - "575": 33307136000.0, - "580": 33305892864.0, - "585": 
33306945536.0, - "590": 33307629568.0, - "595": 33307860992.0, - "600": 33306873856.0, - "605": 33307357184.0, - "610": 33306556416.0, - "615": 33306349568.0, - "620": 33307791360.0, - "625": 33306378240.0, - "630": 33307168768.0, - "635": 33306767360.0, - "640": 33306116096.0, - "645": 33308092416.0, - "650": 33307277312.0, - "655": 33307131904.0, - "660": 33308485632.0, - "665": 33307334656.0, - "670": 33307959296.0, - "675": 33307701248.0, - "680": 33306863616.0, - "685": 33306697728.0, - "690": 33307863040.0, - "695": 33307293696.0, - "700": 33306263552.0, - "705": 33306955776.0, - "710": 33308225536.0, - "715": 33307174912.0, - "720": 33307107328.0, - "725": 33307324416.0, - "730": 33308231680.0, - "735": 33307224064.0, - "740": 33307815936.0, - "745": 33307938816.0, - "750": 33307779072.0, - "755": 33308463104.0, - "760": 33306349568.0, - "765": 33308266496.0, - "770": 33306603520.0, - "775": 33307424768.0, - "780": 33308608512.0, - "785": 33307969536.0, - "790": 33308188672.0, - "795": 33307656192.0, - "800": 33307547648.0, - "805": 33307619328.0, - "810": 33307910144.0, - "815": 33307170816.0, - "820": 33307029504.0, - "825": 33307443200.0, - "830": 33307422720.0, - "835": 33307262976.0, - "840": 33307613184.0, - "845": 33307928576.0, - "850": 33306238976.0, - "855": 33307396096.0, - "860": 33307938816.0, - "865": 33307701248.0, - "870": 33307940864.0, - "875": 33307545600.0, - "880": 33307527168.0, - "885": 33307336704.0, - "890": 33308262400.0, - "895": 33307717632.0, - "900": 33306474496.0, - "905": 33307480064.0, - "910": 33307725824.0, - "915": 33308303360.0, - "920": 33307770880.0, - "925": 33307566080.0, - "930": 33307451392.0, - "935": 33307975680.0, - "940": 33306320896.0, - "945": 33306429440.0, - "950": 33307136000.0, - "955": 33307846656.0, - "960": 33307611136.0, - "965": 33307465728.0, - "970": 33308293120.0, - "975": 33307078656.0, - "980": 33307568128.0, - "985": 33307080704.0, - "990": 33307367424.0, - "995": 33306861568.0, - "1000": 
33307889664.0, - "1005": 33305956352.0, - "1010": 33307508736.0, - "1015": 33306671104.0, - "1020": 33306669056.0, - "1025": 33306509312.0, - "1030": 33307117568.0, - "1035": 33308332032.0, - "1040": 33307353088.0, - "1045": 33308368896.0, - "1050": 33306615808.0, - "1055": 33306802176.0, - "1060": 33307103232.0, - "1065": 33307404288.0, - "1070": 33307070464.0, - "1075": 33308188672.0, - "1080": 33307011072.0, - "1085": 33307027456.0, - "1090": 33308086272.0, - "1095": 33307086848.0, - "1100": 33307287552.0, - "1105": 33308497920.0, - "1110": 33307461632.0, - "1115": 33307533312.0, - "1120": 33307777024.0, - "1125": 33307809792.0, - "1130": 33307484160.0, - "1135": 33308082176.0, - "1140": 33307029504.0, - "1145": 33307432960.0, - "1150": 33307574272.0, - "1155": 33307551744.0, - "1160": 33307561984.0, - "1165": 33307086848.0, - "1170": 33307856896.0, - "1175": 33306976256.0, - "1180": 33308237824.0, - "1185": 33307875328.0, - "1190": 33307369472.0, - "1195": 33308231680.0, - "1200": 33307197440.0, - "1205": 33307480064.0, - "1210": 33305866240.0, - "1215": 33308297216.0, - "1220": 33307451392.0, - "1225": 33307518976.0, - "1230": 33307688960.0, - "1235": 33307901952.0, - "1240": 33307394048.0, - "1245": 33307842560.0, - "1250": 33307281408.0, - "1255": 33306906624.0, - "1260": 33307301888.0, - "1265": 33307674624.0, - "1270": 33307150336.0, - "1275": 33307686912.0, - "1280": 33307430912.0, - "1285": 33306974208.0, - "1290": 33307529216.0, - "1295": 33307901952.0, - "1300": 33307002880.0, - "1305": 33308059648.0, - "1310": 33306939392.0, - "1315": 33307336704.0, - "1320": 33307262976.0, - "1325": 33307011072.0, - "1330": 33306550272.0, - "1335": 33307181056.0, - "1340": 33307406336.0, - "1345": 33307463680.0, - "1350": 33308135424.0, - "1355": 33307480064.0, - "1360": 33307533312.0, - "1365": 33307066368.0, - "1370": 33306595328.0, - "1375": 33307891712.0, - "1380": 33307830272.0, - "1385": 33308487680.0, - "1390": 33306521600.0, - "1395": 33307338752.0, - "1400": 
33308430336.0, - "1405": 33307768832.0, - "1410": 33308041216.0, - "1415": 33307797504.0, - "1420": 33306605568.0, - "1425": 33307240448.0, - "1430": 33307322368.0, - "1435": 33307559936.0, - "1440": 33306662912.0, - "1445": 33307058176.0, - "1450": 33307705344.0, - "1455": 33307291648.0, - "1460": 33306861568.0, - "1465": 33306312704.0, - "1470": 33307394048.0, - "1475": 33307211776.0, - "1480": 33306527744.0, - "1485": 33307361280.0, - "1490": 33307693056.0, - "1495": 33307271168.0, - "1500": 33306820608.0, - "1505": 33307092992.0, - "1510": 33306624000.0, - "1515": 33307097088.0, - "1520": 33306931200.0, - "1525": 33307635712.0, - "1530": 33307353088.0, - "1535": 33306468352.0, - "1540": 33307172864.0, - "1545": 33307693056.0, - "1550": 33307938816.0, - "1555": 33307832320.0, - "1560": 33308182528.0, - "1565": 33307099136.0, - "1570": 33306798080.0, - "1575": 33307492352.0, - "1580": 33307688960.0, - "1585": 33307326464.0, - "1590": 33306988544.0, - "1595": 33306818560.0, - "1600": 33307836416.0, - "1605": 33307590656.0, - "1610": 33307168768.0, - "1615": 33306931200.0, - "1620": 33306732544.0, - "1625": 33308260352.0, - "1630": 33308227584.0, - "1635": 33306957824.0, - "1640": 33306759168.0, - "1645": 33306021888.0, - "1650": 33306689536.0, - "1655": 33307332608.0, - "1660": 33307170816.0, - "1665": 33306583040.0, - "1670": 33307535360.0, - "1675": 33306912768.0, - "1680": 33306675200.0, - "1685": 33307774976.0, - "1690": 33307783168.0, - "1695": 33307971584.0, - "1700": 33307623424.0, - "1705": 33307652096.0, - "1710": 33307731968.0, - "1715": 33308090368.0, - "1720": 33307172864.0, - "1725": 33307672576.0, - "1730": 33306355712.0, - "1735": 33308229632.0, - "1740": 33307142144.0, - "1745": 33308151808.0, - "1750": 33306898432.0, - "1755": 33307105280.0, - "1760": 33308000256.0, - "1765": 33307750400.0, - "1770": 33308450816.0, - "1775": 33308184576.0, - "1780": 33308129280.0, - "1785": 33307936768.0, - "1790": 33307238400.0, - "1795": 33307922432.0, - "1800": 
33306900480.0, - "1805": 33307203584.0, - "1810": 33306923008.0, - "1815": 33307617280.0, - "1820": 33307664384.0, - "1825": 33308440576.0, - "1830": 33306843136.0, - "1835": 33307979776.0, - "1840": 33307588608.0, - "1845": 33307602944.0, - "1850": 33307774976.0, - "1855": 33307529216.0, - "1860": 33307054080.0, - "1865": 33307097088.0, - "1870": 33307373568.0, - "1875": 33306265600.0, - "1880": 33307275264.0, - "1885": 33307224064.0, - "1890": 33307324416.0, - "1895": 33307283456.0, - "1900": 33306810368.0, - "1905": 33307191296.0, - "1910": 33306884096.0, - "1915": 33308162048.0, - "1920": 33307664384.0, - "1925": 33305972736.0, - "1930": 33308504064.0, - "1935": 33307377664.0, - "1940": 33307119616.0, - "1945": 33307416576.0, - "1950": 33307746304.0, - "1955": 33307420672.0, - "1960": 33308073984.0, - "1965": 33307148288.0, - "1970": 33306775552.0, - "1975": 33308207104.0, - "1980": 33307473920.0, - "1985": 33307095040.0, - "1990": 33307527168.0, - "1995": 33307037696.0, - "2000": 33308801024.0, - "2005": 33307985920.0, - "2010": 33307516928.0, - "2015": 33307604992.0, - "2020": 33307406336.0, - "2025": 33307719680.0, - "2030": 33308381184.0, - "2035": 33307914240.0, - "2040": 33307324416.0, - "2045": 33306476544.0, - "2050": 33308246016.0, - "2055": 33307430912.0, - "2060": 33307912192.0, - "2065": 33307543552.0, - "2070": 33307670528.0, - "2075": 33307482112.0, - "2080": 33307871232.0, - "2085": 33306722304.0, - "2090": 33307549696.0, - "2095": 33307260928.0, - "2100": 33306765312.0, - "2105": 33306847232.0, - "2110": 33307332608.0, - "2115": 33306480640.0, - "2120": 33307168768.0, - "2125": 33307277312.0, - "2130": 33307314176.0, - "2135": 33307752448.0, - "2140": 33306710016.0, - "2145": 33307478016.0, - "2150": 33307729920.0, - "2155": 33306943488.0, - "2160": 33307508736.0, - "2165": 33307049984.0, - "2170": 33307158528.0, - "2175": 33306599424.0, - "2180": 33307054080.0, - "2185": 33307017216.0, - "2190": 33307119616.0, - "2195": 33307289600.0, - "2200": 
33306726400.0, - "2205": 33306636288.0, - "2210": 33307639808.0, - "2215": 33308215296.0, - "2220": 33307314176.0, - "2225": 33307437056.0, - "2230": 33306318848.0, - "2235": 33306941440.0, - "2240": 33308131328.0, - "2245": 33307707392.0, - "2250": 33307256832.0, - "2255": 33306845184.0, - "2260": 33307736064.0, - "2265": 33308620800.0, - "2270": 33307357184.0, - "2275": 33308151808.0, - "2280": 33307981824.0, - "2285": 33307922432.0, - "2290": 33306767360.0, - "2295": 33307670528.0, - "2300": 33307179008.0, - "2305": 33307545600.0, - "2310": 33307924480.0, - "2315": 33307396096.0, - "2320": 33307725824.0, - "2325": 33308024832.0, - "2330": 33307793408.0, - "2335": 33307019264.0, - "2340": 33307162624.0, - "2345": 33307934720.0, - "2350": 33306232832.0, - "2355": 33307719680.0, - "2360": 33307375616.0, - "2365": 33306537984.0, - "2370": 33307279360.0, - "2375": 33308131328.0, - "2380": 33307136000.0, - "2385": 33307490304.0, - "2390": 33307316224.0, - "2395": 33306587136.0, - "2400": 33307594752.0, - "2405": 33308393472.0, - "2410": 33306726400.0, - "2415": 33307506688.0, - "2420": 33308407808.0, - "2425": 33307942912.0, - "2430": 33308116992.0, - "2435": 33307308032.0, - "2440": 33308362752.0, - "2445": 33308071936.0, - "2450": 33307740160.0, - "2455": 33307959296.0, - "2460": 33308258304.0, - "2465": 33307299840.0, - "2470": 33307056128.0, - "2475": 33307224064.0, - "2480": 33307713536.0, - "2485": 33306550272.0, - "2490": 33306992640.0, - "2495": 33307232256.0, - "2500": 33307095040.0, - "2505": 33307107328.0, - "2510": 33307488256.0, - "2515": 33308360704.0, - "2520": 33307369472.0, - "2525": 33306959872.0, - "2530": 33307258880.0, - "2535": 33307082752.0, - "2540": 33308633088.0, - "2545": 33308542976.0, - "2550": 33308002304.0, - "2555": 33307961344.0, - "2560": 33307328512.0, - "2565": 33308299264.0, - "2570": 33307770880.0, - "2575": 33307877376.0, - "2580": 33307990016.0, - "2585": 33308016640.0, - "2590": 33308135424.0, - "2595": 33307617280.0, - "2600": 
33306667008.0, - "2605": 33307422720.0, - "2610": 33306683392.0, - "2615": 33308669952.0, - "2620": 33308616704.0, - "2625": 33308366848.0, - "2630": 33307574272.0, - "2635": 33308166144.0, - "2640": 33307983872.0, - "2645": 33307609088.0, - "2650": 33307807744.0, - "2655": 33306955776.0, - "2660": 33307273216.0, - "2665": 33307709440.0, - "2670": 33307693056.0, - "2675": 33307731968.0, - "2680": 33308227584.0, - "2685": 33307742208.0, - "2690": 33307734016.0, - "2695": 33307424768.0, - "2700": 33306644480.0, - "2705": 33306300416.0, - "2710": 33307881472.0, - "2715": 33307488256.0, - "2720": 33307318272.0, - "2725": 33307604992.0, - "2730": 33306710016.0, - "2735": 33308049408.0, - "2740": 33307437056.0, - "2745": 33307572224.0, - "2750": 33307136000.0, - "2755": 33307584512.0, - "2760": 33307355136.0, - "2765": 33307713536.0, - "2770": 33308000256.0, - "2775": 33306460160.0, - "2780": 33306923008.0, - "2785": 33307017216.0, - "2790": 33306720256.0, - "2795": 33307785216.0, - "2800": 33307234304.0, - "2805": 33306685440.0, - "2810": 33307469824.0, - "2815": 33308069888.0, - "2820": 33306460160.0, - "2825": 33307467776.0, - "2830": 33307666432.0, - "2835": 33307371520.0, - "2840": 33306904576.0, - "2845": 33308061696.0, - "2850": 33308520448.0, - "2855": 33307695104.0, - "2860": 33308487680.0, - "2865": 33307058176.0, - "2870": 33307303936.0, - "2875": 33307324416.0, - "2880": 33306968064.0, - "2885": 33307641856.0, - "2890": 33307785216.0, - "2895": 33308221440.0, - "2900": 33307596800.0, - "2905": 33307533312.0, - "2910": 33307459584.0, - "2915": 33307799552.0, - "2920": 33308461056.0, - "2925": 33307938816.0, - "2930": 33308268544.0, - "2935": 33308594176.0, - "2940": 33308170240.0, - "2945": 33307578368.0, - "2950": 33307590656.0, - "2955": 33308131328.0, - "2960": 33306839040.0, - "2965": 33307111424.0, - "2970": 33307570176.0, - "2975": 33307766784.0, - "2980": 33307600896.0, - "2985": 33307123712.0, - "2990": 33307641856.0, - "2995": 33307527168.0, - "3000": 
33307863040.0, - "3005": 33306927104.0, - "3010": 33307738112.0, - "3015": 33308217344.0, - "3020": 33306697728.0, - "3025": 33306970112.0, - "3030": 33308127232.0, - "3035": 33308213248.0, - "3040": 33307578368.0, - "3045": 33308327936.0, - "3050": 33306910720.0, - "3055": 33307004928.0, - "3060": 33307602944.0, - "3065": 33306970112.0, - "3070": 33307985920.0, - "3075": 33306945536.0, - "3080": 33307312128.0, - "3085": 33306533888.0, - "3090": 33306933248.0, - "3095": 33307906048.0, - "3100": 33306793984.0, - "3105": 33307127808.0, - "3110": 33308295168.0, - "3115": 33307295744.0, - "3120": 33307897856.0, - "3125": 33307066368.0, - "3130": 33307781120.0, - "3135": 33307762688.0, - "3140": 33308196864.0, - "3145": 33306904576.0, - "3150": 33307140096.0, - "3155": 33306660864.0, - "3160": 33307514880.0, - "3165": 33307246592.0, - "3170": 33307613184.0, - "3175": 33307375616.0, - "3180": 33307551744.0, - "3185": 33307842560.0, - "3190": 33308342272.0, - "3195": 33308350464.0, - "3200": 33307799552.0, - "3205": 33307099136.0, - "3210": 33306869760.0, - "3215": 33307678720.0, - "3220": 33307111424.0, - "3225": 33307146240.0, - "3230": 33306972160.0, - "3235": 33307387904.0, - "3240": 33307521024.0, - "3245": 33307287552.0, - "3250": 33307523072.0, - "3255": 33307639808.0, - "3260": 33307092992.0, - "3265": 33308338176.0, - "3270": 33307273216.0, - "3275": 33307713536.0, - "3280": 33307719680.0, - "3285": 33308049408.0, - "3290": 33307484160.0, - "3295": 33307594752.0, - "3300": 33307228160.0, - "3305": 33306580992.0, - "3310": 33307541504.0, - "3315": 33307211776.0, - "3320": 33307324416.0, - "3325": 33306615808.0, - "3330": 33307777024.0, - "3335": 33308135424.0, - "3340": 33307351040.0, - "3345": 33307131904.0, - "3350": 33307031552.0, - "3355": 33307791360.0, - "3360": 33307410432.0, - "3365": 33307090944.0, - "3370": 33306187776.0, - "3375": 33307113472.0, - "3380": 33308071936.0, - "3385": 33307717632.0, - "3390": 33306648576.0, - "3395": 33306781696.0, - "3400": 
33307734016.0, - "3405": 33307570176.0, - "3410": 33307750400.0, - "3415": 33307920384.0, - "3420": 33308157952.0, - "3425": 33307500544.0, - "3430": 33307168768.0, - "3435": 33307645952.0, - "3440": 33307185152.0, - "3445": 33307459584.0, - "3450": 33306804224.0, - "3455": 33307662336.0, - "3460": 33306748928.0, - "3465": 33306497024.0, - "3470": 33306796032.0, - "3475": 33307947008.0, - "3480": 33308039168.0, - "3485": 33307676672.0, - "3490": 33306728448.0, - "3495": 33307115520.0, - "3500": 33306628096.0, - "3505": 33307537408.0, - "3510": 33306945536.0, - "3515": 33306902528.0, - "3520": 33307553792.0, - "3525": 33307590656.0, - "3530": 33307852800.0, - "3535": 33306773504.0, - "3540": 33307953152.0, - "3545": 33307463680.0, - "3550": 33307123712.0, - "3555": 33307738112.0, - "3560": 33307766784.0, - "3565": 33307088896.0, - "3570": 33306882048.0, - "3575": 33307443200.0, - "3580": 33306951680.0, - "3585": 33306841088.0, - "3590": 33308293120.0, - "3595": 33307723776.0, - "3600": 33307756544.0, - "3605": 33307930624.0, - "3610": 33307985920.0, - "3615": 33307222016.0, - "3620": 33307430912.0, - "3625": 33307148288.0, - "3630": 33306388480.0, - "3635": 33307035648.0, - "3640": 33307455488.0, - "3645": 33306906624.0, - "3650": 33307545600.0, - "3655": 33307336704.0, - "3660": 33306910720.0, - "3665": 33307623424.0, - "3670": 33306824704.0, - "3675": 33307590656.0, - "3680": 33307373568.0, - "3685": 33306505216.0, - "3690": 33307817984.0, - "3695": 33306890240.0, - "3700": 33306802176.0, - "3705": 33306945536.0, - "3710": 33306904576.0, - "3715": 33307754496.0, - "3720": 33308395520.0, - "3725": 33308112896.0, - "3730": 33307652096.0, - "3735": 33307867136.0, - "3740": 33307805696.0, - "3745": 33308069888.0, - "3750": 33307826176.0, - "3755": 33306439680.0, - "3760": 33306849280.0, - "3765": 33307471872.0, - "3770": 33307095040.0, - "3775": 33307492352.0, - "3780": 33308141568.0, - "3785": 33307910144.0, - "3790": 33307656192.0, - "3795": 33307727872.0, - "3800": 
33307246592.0, - "3805": 33307848704.0, - "3810": 33307490304.0, - "3815": 33307357184.0, - "3820": 33307346944.0, - "3825": 33307619328.0, - "3830": 33308102656.0, - "3835": 33306849280.0, - "3840": 33307678720.0, - "3845": 33307258880.0, - "3850": 33307686912.0, - "3855": 33307467776.0, - "3860": 33307471872.0, - "3865": 33307439104.0, - "3870": 33307676672.0, - "3875": 33306865664.0, - "3880": 33307232256.0, - "3885": 33307099136.0, - "3890": 33307854848.0, - "3895": 33306370048.0, - "3900": 33306900480.0, - "3905": 33306824704.0, - "3910": 33307361280.0, - "3915": 33306591232.0, - "3920": 33307213824.0, - "3925": 33306980352.0, - "3930": 33308110848.0, - "3935": 33307179008.0, - "3940": 33307379712.0, - "3945": 33307813888.0, - "3950": 33307277312.0, - "3955": 33307203584.0, - "3960": 33307234304.0, - "3965": 33307121664.0, - "3970": 33307303936.0, - "3975": 33307144192.0, - "3980": 33307869184.0, - "3985": 33307660288.0, - "3990": 33307779072.0, - "3995": 33307795456.0, - "4000": 33307131904.0, - "4005": 33307238400.0, - "4010": 33307875328.0, - "4015": 33306726400.0, - "4020": 33308227584.0, - "4025": 33307799552.0, - "4030": 33307318272.0, - "4035": 33308190720.0, - "4040": 33307932672.0, - "4045": 33307291648.0, - "4050": 33307959296.0, - "4055": 33307447296.0, - "4060": 33307486208.0, - "4065": 33308088320.0, - "4070": 33307183104.0, - "4075": 33307201536.0, - "4080": 33308184576.0, - "4085": 33306406912.0, - "4090": 33307891712.0, - "4095": 33307031552.0, - "4100": 33308100608.0, - "4105": 33307258880.0, - "4110": 33307492352.0, - "4115": 33308344320.0, - "4120": 33306552320.0, - "4125": 33307611136.0, - "4130": 33306083328.0, - "4135": 33308463104.0, - "4140": 33307611136.0, - "4145": 33307455488.0, - "4150": 33307658240.0, - "4155": 33307133952.0, - "4160": 33308233728.0, - "4165": 33307408384.0, - "4170": 33306888192.0, - "4175": 33307852800.0, - "4180": 33307150336.0, - "4185": 33307127808.0, - "4190": 33307582464.0, - "4195": 33308610560.0, - "4200": 
33308231680.0, - "4205": 33307906048.0, - "4210": 33308307456.0, - "4215": 33306363904.0, - "4220": 33306980352.0, - "4225": 33306318848.0, - "4230": 33307731968.0, - "4235": 33307142144.0, - "4240": 33307432960.0, - "4245": 33307097088.0, - "4250": 33307783168.0, - "4255": 33307365376.0, - "4260": 33306947584.0, - "4265": 33306611712.0, - "4270": 33306347520.0, - "4275": 33306624000.0, - "4280": 33307185152.0, - "4285": 33307922432.0, - "4290": 33307508736.0, - "4295": 33307658240.0, - "4300": 33308405760.0, - "4305": 33306474496.0, - "4310": 33307557888.0, - "4315": 33308307456.0, - "4320": 33307719680.0, - "4325": 33306824704.0, - "4330": 33307594752.0, - "4335": 33306144768.0, - "4340": 33307852800.0, - "4345": 33307342848.0, - "4350": 33308139520.0, - "4355": 33307713536.0, - "4360": 33307373568.0, - "4365": 33308065792.0, - "4370": 33306681344.0, - "4375": 33307770880.0, - "4380": 33307361280.0, - "4385": 33307086848.0, - "4390": 33307019264.0, - "4395": 33306986496.0, - "4400": 33307103232.0, - "4405": 33307664384.0, - "4410": 33307996160.0, - "4415": 33306990592.0, - "4420": 33306546176.0, - "4425": 33306904576.0, - "4430": 33307303936.0, - "4435": 33306763264.0, - "4440": 33308063744.0, - "4445": 33307242496.0, - "4450": 33307283456.0, - "4455": 33306654720.0, - "4460": 33307205632.0, - "4465": 33306867712.0, - "4470": 33307916288.0, - "4475": 33307791360.0, - "4480": 33308450816.0, - "4485": 33307547648.0, - "4490": 33307090944.0, - "4495": 33307000832.0, - "4500": 33306935296.0, - "4505": 33307099136.0, - "4510": 33307525120.0, - "4515": 33307367424.0, - "4520": 33307813888.0, - "4525": 33307715584.0, - "4530": 33307901952.0, - "4535": 33307174912.0, - "4540": 33306880000.0, - "4545": 33307138048.0, - "4550": 33306873856.0, - "4555": 33306316800.0, - "4560": 33305849856.0, - "4565": 33307187200.0, - "4570": 33307260928.0, - "4575": 33307410432.0, - "4580": 33307201536.0, - "4585": 33306920960.0, - "4590": 33307355136.0, - "4595": 33307346944.0, - "4600": 
33307856896.0, - "4605": 33307752448.0, - "4610": 33307095040.0, - "4615": 33306286080.0, - "4620": 33306699776.0, - "4625": 33308069888.0, - "4630": 33307439104.0, - "4635": 33306900480.0, - "4640": 33307076608.0, - "4645": 33308160000.0, - "4650": 33307758592.0, - "4655": 33307865088.0, - "4660": 33306255360.0, - "4665": 33307641856.0, - "4670": 33307912192.0, - "4675": 33306603520.0, - "4680": 33307799552.0, - "4685": 33307488256.0, - "4690": 33307394048.0, - "4695": 33306763264.0, - "4700": 33307873280.0, - "4705": 33308106752.0, - "4710": 33307617280.0, - "4715": 33307047936.0, - "4720": 33307901952.0, - "4725": 33307793408.0, - "4730": 33308123136.0, - "4735": 33307451392.0, - "4740": 33307623424.0, - "4745": 33306857472.0, - "4750": 33308436480.0, - "4755": 33307260928.0, - "4760": 33307975680.0, - "4765": 33307965440.0, - "4770": 33306859520.0, - "4775": 33307922432.0, - "4780": 33306978304.0, - "4785": 33306869760.0, - "4790": 33307084800.0, - "4795": 33307226112.0, - "4800": 33307961344.0, - "4805": 33308334080.0, - "4810": 33305587712.0, - "4815": 33307928576.0, - "4820": 33307875328.0, - "4825": 33306957824.0, - "4830": 33307797504.0, - "4835": 33306116096.0, - "4840": 33307654144.0, - "4845": 33307131904.0, - "4850": 33308055552.0, - "4855": 33305792512.0, - "4860": 33307402240.0, - "4865": 33307086848.0, - "4870": 33307637760.0, - "4875": 33307789312.0, - "4880": 33307701248.0, - "4885": 33308010496.0, - "4890": 33307039744.0, - "4895": 33307369472.0, - "4900": 33307127808.0, - "4905": 33306988544.0, - "4910": 33308276736.0, - "4915": 33307090944.0, - "4920": 33307015168.0, - "4925": 33308043264.0, - "4930": 33307607040.0, - "4935": 33308209152.0, - "4940": 33307725824.0, - "4945": 33307985920.0, - "4950": 33307582464.0, - "4955": 33307297792.0, - "4960": 33307639808.0, - "4965": 33307445248.0, - "4970": 33306869760.0, - "4975": 33306787840.0, - "4980": 33307099136.0, - "4985": 33307635712.0, - "4990": 33307406336.0, - "4995": 33307471872.0, - "5000": 
33307375616.0, - "5005": 33307672576.0, - "5010": 33306970112.0, - "5015": 33307244544.0, - "5020": 33306966016.0, - "5025": 33307705344.0, - "5030": 33307463680.0, - "5035": 33306818560.0, - "5040": 33306972160.0, - "5045": 33308157952.0, - "5050": 33306376192.0, - "5055": 33307594752.0, - "5060": 33308471296.0, - "5065": 33307455488.0, - "5070": 33307301888.0, - "5075": 33307488256.0, - "5080": 33307910144.0, - "5085": 33307635712.0, - "5090": 33307406336.0, - "5095": 33307254784.0, - "5100": 33306828800.0, - "5105": 33307852800.0, - "5110": 33308258304.0, - "5115": 33307228160.0, - "5120": 33307955200.0, - "5125": 33305640960.0, - "5130": 33306683392.0, - "5135": 33307336704.0, - "5140": 33307834368.0, - "5145": 33307060224.0, - "5150": 33307023360.0, - "5155": 33307308032.0, - "5160": 33306664960.0, - "5165": 33307123712.0, - "5170": 33306935296.0, - "5175": 33308094464.0, - "5180": 33306566656.0, - "5185": 33306796032.0, - "5190": 33307545600.0, - "5195": 33308067840.0, - "5200": 33307754496.0, - "5205": 33307445248.0, - "5210": 33306785792.0, - "5215": 33307551744.0, - "5220": 33308188672.0, - "5225": 33307338752.0, - "5230": 33307283456.0, - "5235": 33306976256.0, - "5240": 33308041216.0, - "5245": 33308340224.0, - "5250": 33308153856.0, - "5255": 33307590656.0, - "5260": 33306896384.0, - "5265": 33308303360.0, - "5270": 33308796928.0, - "5275": 33307949056.0, - "5280": 33306157056.0, - "5285": 33307904000.0, - "5290": 33308143616.0, - "5295": 33306533888.0, - "5300": 33307912192.0, - "5305": 33308338176.0, - "5310": 33308688384.0, - "5315": 33308045312.0, - "5320": 33306206208.0, - "5325": 33308219392.0, - "5330": 33308012544.0, - "5335": 33307602944.0, - "5340": 33306685440.0, - "5345": 33308209152.0, - "5350": 33307150336.0, - "5355": 33308176384.0, - "5360": 33307273216.0, - "5365": 33307850752.0, - "5370": 33307222016.0, - "5375": 33307803648.0, - "5380": 33307617280.0, - "5385": 33307179008.0, - "5390": 33307389952.0, - "5395": 33306927104.0, - "5400": 
33307518976.0, - "5405": 33307400192.0, - "5410": 33307598848.0, - "5415": 33307846656.0, - "5420": 33307490304.0, - "5425": 33307459584.0, - "5430": 33307283456.0, - "5435": 33307453440.0, - "5440": 33307383808.0, - "5445": 33307117568.0, - "5450": 33307832320.0, - "5455": 33307582464.0, - "5460": 33306963968.0, - "5465": 33306947584.0, - "5470": 33307355136.0, - "5475": 33306748928.0, - "5480": 33306435584.0, - "5485": 33307590656.0, - "5490": 33307787264.0, - "5495": 33307568128.0, - "5500": 33307351040.0, - "5505": 33307568128.0, - "5510": 33307426816.0, - "5515": 33307451392.0, - "5520": 33307549696.0, - "5525": 33307000832.0, - "5530": 33307566080.0, - "5535": 33307664384.0, - "5540": 33306966016.0, - "5545": 33307781120.0, - "5550": 33307275264.0, - "5555": 33307269120.0, - "5560": 33307576320.0, - "5565": 33307377664.0, - "5570": 33307052032.0, - "5575": 33306978304.0, - "5580": 33307965440.0, - "5585": 33307494400.0, - "5590": 33308055552.0, - "5595": 33306943488.0, - "5600": 33306542080.0, - "5605": 33307680768.0, - "5610": 33308542976.0, - "5615": 33307826176.0, - "5620": 33308108800.0, - "5625": 33308225536.0, - "5630": 33308069888.0, - "5635": 33307760640.0, - "5640": 33307500544.0, - "5645": 33307930624.0, - "5650": 33306755072.0, - "5655": 33308192768.0, - "5660": 33308631040.0, - "5665": 33307418624.0, - "5670": 33307504640.0, - "5675": 33307715584.0, - "5680": 33307910144.0, - "5685": 33307996160.0, - "5690": 33307478016.0, - "5695": 33308164096.0, - "5700": 33307906048.0, - "5705": 33307750400.0, - "5710": 33306779648.0, - "5715": 33307219968.0, - "5720": 33307750400.0, - "5725": 33307537408.0, - "5730": 33307262976.0, - "5735": 33306767360.0, - "5740": 33307508736.0, - "5745": 33306753024.0, - "5750": 33306636288.0, - "5755": 33306943488.0, - "5760": 33307553792.0, - "5765": 33307842560.0, - "5770": 33307047936.0, - "5775": 33307348992.0, - "5780": 33306361856.0, - "5785": 33307709440.0, - "5790": 33307832320.0, - "5795": 33307406336.0, - "5800": 
33307056128.0, - "5805": 33307631616.0, - "5810": 33307766784.0, - "5815": 33307971584.0, - "5820": 33307447296.0, - "5825": 33307084800.0, - "5830": 33307324416.0, - "5835": 33307127808.0, - "5840": 33307729920.0, - "5845": 33307088896.0, - "5850": 33307635712.0, - "5855": 33307119616.0, - "5860": 33306703872.0, - "5865": 33307291648.0, - "5870": 33307613184.0, - "5875": 33307893760.0, - "5880": 33307893760.0, - "5885": 33307301888.0, - "5890": 33307830272.0, - "5895": 33306671104.0, - "5900": 33306488832.0, - "5905": 33308141568.0, - "5910": 33307373568.0, - "5915": 33307330560.0, - "5920": 33307656192.0, - "5925": 33307533312.0, - "5930": 33307848704.0, - "5935": 33307586560.0, - "5940": 33307602944.0, - "5945": 33307631616.0, - "5950": 33306615808.0, - "5955": 33307719680.0, - "5960": 33308553216.0, - "5965": 33308676096.0, - "5970": 33308313600.0, - "5975": 33306810368.0, - "5980": 33307222016.0, - "5985": 33307367424.0, - "5990": 33307119616.0, - "5995": 33307166720.0, - "6000": 33307822080.0, - "6005": 33307553792.0, - "6010": 33307756544.0, - "6015": 33306392576.0, - "6020": 33308116992.0, - "6025": 33307738112.0, - "6030": 33307459584.0, - "6035": 33306920960.0, - "6040": 33307701248.0, - "6045": 33307932672.0, - "6050": 33307496448.0, - "6055": 33307133952.0, - "6060": 33306370048.0, - "6065": 33307521024.0, - "6070": 33307244544.0, - "6075": 33306447872.0, - "6080": 33306963968.0, - "6085": 33307932672.0, - "6090": 33307293696.0, - "6095": 33307058176.0, - "6100": 33307449344.0, - "6105": 33307613184.0, - "6110": 33307779072.0, - "6115": 33306832896.0, - "6120": 33306732544.0, - "6125": 33306488832.0, - "6130": 33308866560.0, - "6135": 33308000256.0, - "6140": 33307906048.0, - "6145": 33308504064.0, - "6150": 33307826176.0, - "6155": 33306906624.0, - "6160": 33307533312.0, - "6165": 33307578368.0, - "6170": 33307891712.0, - "6175": 33307537408.0, - "6180": 33307803648.0, - "6185": 33308125184.0, - "6190": 33307342848.0, - "6195": 33308135424.0, - "6200": 
33306468352.0, - "6205": 33308026880.0, - "6210": 33308028928.0, - "6215": 33308157952.0, - "6220": 33307662336.0, - "6225": 33307344896.0, - "6230": 33308231680.0, - "6235": 33307148288.0, - "6240": 33308809216.0, - "6245": 33307017216.0, - "6250": 33307234304.0, - "6255": 33308430336.0, - "6260": 33307246592.0, - "6265": 33307418624.0, - "6270": 33308319744.0, - "6275": 33307090944.0, - "6280": 33307404288.0, - "6285": 33308227584.0, - "6290": 33307656192.0, - "6295": 33306865664.0, - "6300": 33307596800.0, - "6305": 33308192768.0, - "6310": 33307695104.0, - "6315": 33307361280.0, - "6320": 33306775552.0, - "6325": 33307557888.0, - "6330": 33307639808.0, - "6335": 33307820032.0, - "6340": 33307410432.0, - "6345": 33307410432.0, - "6350": 33308256256.0, - "6355": 33307082752.0, - "6360": 33306855424.0, - "6365": 33307418624.0, - "6370": 33307066368.0, - "6375": 33307891712.0, - "6380": 33307779072.0, - "6385": 33306128384.0, - "6390": 33306884096.0, - "6395": 33307060224.0, - "6400": 33307250688.0, - "6405": 33308135424.0, - "6410": 33308155904.0, - "6415": 33307101184.0, - "6420": 33306318848.0, - "6425": 33308065792.0, - "6430": 33307813888.0, - "6435": 33307842560.0, - "6440": 33308571648.0, - "6445": 33306138624.0, - "6450": 33307762688.0, - "6455": 33308119040.0, - "6460": 33308037120.0, - "6465": 33308467200.0, - "6470": 33307181056.0, - "6475": 33307246592.0, - "6480": 33306855424.0, - "6485": 33308440576.0, - "6490": 33307863040.0, - "6495": 33306857472.0, - "6500": 33306529792.0, - "6505": 33307097088.0, - "6510": 33307842560.0, - "6515": 33307095040.0, - "6520": 33307848704.0, - "6525": 33307596800.0, - "6530": 33307117568.0, - "6535": 33307811840.0, - "6540": 33307645952.0, - "6545": 33307211776.0, - "6550": 33308196864.0, - "6555": 33307213824.0, - "6560": 33307326464.0, - "6565": 33306490880.0, - "6570": 33306877952.0, - "6575": 33307199488.0, - "6580": 33308370944.0, - "6585": 33307828224.0, - "6590": 33307871232.0, - "6595": 33307590656.0, - "6600": 
33306578944.0, - "6605": 33307496448.0, - "6610": 33307912192.0, - "6615": 33307521024.0, - "6620": 33307189248.0, - "6625": 33306961920.0, - "6630": 33306800128.0, - "6635": 33306957824.0, - "6640": 33307762688.0, - "6645": 33306427392.0, - "6650": 33307672576.0, - "6655": 33305133056.0, - "6660": 33307598848.0, - "6665": 33306884096.0, - "6670": 33307500544.0, - "6675": 33307592704.0, - "6680": 33306923008.0, - "6685": 33307084800.0, - "6690": 33307402240.0, - "6695": 33307963392.0, - "6700": 33307336704.0, - "6705": 33306845184.0, - "6710": 33307230208.0, - "6715": 33306310656.0, - "6720": 33307834368.0, - "6725": 33308094464.0, - "6730": 33308327936.0, - "6735": 33308092416.0, - "6740": 33306873856.0, - "6745": 33308082176.0, - "6750": 33306112000.0, - "6755": 33306810368.0, - "6760": 33307394048.0, - "6765": 33307414528.0, - "6770": 33308286976.0, - "6775": 33308618752.0, - "6780": 33306904576.0, - "6785": 33308182528.0, - "6790": 33308057600.0, - "6795": 33307049984.0, - "6800": 33306744832.0, - "6805": 33307242496.0, - "6810": 33307176960.0, - "6815": 33307779072.0, - "6820": 33306849280.0, - "6825": 33307623424.0, - "6830": 33307887616.0, - "6835": 33307670528.0, - "6840": 33308348416.0, - "6845": 33308184576.0, - "6850": 33307727872.0, - "6855": 33307252736.0, - "6860": 33307680768.0, - "6865": 33306963968.0, - "6870": 33307099136.0, - "6875": 33307037696.0, - "6880": 33307635712.0, - "6885": 33307615232.0, - "6890": 33307652096.0, - "6895": 33307369472.0, - "6900": 33307947008.0, - "6905": 33307334656.0, - "6910": 33306824704.0, - "6915": 33307537408.0, - "6920": 33306619904.0, - "6925": 33306408960.0, - "6930": 33306765312.0, - "6935": 33306609664.0, - "6940": 33307623424.0, - "6945": 33307160576.0, - "6950": 33307463680.0, - "6955": 33306507264.0, - "6960": 33307185152.0, - "6965": 33307019264.0, - "6970": 33307598848.0, - "6975": 33307435008.0, - "6980": 33307238400.0, - "6985": 33306222592.0, - "6990": 33308581888.0, - "6995": 33307254784.0, - "7000": 
33308035072.0, - "7005": 33308233728.0, - "7010": 33307092992.0, - "7015": 33307193344.0, - "7020": 33307643904.0, - "7025": 33308274688.0, - "7030": 33307019264.0, - "7035": 33308454912.0, - "7040": 33308086272.0, - "7045": 33307277312.0, - "7050": 33307172864.0, - "7055": 33306599424.0, - "7060": 33307613184.0, - "7065": 33307031552.0, - "7070": 33306243072.0, - "7075": 33308037120.0, - "7080": 33306759168.0, - "7085": 33308033024.0, - "7090": 33307971584.0, - "7095": 33306873856.0, - "7100": 33308522496.0, - "7105": 33307363328.0, - "7110": 33308063744.0, - "7115": 33307770880.0, - "7120": 33307906048.0, - "7125": 33307443200.0, - "7130": 33307574272.0, - "7135": 33307541504.0, - "7140": 33306765312.0, - "7145": 33307854848.0, - "7150": 33306853376.0, - "7155": 33307856896.0, - "7160": 33307906048.0, - "7165": 33308184576.0, - "7170": 33308272640.0, - "7175": 33306417152.0, - "7180": 33307107328.0, - "7185": 33307860992.0, - "7190": 33307078656.0, - "7195": 33307494400.0, - "7200": 33307613184.0, - "7205": 33307680768.0, - "7210": 33307990016.0, - "7215": 33306822656.0, - "7220": 33306730496.0, - "7225": 33307539456.0, - "7230": 33307744256.0, - "7235": 33306136576.0, - "7240": 33307189248.0, - "7245": 33307236352.0, - "7250": 33306980352.0, - "7255": 33307832320.0, - "7260": 33307426816.0, - "7265": 33307340800.0, - "7270": 33307844608.0, - "7275": 33308094464.0, - "7280": 33308602368.0, - "7285": 33307498496.0, - "7290": 33307920384.0, - "7295": 33307426816.0, - "7300": 33306392576.0, - "7305": 33306718208.0, - "7310": 33307260928.0, - "7315": 33307527168.0, - "7320": 33306963968.0, - "7325": 33308188672.0, - "7330": 33307799552.0, - "7335": 33307717632.0, - "7340": 33307238400.0, - "7345": 33307365376.0, - "7350": 33307314176.0, - "7355": 33307940864.0, - "7360": 33306284032.0, - "7365": 33307893760.0, - "7370": 33306275840.0, - "7375": 33307873280.0, - "7380": 33309245440.0, - "7385": 33306730496.0, - "7390": 33307758592.0, - "7395": 33306609664.0, - "7400": 
33307652096.0, - "7405": 33306427392.0, - "7410": 33308524544.0, - "7415": 33307961344.0, - "7420": 33307242496.0, - "7425": 33307811840.0, - "7430": 33307119616.0, - "7435": 33307428864.0, - "7440": 33307709440.0, - "7445": 33308342272.0, - "7450": 33306980352.0, - "7455": 33307351040.0, - "7460": 33306730496.0, - "7465": 33306537984.0, - "7470": 33307664384.0, - "7475": 33308037120.0, - "7480": 33307179008.0, - "7485": 33308467200.0, - "7490": 33307822080.0, - "7495": 33306638336.0, - "7500": 33306689536.0, - "7505": 33307717632.0, - "7510": 33306789888.0, - "7515": 33307518976.0, - "7520": 33307260928.0, - "7525": 33307676672.0, - "7530": 33306916864.0, - "7535": 33306996736.0, - "7540": 33306566656.0, - "7545": 33306720256.0, - "7550": 33307584512.0, - "7555": 33307471872.0, - "7560": 33306736640.0, - "7565": 33306292224.0, - "7570": 33307066368.0, - "7575": 33306871808.0, - "7580": 33307324416.0, - "7585": 33307115520.0, - "7590": 33306341376.0, - "7595": 33307744256.0, - "7600": 33307482112.0, - "7605": 33308149760.0, - "7610": 33307525120.0, - "7615": 33307656192.0, - "7620": 33307224064.0, - "7625": 33307158528.0, - "7630": 33307742208.0, - "7635": 33308012544.0, - "7640": 33307049984.0, - "7645": 33308631040.0, - "7650": 33307865088.0, - "7655": 33308229632.0, - "7660": 33307043840.0, - "7665": 33307037696.0, - "7670": 33306791936.0, - "7675": 33307320320.0, - "7680": 33307293696.0, - "7685": 33307432960.0, - "7690": 33307103232.0, - "7695": 33307568128.0, - "7700": 33306312704.0, - "7705": 33307795456.0, - "7710": 33307996160.0, - "7715": 33307133952.0, - "7720": 33308164096.0, - "7725": 33307254784.0, - "7730": 33307830272.0, - "7735": 33307721728.0, - "7740": 33307492352.0, - "7745": 33307783168.0, - "7750": 33306728448.0, - "7755": 33307734016.0, - "7760": 33308614656.0, - "7765": 33306791936.0, - "7770": 33308278784.0, - "7775": 33307873280.0, - "7780": 33307078656.0, - "7785": 33306990592.0, - "7790": 33307062272.0, - "7795": 33307680768.0, - "7800": 
33306982400.0, - "7805": 33308090368.0, - "7810": 33307308032.0, - "7815": 33307078656.0, - "7820": 33307951104.0, - "7825": 33306480640.0, - "7830": 33307258880.0, - "7835": 33307891712.0, - "7840": 33307432960.0, - "7845": 33307066368.0, - "7850": 33306910720.0, - "7855": 33307938816.0, - "7860": 33307308032.0, - "7865": 33308264448.0, - "7870": 33307729920.0, - "7875": 33308129280.0, - "7880": 33308352512.0, - "7885": 33307398144.0, - "7890": 33306920960.0, - "7895": 33307156480.0, - "7900": 33308221440.0, - "7905": 33308047360.0, - "7910": 33306146816.0, - "7915": 33306910720.0, - "7920": 33307090944.0, - "7925": 33308264448.0, - "7930": 33307908096.0, - "7935": 33307465728.0, - "7940": 33307375616.0, - "7945": 33307848704.0, - "7950": 33308090368.0, - "7955": 33307043840.0, - "7960": 33307168768.0, - "7965": 33307846656.0, - "7970": 33306454016.0, - "7975": 33307635712.0, - "7980": 33307555840.0, - "7985": 33307131904.0, - "7990": 33306732544.0, - "7995": 33307430912.0, - "8000": 33307674624.0, - "8005": 33307746304.0, - "8010": 33308002304.0, - "8015": 33306906624.0, - "8020": 33307895808.0, - "8025": 33308231680.0, - "8030": 33307664384.0, - "8035": 33306888192.0, - "8040": 33308024832.0, - "8045": 33307693056.0, - "8050": 33306583040.0, - "8055": 33307201536.0, - "8060": 33307594752.0, - "8065": 33308260352.0, - "8070": 33307426816.0, - "8075": 33308108800.0, - "8080": 33308178432.0, - "8085": 33307308032.0, - "8090": 33306513408.0, - "8095": 33306968064.0, - "8100": 33308413952.0, - "8105": 33308241920.0, - "8110": 33307471872.0, - "8115": 33307832320.0, - "8120": 33307193344.0, - "8125": 33307295744.0, - "8130": 33306775552.0, - "8135": 33307097088.0, - "8140": 33307865088.0, - "8145": 33306746880.0, - "8150": 33307023360.0, - "8155": 33306806272.0, - "8160": 33307373568.0, - "8165": 33307631616.0, - "8170": 33306769408.0, - "8175": 33308239872.0, - "8180": 33307240448.0, - "8185": 33307471872.0, - "8190": 33308184576.0, - "8195": 33307754496.0, - "8200": 
33307459584.0, - "8205": 33307850752.0, - "8210": 33306810368.0, - "8215": 33306222592.0, - "8220": 33307795456.0, - "8225": 33308078080.0, - "8230": 33306132480.0, - "8235": 33308764160.0, - "8240": 33307432960.0, - "8245": 33307867136.0, - "8250": 33308260352.0, - "8255": 33308334080.0, - "8260": 33308233728.0, - "8265": 33308528640.0, - "8270": 33307699200.0, - "8275": 33306748928.0, - "8280": 33307635712.0, - "8285": 33308008448.0, - "8290": 33307590656.0, - "8295": 33308041216.0, - "8300": 33307516928.0, - "8305": 33307879424.0, - "8310": 33307576320.0, - "8315": 33308366848.0, - "8320": 33307496448.0, - "8325": 33307256832.0, - "8330": 33307680768.0, - "8335": 33306669056.0, - "8340": 33306990592.0, - "8345": 33307936768.0, - "8350": 33307955200.0, - "8355": 33307791360.0, - "8360": 33306640384.0, - "8365": 33307586560.0, - "8370": 33307648000.0, - "8375": 33306890240.0, - "8380": 33307764736.0, - "8385": 33307871232.0, - "8390": 33307023360.0, - "8395": 33307664384.0, - "8400": 33307510784.0, - "8405": 33307338752.0, - "8410": 33307316224.0, - "8415": 33307566080.0, - "8420": 33307891712.0, - "8425": 33307676672.0, - "8430": 33307693056.0, - "8435": 33306812416.0, - "8440": 33307762688.0, - "8445": 33307447296.0, - "8450": 33307426816.0, - "8455": 33306660864.0, - "8460": 33307385856.0, - "8465": 33308121088.0, - "8470": 33307664384.0, - "8475": 33307023360.0, - "8480": 33308082176.0, - "8485": 33307346944.0, - "8490": 33307471872.0, - "8495": 33307889664.0, - "8500": 33307492352.0, - "8505": 33307502592.0, - "8510": 33307815936.0, - "8515": 33307983872.0, - "8520": 33306431488.0, - "8525": 33306537984.0, - "8530": 33307199488.0, - "8535": 33307848704.0, - "8540": 33307459584.0, - "8545": 33307432960.0, - "8550": 33307600896.0, - "8555": 33308553216.0, - "8560": 33307701248.0, - "8565": 33307799552.0, - "8570": 33307934720.0, - "8575": 33306324992.0, - "8580": 33307648000.0, - "8585": 33307951104.0, - "8590": 33308108800.0, - "8595": 33308037120.0, - "8600": 
33308182528.0, - "8605": 33307410432.0, - "8610": 33308102656.0, - "8615": 33307342848.0, - "8620": 33306077184.0, - "8625": 33308153856.0, - "8630": 33307807744.0, - "8635": 33306734592.0, - "8640": 33307867136.0, - "8645": 33307129856.0, - "8650": 33307430912.0, - "8655": 33307545600.0, - "8660": 33307975680.0, - "8665": 33307822080.0, - "8670": 33307156480.0, - "8675": 33307758592.0, - "8680": 33308340224.0, - "8685": 33307357184.0, - "8690": 33308479488.0, - "8695": 33306523648.0, - "8700": 33307404288.0, - "8705": 33307791360.0, - "8710": 33308004352.0, - "8715": 33308108800.0, - "8720": 33307424768.0, - "8725": 33307564032.0, - "8730": 33306877952.0, - "8735": 33307199488.0, - "8740": 33307734016.0, - "8745": 33307248640.0, - "8750": 33307912192.0, - "8755": 33307215872.0, - "8760": 33308012544.0, - "8765": 33306640384.0, - "8770": 33307977728.0, - "8775": 33306624000.0, - "8780": 33307357184.0, - "8785": 33306353664.0, - "8790": 33307518976.0, - "8795": 33308178432.0, - "8800": 33307113472.0, - "8805": 33307045888.0, - "8810": 33307252736.0, - "8815": 33307430912.0, - "8820": 33307568128.0, - "8825": 33306791936.0, - "8830": 33307529216.0, - "8835": 33306691584.0, - "8840": 33306529792.0, - "8845": 33307303936.0, - "8850": 33307901952.0, - "8855": 33308196864.0, - "8860": 33307965440.0, - "8865": 33307971584.0, - "8870": 33306595328.0, - "8875": 33306419200.0, - "8880": 33307508736.0, - "8885": 33306345472.0, - "8890": 33307373568.0, - "8895": 33307631616.0, - "8900": 33307330560.0, - "8905": 33308209152.0, - "8910": 33308155904.0, - "8915": 33306943488.0, - "8920": 33307381760.0, - "8925": 33307437056.0, - "8930": 33308041216.0, - "8935": 33307142144.0, - "8940": 33307768832.0, - "8945": 33308551168.0, - "8950": 33307682816.0, - "8955": 33307656192.0, - "8960": 33307787264.0, - "8965": 33306220544.0, - "8970": 33307693056.0, - "8975": 33307529216.0, - "8980": 33307027456.0, - "8985": 33308442624.0, - "8990": 33307588608.0, - "8995": 33308315648.0, - "9000": 
33307787264.0, - "9005": 33307951104.0, - "9010": 33305649152.0, - "9015": 33307592704.0, - "9020": 33307033600.0, - "9025": 33307232256.0, - "9030": 33307793408.0, - "9035": 33307385856.0, - "9040": 33308012544.0, - "9045": 33307287552.0, - "9050": 33307701248.0, - "9055": 33306814464.0, - "9060": 33307975680.0, - "9065": 33307693056.0, - "9070": 33306888192.0, - "9075": 33307168768.0, - "9080": 33306818560.0, - "9085": 33307557888.0, - "9090": 33308200960.0, - "9095": 33306867712.0, - "9100": 33308563456.0, - "9105": 33306994688.0, - "9110": 33307004928.0, - "9115": 33307439104.0, - "9120": 33307340800.0, - "9125": 33307295744.0, - "9130": 33306771456.0, - "9135": 33307031552.0, - "9140": 33306497024.0, - "9145": 33307629568.0, - "9150": 33308002304.0, - "9155": 33307484160.0, - "9160": 33308100608.0, - "9165": 33307611136.0, - "9170": 33307897856.0, - "9175": 33307473920.0, - "9180": 33307977728.0, - "9185": 33307203584.0, - "9190": 33306693632.0, - "9195": 33306931200.0, - "9200": 33307779072.0, - "9205": 33307205632.0, - "9210": 33307637760.0, - "9215": 33307090944.0, - "9220": 33308454912.0, - "9225": 33307471872.0, - "9230": 33307322368.0, - "9235": 33307422720.0, - "9240": 33307242496.0, - "9245": 33308026880.0, - "9250": 33308203008.0, - "9255": 33307389952.0, - "9260": 33308825600.0, - "9265": 33306505216.0, - "9270": 33307426816.0, - "9275": 33307865088.0, - "9280": 33307435008.0, - "9285": 33307258880.0, - "9290": 33308000256.0, - "9295": 33307498496.0, - "9300": 33307301888.0, - "9305": 33307674624.0, - "9310": 33307031552.0, - "9315": 33306327040.0, - "9320": 33306834944.0, - "9325": 33307971584.0, - "9330": 33307910144.0, - "9335": 33307213824.0, - "9340": 33307385856.0, - "9345": 33307385856.0, - "9350": 33308127232.0, - "9355": 33306615808.0, - "9360": 33306697728.0, - "9365": 33307463680.0, - "9370": 33306355712.0, - "9375": 33307219968.0, - "9380": 33307224064.0, - "9385": 33308024832.0, - "9390": 33307830272.0, - "9395": 33307535360.0, - "9400": 
33307031552.0, - "9405": 33307418624.0, - "9410": 33306822656.0, - "9415": 33307267072.0, - "9420": 33306994688.0, - "9425": 33306892288.0, - "9430": 33307199488.0, - "9435": 33306980352.0, - "9440": 33306451968.0, - "9445": 33308420096.0, - "9450": 33306755072.0, - "9455": 33306341376.0, - "9460": 33308131328.0, - "9465": 33307023360.0, - "9470": 33308307456.0, - "9475": 33308221440.0, - "9480": 33308037120.0, - "9485": 33308055552.0, - "9490": 33307908096.0, - "9495": 33306486784.0, - "9500": 33306490880.0, - "9505": 33307967488.0, - "9510": 33307125760.0, - "9515": 33307242496.0, - "9520": 33307670528.0, - "9525": 33307496448.0, - "9530": 33307731968.0, - "9535": 33307435008.0 - } - }, - "mem-max-allocated-bytes": { - "start_step": 1, - "end_step": 9535, - "step_interval": 5, - "values": { - "1": 36905754624.0, - "5": 45014786048.0, - "10": 45173362688.0, - "15": 45173362688.0, - "20": 45251878912.0, - "25": 45286207488.0, - "30": 45286207488.0, - "35": 45288939520.0, - "40": 45288939520.0, - "45": 45288939520.0, - "50": 45288939520.0, - "55": 45288939520.0, - "60": 45288939520.0, - "65": 45288939520.0, - "70": 45288939520.0, - "75": 45288939520.0, - "80": 45288939520.0, - "85": 45288939520.0, - "90": 45288939520.0, - "95": 45288939520.0, - "100": 45288939520.0, - "105": 45288939520.0, - "110": 45299392512.0, - "115": 45314936832.0, - "120": 45378736128.0, - "125": 45428596736.0, - "130": 45428596736.0, - "135": 45445640192.0, - "140": 45445640192.0, - "145": 45445640192.0, - "150": 45445640192.0, - "155": 45445640192.0, - "160": 45445640192.0, - "165": 45445640192.0, - "170": 45445640192.0, - "175": 45445640192.0, - "180": 45445640192.0, - "185": 45445640192.0, - "190": 45445640192.0, - "195": 45445640192.0, - "200": 45536641024.0, - "205": 45638885376.0, - "210": 45638885376.0, - "215": 45638885376.0, - "220": 45638885376.0, - "225": 45638885376.0, - "230": 45638885376.0, - "235": 45713887232.0, - "240": 45932376064.0, - "245": 45982269440.0, - "250": 
45982269440.0, - "255": 45982269440.0, - "260": 46039670784.0, - "265": 46039670784.0, - "270": 46039670784.0, - "275": 46039670784.0, - "280": 46293884928.0, - "285": 46293884928.0, - "290": 46293884928.0, - "295": 46293884928.0, - "300": 46293884928.0, - "305": 46319267840.0, - "310": 46319267840.0, - "315": 46319267840.0, - "320": 46319267840.0, - "325": 46319267840.0, - "330": 46319267840.0, - "335": 46319267840.0, - "340": 46319267840.0, - "345": 46451261440.0, - "350": 46451261440.0, - "355": 46451261440.0, - "360": 46451261440.0, - "365": 46451261440.0, - "370": 46451261440.0, - "375": 46451261440.0, - "380": 46451261440.0, - "385": 46451261440.0, - "390": 46451261440.0, - "395": 46451261440.0, - "400": 46451261440.0, - "405": 46451261440.0, - "410": 46451261440.0, - "415": 46451261440.0, - "420": 46451261440.0, - "425": 46451261440.0, - "430": 46451261440.0, - "435": 46451261440.0, - "440": 46451261440.0, - "445": 46451261440.0, - "450": 46451261440.0, - "455": 46451261440.0, - "460": 46451261440.0, - "465": 46451261440.0, - "470": 46451261440.0, - "475": 46451261440.0, - "480": 46451261440.0, - "485": 46451261440.0, - "490": 46451261440.0, - "495": 46451261440.0, - "500": 46451261440.0, - "505": 46451261440.0, - "510": 46451261440.0, - "515": 46451261440.0, - "520": 46451261440.0, - "525": 46451261440.0, - "530": 46451261440.0, - "535": 46451261440.0, - "540": 46451261440.0, - "545": 46451261440.0, - "550": 46451261440.0, - "555": 46451261440.0, - "560": 46451261440.0, - "565": 46451261440.0, - "570": 46451261440.0, - "575": 46451261440.0, - "580": 46451261440.0, - "585": 46451261440.0, - "590": 46451261440.0, - "595": 46451261440.0, - "600": 46451261440.0, - "605": 46451261440.0, - "610": 46451261440.0, - "615": 46451261440.0, - "620": 46451261440.0, - "625": 46451261440.0, - "630": 46451261440.0, - "635": 46451261440.0, - "640": 46451261440.0, - "645": 46451261440.0, - "650": 46451261440.0, - "655": 46451261440.0, - "660": 46451261440.0, - "665": 
46451261440.0, - "670": 46451261440.0, - "675": 46451261440.0, - "680": 46451261440.0, - "685": 46451261440.0, - "690": 46451261440.0, - "695": 46451261440.0, - "700": 46451261440.0, - "705": 46451261440.0, - "710": 46451261440.0, - "715": 46451261440.0, - "720": 46451261440.0, - "725": 46451261440.0, - "730": 46451261440.0, - "735": 46451261440.0, - "740": 46451261440.0, - "745": 46451261440.0, - "750": 46451261440.0, - "755": 46451261440.0, - "760": 46451261440.0, - "765": 46451261440.0, - "770": 46451261440.0, - "775": 46451261440.0, - "780": 46451261440.0, - "785": 46451261440.0, - "790": 46451261440.0, - "795": 46451261440.0, - "800": 46451261440.0, - "805": 46451261440.0, - "810": 46451261440.0, - "815": 46451261440.0, - "820": 46451261440.0, - "825": 46451261440.0, - "830": 46451261440.0, - "835": 46451261440.0, - "840": 46451261440.0, - "845": 46451261440.0, - "850": 46451261440.0, - "855": 46451261440.0, - "860": 46451261440.0, - "865": 46451261440.0, - "870": 46451261440.0, - "875": 46451261440.0, - "880": 46451261440.0, - "885": 46451261440.0, - "890": 46451261440.0, - "895": 46451261440.0, - "900": 46451261440.0, - "905": 46451261440.0, - "910": 46451261440.0, - "915": 46451261440.0, - "920": 46451261440.0, - "925": 46451261440.0, - "930": 46451261440.0, - "935": 46451261440.0, - "940": 46451261440.0, - "945": 46451261440.0, - "950": 46451261440.0, - "955": 46451261440.0, - "960": 45564735488.0, - "965": 45952081920.0, - "970": 45952081920.0, - "975": 46005657600.0, - "980": 46005657600.0, - "985": 46005657600.0, - "990": 46005657600.0, - "995": 46169923584.0, - "1000": 46169923584.0, - "1005": 46169923584.0, - "1010": 46169923584.0, - "1015": 46169923584.0, - "1020": 46169923584.0, - "1025": 46169923584.0, - "1030": 46169923584.0, - "1035": 46169923584.0, - "1040": 46169923584.0, - "1045": 46169923584.0, - "1050": 46169923584.0, - "1055": 46169923584.0, - "1060": 46169923584.0, - "1065": 46169923584.0, - "1070": 46169923584.0, - "1075": 46169923584.0, 
- "1080": 46169923584.0, - "1085": 46169923584.0, - "1090": 46169923584.0, - "1095": 46169923584.0, - "1100": 46169923584.0, - "1105": 46169923584.0, - "1110": 46169923584.0, - "1115": 46169923584.0, - "1120": 46169923584.0, - "1125": 46169923584.0, - "1130": 46169923584.0, - "1135": 46169923584.0, - "1140": 46169923584.0, - "1145": 46169923584.0, - "1150": 46169923584.0, - "1155": 46169923584.0, - "1160": 46169923584.0, - "1165": 46169923584.0, - "1170": 46169923584.0, - "1175": 46169923584.0, - "1180": 46192005120.0, - "1185": 46192005120.0, - "1190": 46192005120.0, - "1195": 46192005120.0, - "1200": 46192005120.0, - "1205": 46192005120.0, - "1210": 46192005120.0, - "1215": 46192005120.0, - "1220": 46192005120.0, - "1225": 46192005120.0, - "1230": 46192005120.0, - "1235": 46192005120.0, - "1240": 46192005120.0, - "1245": 46192005120.0, - "1250": 46192005120.0, - "1255": 46192005120.0, - "1260": 46192005120.0, - "1265": 46192005120.0, - "1270": 46192005120.0, - "1275": 46192005120.0, - "1280": 46192005120.0, - "1285": 46192005120.0, - "1290": 46192005120.0, - "1295": 46192005120.0, - "1300": 46192005120.0, - "1305": 46192005120.0, - "1310": 46192005120.0, - "1315": 46192005120.0, - "1320": 46192005120.0, - "1325": 46192005120.0, - "1330": 46192005120.0, - "1335": 46192005120.0, - "1340": 46192005120.0, - "1345": 46192005120.0, - "1350": 46192005120.0, - "1355": 46192005120.0, - "1360": 46192005120.0, - "1365": 46192005120.0, - "1370": 46192005120.0, - "1375": 46192005120.0, - "1380": 46192005120.0, - "1385": 46192005120.0, - "1390": 46192005120.0, - "1395": 46192005120.0, - "1400": 46192005120.0, - "1405": 46192005120.0, - "1410": 46192005120.0, - "1415": 46192005120.0, - "1420": 46192005120.0, - "1425": 46192005120.0, - "1430": 46192005120.0, - "1435": 46192005120.0, - "1440": 46192005120.0, - "1445": 46192005120.0, - "1450": 46192005120.0, - "1455": 46192005120.0, - "1460": 46192005120.0, - "1465": 46192005120.0, - "1470": 46192005120.0, - "1475": 46192005120.0, 
- "1480": 46192005120.0, - "1485": 46192005120.0, - "1490": 46192005120.0, - "1495": 46192005120.0, - "1500": 46192005120.0, - "1505": 46192005120.0, - "1510": 46192005120.0, - "1515": 46192005120.0, - "1520": 46192005120.0, - "1525": 46192005120.0, - "1530": 46192005120.0, - "1535": 46192005120.0, - "1540": 46192005120.0, - "1545": 46192005120.0, - "1550": 46260322304.0, - "1555": 46260322304.0, - "1560": 46260322304.0, - "1565": 46260322304.0, - "1570": 46260322304.0, - "1575": 46260322304.0, - "1580": 46260322304.0, - "1585": 46260322304.0, - "1590": 46260322304.0, - "1595": 46260322304.0, - "1600": 46260322304.0, - "1605": 46260322304.0, - "1610": 46260322304.0, - "1615": 46260322304.0, - "1620": 46260322304.0, - "1625": 46260322304.0, - "1630": 46260322304.0, - "1635": 46260322304.0, - "1640": 46260322304.0, - "1645": 46260322304.0, - "1650": 46260322304.0, - "1655": 46260322304.0, - "1660": 46260322304.0, - "1665": 46260322304.0, - "1670": 46260322304.0, - "1675": 46260322304.0, - "1680": 46260322304.0, - "1685": 46260322304.0, - "1690": 46260322304.0, - "1695": 46260322304.0, - "1700": 46260322304.0, - "1705": 46260322304.0, - "1710": 46260322304.0, - "1715": 46260322304.0, - "1720": 46260322304.0, - "1725": 46260322304.0, - "1730": 46260322304.0, - "1735": 46260322304.0, - "1740": 46260322304.0, - "1745": 46260322304.0, - "1750": 46260322304.0, - "1755": 46260322304.0, - "1760": 46260322304.0, - "1765": 46260322304.0, - "1770": 46260322304.0, - "1775": 46260322304.0, - "1780": 46260322304.0, - "1785": 46260322304.0, - "1790": 46260322304.0, - "1795": 46260322304.0, - "1800": 46260322304.0, - "1805": 46260322304.0, - "1810": 46260322304.0, - "1815": 46260322304.0, - "1820": 46260322304.0, - "1825": 46260322304.0, - "1830": 46260322304.0, - "1835": 46260322304.0, - "1840": 46260322304.0, - "1845": 46260322304.0, - "1850": 46260322304.0, - "1855": 46260322304.0, - "1860": 46260322304.0, - "1865": 46260322304.0, - "1870": 46260322304.0, - "1875": 46260322304.0, 
- "1880": 46260322304.0, - "1885": 46260322304.0, - "1890": 46260322304.0, - "1895": 46260322304.0, - "1900": 46260322304.0, - "1905": 46260322304.0, - "1910": 46260322304.0, - "1915": 46260322304.0, - "1920": 46260322304.0, - "1925": 46260322304.0, - "1930": 46260322304.0, - "1935": 46260322304.0, - "1940": 46260322304.0, - "1945": 46260322304.0, - "1950": 46260322304.0, - "1955": 46260322304.0, - "1960": 46260322304.0, - "1965": 46260322304.0, - "1970": 46260322304.0, - "1975": 46261714944.0, - "1980": 46261714944.0, - "1985": 46261714944.0, - "1990": 46261714944.0, - "1995": 46261714944.0, - "2000": 46261714944.0, - "2005": 46261714944.0, - "2010": 46261714944.0, - "2015": 46261714944.0, - "2020": 46261714944.0, - "2025": 46261714944.0, - "2030": 46261714944.0, - "2035": 46261714944.0, - "2040": 46261714944.0, - "2045": 46261714944.0, - "2050": 46261714944.0, - "2055": 46261714944.0, - "2060": 46261714944.0, - "2065": 46261714944.0, - "2070": 46261714944.0, - "2075": 46261714944.0, - "2080": 46261714944.0, - "2085": 46261714944.0, - "2090": 46261714944.0, - "2095": 46261714944.0, - "2100": 46261714944.0, - "2105": 46261714944.0, - "2110": 46261714944.0, - "2115": 46261714944.0, - "2120": 46261714944.0, - "2125": 46261714944.0, - "2130": 46261714944.0, - "2135": 46261714944.0, - "2140": 46261714944.0, - "2145": 46261714944.0, - "2150": 46261714944.0, - "2155": 46261714944.0, - "2160": 46261714944.0, - "2165": 46261714944.0, - "2170": 46261714944.0, - "2175": 46261714944.0, - "2180": 46261714944.0, - "2185": 46261714944.0, - "2190": 46261714944.0, - "2195": 46261714944.0, - "2200": 46261714944.0, - "2205": 46261714944.0, - "2210": 46261714944.0, - "2215": 46261714944.0, - "2220": 46261714944.0, - "2225": 46261714944.0, - "2230": 46261714944.0, - "2235": 46261714944.0, - "2240": 46261714944.0, - "2245": 46261714944.0, - "2250": 46261714944.0, - "2255": 46261714944.0, - "2260": 46261714944.0, - "2265": 46261714944.0, - "2270": 46261714944.0, - "2275": 46261714944.0, 
- "2280": 46261714944.0, - "2285": 46261714944.0, - "2290": 46261714944.0, - "2295": 46261714944.0, - "2300": 46261714944.0, - "2305": 46261714944.0, - "2310": 46261714944.0, - "2315": 46261714944.0, - "2320": 46261714944.0, - "2325": 46261714944.0, - "2330": 46261714944.0, - "2335": 46261714944.0, - "2340": 46261714944.0, - "2345": 46261714944.0, - "2350": 46261714944.0, - "2355": 46261714944.0, - "2360": 46261714944.0, - "2365": 46261714944.0, - "2370": 46261714944.0, - "2375": 46261714944.0, - "2380": 46261714944.0, - "2385": 46261714944.0, - "2390": 46261714944.0, - "2395": 46261714944.0, - "2400": 46261714944.0, - "2405": 46261714944.0, - "2410": 46261714944.0, - "2415": 46261714944.0, - "2420": 46261714944.0, - "2425": 46261714944.0, - "2430": 46261714944.0, - "2435": 46261714944.0, - "2440": 46261714944.0, - "2445": 46261714944.0, - "2450": 46261714944.0, - "2455": 46261714944.0, - "2460": 46261714944.0, - "2465": 46261714944.0, - "2470": 46261714944.0, - "2475": 46261714944.0, - "2480": 46261714944.0, - "2485": 46261714944.0, - "2490": 46261714944.0, - "2495": 46261714944.0, - "2500": 46261714944.0, - "2505": 46261714944.0, - "2510": 46261714944.0, - "2515": 46261714944.0, - "2520": 46261714944.0, - "2525": 46261714944.0, - "2530": 46261714944.0, - "2535": 46261714944.0, - "2540": 46261714944.0, - "2545": 46261714944.0, - "2550": 46261714944.0, - "2555": 46261714944.0, - "2560": 46261714944.0, - "2565": 46261714944.0, - "2570": 46261714944.0, - "2575": 46261714944.0, - "2580": 46261714944.0, - "2585": 46261714944.0, - "2590": 46261714944.0, - "2595": 46261714944.0, - "2600": 46261714944.0, - "2605": 46261714944.0, - "2610": 46261714944.0, - "2615": 46261714944.0, - "2620": 46261714944.0, - "2625": 46261714944.0, - "2630": 46261714944.0, - "2635": 46261714944.0, - "2640": 46261714944.0, - "2645": 46261714944.0, - "2650": 46261714944.0, - "2655": 46261714944.0, - "2660": 46261714944.0, - "2665": 46261714944.0, - "2670": 46261714944.0, - "2675": 46261714944.0, 
- "2680": 46261714944.0, - "2685": 46261714944.0, - "2690": 46261714944.0, - "2695": 46261714944.0, - "2700": 46261714944.0, - "2705": 46261714944.0, - "2710": 46261714944.0, - "2715": 46261714944.0, - "2720": 46261714944.0, - "2725": 46261714944.0, - "2730": 46261714944.0, - "2735": 46261714944.0, - "2740": 46261714944.0, - "2745": 46261714944.0, - "2750": 46261714944.0, - "2755": 46261714944.0, - "2760": 46261714944.0, - "2765": 46261714944.0, - "2770": 46261714944.0, - "2775": 46261714944.0, - "2780": 46261714944.0, - "2785": 46261714944.0, - "2790": 46261714944.0, - "2795": 46261714944.0, - "2800": 46261714944.0, - "2805": 46261714944.0, - "2810": 46261714944.0, - "2815": 46261714944.0, - "2820": 46261714944.0, - "2825": 46261714944.0, - "2830": 46261714944.0, - "2835": 46261714944.0, - "2840": 46261714944.0, - "2845": 46261714944.0, - "2850": 46261714944.0, - "2855": 46261714944.0, - "2860": 46261714944.0, - "2865": 46261714944.0, - "2870": 46261714944.0, - "2875": 46261714944.0, - "2880": 46261714944.0, - "2885": 46261714944.0, - "2890": 46261714944.0, - "2895": 46261714944.0, - "2900": 46261714944.0, - "2905": 46261714944.0, - "2910": 46261714944.0, - "2915": 46261714944.0, - "2920": 46261714944.0, - "2925": 46261714944.0, - "2930": 46261714944.0, - "2935": 46261714944.0, - "2940": 46261714944.0, - "2945": 46261714944.0, - "2950": 46261714944.0, - "2955": 46261714944.0, - "2960": 46261714944.0, - "2965": 46261714944.0, - "2970": 46261714944.0, - "2975": 46261714944.0, - "2980": 46261714944.0, - "2985": 45706711040.0, - "2990": 45883699200.0, - "2995": 46072287232.0, - "3000": 46072287232.0, - "3005": 46072287232.0, - "3010": 46072287232.0, - "3015": 46072287232.0, - "3020": 46072287232.0, - "3025": 46072287232.0, - "3030": 46072287232.0, - "3035": 46072287232.0, - "3040": 46072287232.0, - "3045": 46072287232.0, - "3050": 46072287232.0, - "3055": 46072287232.0, - "3060": 46072287232.0, - "3065": 46072287232.0, - "3070": 46072287232.0, - "3075": 46072287232.0, 
- "3080": 46072287232.0, - "3085": 46072287232.0, - "3090": 46072287232.0, - "3095": 46072287232.0, - "3100": 46072287232.0, - "3105": 46072287232.0, - "3110": 46072287232.0, - "3115": 46072287232.0, - "3120": 46072287232.0, - "3125": 46072287232.0, - "3130": 46072287232.0, - "3135": 46072287232.0, - "3140": 46072287232.0, - "3145": 46072287232.0, - "3150": 46072287232.0, - "3155": 46072287232.0, - "3160": 46072287232.0, - "3165": 46072287232.0, - "3170": 46072287232.0, - "3175": 46072287232.0, - "3180": 46072287232.0, - "3185": 46072287232.0, - "3190": 46072287232.0, - "3195": 46072287232.0, - "3200": 46072287232.0, - "3205": 46072287232.0, - "3210": 46072287232.0, - "3215": 46072287232.0, - "3220": 46072287232.0, - "3225": 46072287232.0, - "3230": 46072287232.0, - "3235": 46072287232.0, - "3240": 46072287232.0, - "3245": 46072287232.0, - "3250": 46072287232.0, - "3255": 46072287232.0, - "3260": 46072287232.0, - "3265": 46072287232.0, - "3270": 46072287232.0, - "3275": 46072287232.0, - "3280": 46072287232.0, - "3285": 46072287232.0, - "3290": 46072287232.0, - "3295": 46072287232.0, - "3300": 46072287232.0, - "3305": 46072287232.0, - "3310": 46072287232.0, - "3315": 46072287232.0, - "3320": 46072287232.0, - "3325": 46072287232.0, - "3330": 46072287232.0, - "3335": 46072287232.0, - "3340": 46072287232.0, - "3345": 46072287232.0, - "3350": 46072287232.0, - "3355": 46072287232.0, - "3360": 46072287232.0, - "3365": 46072287232.0, - "3370": 46072287232.0, - "3375": 46072287232.0, - "3380": 46072287232.0, - "3385": 46072287232.0, - "3390": 46072287232.0, - "3395": 46072287232.0, - "3400": 46072287232.0, - "3405": 46072287232.0, - "3410": 46072287232.0, - "3415": 46072287232.0, - "3420": 46072287232.0, - "3425": 46072672256.0, - "3430": 46072672256.0, - "3435": 46072672256.0, - "3440": 46072672256.0, - "3445": 46072672256.0, - "3450": 46072672256.0, - "3455": 46072672256.0, - "3460": 46072672256.0, - "3465": 46072672256.0, - "3470": 46072672256.0, - "3475": 46072672256.0, 
- "3480": 46072672256.0, - "3485": 46095564800.0, - "3490": 46095564800.0, - "3495": 46095564800.0, - "3500": 46095564800.0, - "3505": 46095564800.0, - "3510": 46095564800.0, - "3515": 46095564800.0, - "3520": 46095564800.0, - "3525": 46095564800.0, - "3530": 46095564800.0, - "3535": 46095564800.0, - "3540": 46095564800.0, - "3545": 46095564800.0, - "3550": 46191697920.0, - "3555": 46191697920.0, - "3560": 46191697920.0, - "3565": 46191697920.0, - "3570": 46191697920.0, - "3575": 46191697920.0, - "3580": 46191697920.0, - "3585": 46191697920.0, - "3590": 46191697920.0, - "3595": 46191697920.0, - "3600": 46191697920.0, - "3605": 46191697920.0, - "3610": 46191697920.0, - "3615": 46191697920.0, - "3620": 46191697920.0, - "3625": 46191697920.0, - "3630": 46191697920.0, - "3635": 46191697920.0, - "3640": 46191697920.0, - "3645": 46191697920.0, - "3650": 46191697920.0, - "3655": 46191697920.0, - "3660": 46191697920.0, - "3665": 46191697920.0, - "3670": 46191697920.0, - "3675": 46191697920.0, - "3680": 46191697920.0, - "3685": 46191697920.0, - "3690": 46191697920.0, - "3695": 46191697920.0, - "3700": 46191697920.0, - "3705": 46191697920.0, - "3710": 46191697920.0, - "3715": 46191697920.0, - "3720": 46191697920.0, - "3725": 46191697920.0, - "3730": 46191697920.0, - "3735": 46191697920.0, - "3740": 46191697920.0, - "3745": 46191697920.0, - "3750": 46191697920.0, - "3755": 46191697920.0, - "3760": 46191697920.0, - "3765": 46191697920.0, - "3770": 46191697920.0, - "3775": 46191697920.0, - "3780": 46191697920.0, - "3785": 46191697920.0, - "3790": 46191697920.0, - "3795": 46191697920.0, - "3800": 46191697920.0, - "3805": 46191697920.0, - "3810": 46191697920.0, - "3815": 46191697920.0, - "3820": 46191697920.0, - "3825": 46191697920.0, - "3830": 46191697920.0, - "3835": 46191697920.0, - "3840": 46191697920.0, - "3845": 46191697920.0, - "3850": 46191697920.0, - "3855": 46191697920.0, - "3860": 46191697920.0, - "3865": 46191697920.0, - "3870": 46191697920.0, - "3875": 46191697920.0, 
- "3880": 46191697920.0, - "3885": 46191697920.0, - "3890": 46191697920.0, - "3895": 46191697920.0, - "3900": 46191697920.0, - "3905": 46191697920.0, - "3910": 46191697920.0, - "3915": 46191697920.0, - "3920": 46191697920.0, - "3925": 46191697920.0, - "3930": 46191697920.0, - "3935": 46191697920.0, - "3940": 46191697920.0, - "3945": 46191697920.0, - "3950": 46191697920.0, - "3955": 46191697920.0, - "3960": 46191697920.0, - "3965": 46191697920.0, - "3970": 46191697920.0, - "3975": 46191697920.0, - "3980": 46191697920.0, - "3985": 46191697920.0, - "3990": 46191697920.0, - "3995": 46191697920.0, - "4000": 45840449536.0, - "4005": 45869191168.0, - "4010": 45897973760.0, - "4015": 45897973760.0, - "4020": 45940301824.0, - "4025": 45940301824.0, - "4030": 45940301824.0, - "4035": 45940301824.0, - "4040": 45940301824.0, - "4045": 45940301824.0, - "4050": 45940301824.0, - "4055": 45940301824.0, - "4060": 45940301824.0, - "4065": 45940301824.0, - "4070": 45940301824.0, - "4075": 45940301824.0, - "4080": 45940301824.0, - "4085": 46009651200.0, - "4090": 46009651200.0, - "4095": 46009651200.0, - "4100": 46009651200.0, - "4105": 46009651200.0, - "4110": 46009651200.0, - "4115": 46009651200.0, - "4120": 46009651200.0, - "4125": 46009651200.0, - "4130": 46009651200.0, - "4135": 46009651200.0, - "4140": 46009651200.0, - "4145": 46009651200.0, - "4150": 46009651200.0, - "4155": 46009651200.0, - "4160": 46009651200.0, - "4165": 46009651200.0, - "4170": 46009651200.0, - "4175": 46009651200.0, - "4180": 46009651200.0, - "4185": 46009651200.0, - "4190": 46009651200.0, - "4195": 46009651200.0, - "4200": 46009651200.0, - "4205": 46009651200.0, - "4210": 46009651200.0, - "4215": 46009651200.0, - "4220": 46009651200.0, - "4225": 46064635904.0, - "4230": 46064635904.0, - "4235": 46064635904.0, - "4240": 46064635904.0, - "4245": 46064635904.0, - "4250": 46064635904.0, - "4255": 46064635904.0, - "4260": 46064635904.0, - "4265": 46064635904.0, - "4270": 46064635904.0, - "4275": 46064635904.0, 
- "4280": 46064635904.0, - "4285": 46064635904.0, - "4290": 46064635904.0, - "4295": 46064635904.0, - "4300": 46064635904.0, - "4305": 46064635904.0, - "4310": 46064635904.0, - "4315": 46064635904.0, - "4320": 46064635904.0, - "4325": 46064635904.0, - "4330": 46064635904.0, - "4335": 46064635904.0, - "4340": 46064635904.0, - "4345": 46064635904.0, - "4350": 46064635904.0, - "4355": 46064635904.0, - "4360": 46064635904.0, - "4365": 46064635904.0, - "4370": 46064635904.0, - "4375": 46064635904.0, - "4380": 46064635904.0, - "4385": 46064635904.0, - "4390": 46064635904.0, - "4395": 46064635904.0, - "4400": 46064635904.0, - "4405": 46064635904.0, - "4410": 46064635904.0, - "4415": 46064635904.0, - "4420": 46064635904.0, - "4425": 46064635904.0, - "4430": 46064635904.0, - "4435": 46064635904.0, - "4440": 46064635904.0, - "4445": 46064635904.0, - "4450": 46064635904.0, - "4455": 46064635904.0, - "4460": 46080573440.0, - "4465": 46080573440.0, - "4470": 46080573440.0, - "4475": 46080573440.0, - "4480": 46080573440.0, - "4485": 46080573440.0, - "4490": 46080573440.0, - "4495": 46080573440.0, - "4500": 46080573440.0, - "4505": 46080573440.0, - "4510": 46080573440.0, - "4515": 46080573440.0, - "4520": 46080573440.0, - "4525": 46080573440.0, - "4530": 46080573440.0, - "4535": 46080573440.0, - "4540": 46080573440.0, - "4545": 46080573440.0, - "4550": 46080573440.0, - "4555": 46080573440.0, - "4560": 46080573440.0, - "4565": 46080573440.0, - "4570": 46080573440.0, - "4575": 46080573440.0, - "4580": 46080573440.0, - "4585": 46080573440.0, - "4590": 46080573440.0, - "4595": 46080573440.0, - "4600": 46080573440.0, - "4605": 46080573440.0, - "4610": 46080573440.0, - "4615": 46343888896.0, - "4620": 46343888896.0, - "4625": 46343888896.0, - "4630": 46343888896.0, - "4635": 46343888896.0, - "4640": 46343888896.0, - "4645": 46343888896.0, - "4650": 46343888896.0, - "4655": 46343888896.0, - "4660": 46343888896.0, - "4665": 46343888896.0, - "4670": 46343888896.0, - "4675": 46343888896.0, 
- "4680": 46343888896.0, - "4685": 46343888896.0, - "4690": 46343888896.0, - "4695": 46343888896.0, - "4700": 46343888896.0, - "4705": 46343888896.0, - "4710": 46343888896.0, - "4715": 46343888896.0, - "4720": 46343888896.0, - "4725": 46343888896.0, - "4730": 46343888896.0, - "4735": 46343888896.0, - "4740": 46343888896.0, - "4745": 46343888896.0, - "4750": 46343888896.0, - "4755": 46343888896.0, - "4760": 46343888896.0, - "4765": 46343888896.0, - "4770": 46343888896.0, - "4775": 46343888896.0, - "4780": 46343888896.0, - "4785": 46343888896.0, - "4790": 46343888896.0, - "4795": 46343888896.0, - "4800": 46343888896.0, - "4805": 46343888896.0, - "4810": 46343888896.0, - "4815": 46343888896.0, - "4820": 46343888896.0, - "4825": 46343888896.0, - "4830": 46343888896.0, - "4835": 46343888896.0, - "4840": 46343888896.0, - "4845": 46343888896.0, - "4850": 46343888896.0, - "4855": 46343888896.0, - "4860": 46343888896.0, - "4865": 46343888896.0, - "4870": 46343888896.0, - "4875": 46343888896.0, - "4880": 46343888896.0, - "4885": 46343888896.0, - "4890": 46343888896.0, - "4895": 46343888896.0, - "4900": 46343888896.0, - "4905": 46343888896.0, - "4910": 46343888896.0, - "4915": 46343888896.0, - "4920": 46343888896.0, - "4925": 46343888896.0, - "4930": 46343888896.0, - "4935": 46343888896.0, - "4940": 46343888896.0, - "4945": 46343888896.0, - "4950": 46343888896.0, - "4955": 46343888896.0, - "4960": 46343888896.0, - "4965": 46343888896.0, - "4970": 46343888896.0, - "4975": 46343888896.0, - "4980": 46343888896.0, - "4985": 46343888896.0, - "4990": 46343888896.0, - "4995": 46343888896.0, - "5000": 46343888896.0, - "5005": 46199529472.0, - "5010": 46199529472.0, - "5015": 45764182016.0, - "5020": 45878784000.0, - "5025": 45878784000.0, - "5030": 45878784000.0, - "5035": 45878784000.0, - "5040": 45992685568.0, - "5045": 45992685568.0, - "5050": 45992685568.0, - "5055": 45992685568.0, - "5060": 45992685568.0, - "5065": 45992685568.0, - "5070": 45992685568.0, - "5075": 45992685568.0, 
- "5080": 45992685568.0, - "5085": 45992685568.0, - "5090": 45992685568.0, - "5095": 46014451712.0, - "5100": 46014451712.0, - "5105": 46014451712.0, - "5110": 46014451712.0, - "5115": 46014451712.0, - "5120": 46014451712.0, - "5125": 46014451712.0, - "5130": 46014451712.0, - "5135": 46014451712.0, - "5140": 46014451712.0, - "5145": 46014451712.0, - "5150": 46014451712.0, - "5155": 46014451712.0, - "5160": 46014451712.0, - "5165": 46014451712.0, - "5170": 46014451712.0, - "5175": 46014451712.0, - "5180": 46014451712.0, - "5185": 46014451712.0, - "5190": 46014451712.0, - "5195": 46014451712.0, - "5200": 46139572224.0, - "5205": 46139572224.0, - "5210": 46139572224.0, - "5215": 46139572224.0, - "5220": 46168403968.0, - "5225": 46168403968.0, - "5230": 46168403968.0, - "5235": 46168403968.0, - "5240": 46168403968.0, - "5245": 46168403968.0, - "5250": 46168403968.0, - "5255": 46168403968.0, - "5260": 46168403968.0, - "5265": 46168403968.0, - "5270": 46168403968.0, - "5275": 46168403968.0, - "5280": 46168403968.0, - "5285": 46168403968.0, - "5290": 46168403968.0, - "5295": 46168403968.0, - "5300": 46168403968.0, - "5305": 46168403968.0, - "5310": 46168403968.0, - "5315": 46168403968.0, - "5320": 46168403968.0, - "5325": 46168403968.0, - "5330": 46168403968.0, - "5335": 46168403968.0, - "5340": 46168403968.0, - "5345": 46168403968.0, - "5350": 46168403968.0, - "5355": 46168403968.0, - "5360": 46168403968.0, - "5365": 46168403968.0, - "5370": 46168403968.0, - "5375": 46168403968.0, - "5380": 46168403968.0, - "5385": 46168403968.0, - "5390": 46168403968.0, - "5395": 46168403968.0, - "5400": 46168403968.0, - "5405": 46168403968.0, - "5410": 46168403968.0, - "5415": 46168403968.0, - "5420": 46168403968.0, - "5425": 46168403968.0, - "5430": 46168403968.0, - "5435": 46168403968.0, - "5440": 46168403968.0, - "5445": 46168403968.0, - "5450": 46168403968.0, - "5455": 46168403968.0, - "5460": 46168403968.0, - "5465": 46168403968.0, - "5470": 46168403968.0, - "5475": 46168403968.0, 
- "5480": 46168403968.0, - "5485": 46168403968.0, - "5490": 46168403968.0, - "5495": 46168403968.0, - "5500": 46168403968.0, - "5505": 46168403968.0, - "5510": 46168403968.0, - "5515": 46168403968.0, - "5520": 46168403968.0, - "5525": 46168403968.0, - "5530": 46168403968.0, - "5535": 46168403968.0, - "5540": 46168403968.0, - "5545": 46168403968.0, - "5550": 46168403968.0, - "5555": 46168403968.0, - "5560": 46168403968.0, - "5565": 46168403968.0, - "5570": 46168403968.0, - "5575": 46168403968.0, - "5580": 46168403968.0, - "5585": 46168403968.0, - "5590": 46168403968.0, - "5595": 46168403968.0, - "5600": 46168403968.0, - "5605": 46226247680.0, - "5610": 46226247680.0, - "5615": 46226247680.0, - "5620": 46226247680.0, - "5625": 46226247680.0, - "5630": 46226247680.0, - "5635": 46226247680.0, - "5640": 46226247680.0, - "5645": 46226247680.0, - "5650": 46226247680.0, - "5655": 46226247680.0, - "5660": 46226247680.0, - "5665": 46226247680.0, - "5670": 46226247680.0, - "5675": 46226247680.0, - "5680": 46226247680.0, - "5685": 46226247680.0, - "5690": 46226247680.0, - "5695": 46226247680.0, - "5700": 46226247680.0, - "5705": 46226247680.0, - "5710": 46226247680.0, - "5715": 46226247680.0, - "5720": 46226247680.0, - "5725": 46226247680.0, - "5730": 46226247680.0, - "5735": 46226247680.0, - "5740": 46226247680.0, - "5745": 46226247680.0, - "5750": 46226247680.0, - "5755": 46226247680.0, - "5760": 46226247680.0, - "5765": 46226247680.0, - "5770": 46226247680.0, - "5775": 46226247680.0, - "5780": 46226247680.0, - "5785": 46226247680.0, - "5790": 46226247680.0, - "5795": 46226247680.0, - "5800": 46226247680.0, - "5805": 46226247680.0, - "5810": 46226247680.0, - "5815": 46226247680.0, - "5820": 46226247680.0, - "5825": 46226247680.0, - "5830": 46226247680.0, - "5835": 46226247680.0, - "5840": 46226247680.0, - "5845": 46226247680.0, - "5850": 46226247680.0, - "5855": 46226247680.0, - "5860": 46226247680.0, - "5865": 46226247680.0, - "5870": 46226247680.0, - "5875": 46226247680.0, 
- "5880": 46226247680.0, - "5885": 46226247680.0, - "5890": 46226247680.0, - "5895": 46226247680.0, - "5900": 46226247680.0, - "5905": 46226247680.0, - "5910": 46226247680.0, - "5915": 46226247680.0, - "5920": 46226247680.0, - "5925": 46226247680.0, - "5930": 46226247680.0, - "5935": 46226247680.0, - "5940": 46226247680.0, - "5945": 46226247680.0, - "5950": 46226247680.0, - "5955": 46226247680.0, - "5960": 46226247680.0, - "5965": 46226247680.0, - "5970": 46226247680.0, - "5975": 46226247680.0, - "5980": 46226247680.0, - "5985": 46226247680.0, - "5990": 46226247680.0, - "5995": 46226247680.0, - "6000": 46226247680.0, - "6005": 46226247680.0, - "6010": 46226247680.0, - "6015": 46226247680.0, - "6020": 46226247680.0, - "6025": 46226247680.0, - "6030": 45912186880.0, - "6035": 45912186880.0, - "6040": 45995683840.0, - "6045": 45995683840.0, - "6050": 45995683840.0, - "6055": 45995683840.0, - "6060": 45995683840.0, - "6065": 45995683840.0, - "6070": 45995683840.0, - "6075": 46014836736.0, - "6080": 46014836736.0, - "6085": 46014836736.0, - "6090": 46014836736.0, - "6095": 46014836736.0, - "6100": 46014836736.0, - "6105": 46014836736.0, - "6110": 46014836736.0, - "6115": 46014836736.0, - "6120": 46014836736.0, - "6125": 46014836736.0, - "6130": 46014836736.0, - "6135": 46014836736.0, - "6140": 46014836736.0, - "6145": 46014836736.0, - "6150": 46014836736.0, - "6155": 46014836736.0, - "6160": 46014836736.0, - "6165": 46025334784.0, - "6170": 46025334784.0, - "6175": 46025334784.0, - "6180": 46025334784.0, - "6185": 46035255296.0, - "6190": 46035255296.0, - "6195": 46035255296.0, - "6200": 46035255296.0, - "6205": 46035255296.0, - "6210": 46035255296.0, - "6215": 46035255296.0, - "6220": 46035255296.0, - "6225": 46035255296.0, - "6230": 46035255296.0, - "6235": 46035255296.0, - "6240": 46035255296.0, - "6245": 46035255296.0, - "6250": 46035255296.0, - "6255": 46035255296.0, - "6260": 46035255296.0, - "6265": 46035255296.0, - "6270": 46035255296.0, - "6275": 46035255296.0, 
- "6280": 46035255296.0, - "6285": 46035255296.0, - "6290": 46035255296.0, - "6295": 46035255296.0, - "6300": 46035255296.0, - "6305": 46035255296.0, - "6310": 46035255296.0, - "6315": 46035255296.0, - "6320": 46035255296.0, - "6325": 46035255296.0, - "6330": 46035255296.0, - "6335": 46035255296.0, - "6340": 46035255296.0, - "6345": 46035255296.0, - "6350": 46035255296.0, - "6355": 46035255296.0, - "6360": 46035255296.0, - "6365": 46035255296.0, - "6370": 46035255296.0, - "6375": 46035255296.0, - "6380": 46035255296.0, - "6385": 46035255296.0, - "6390": 46035255296.0, - "6395": 46035255296.0, - "6400": 46035255296.0, - "6405": 46035255296.0, - "6410": 46035255296.0, - "6415": 46035255296.0, - "6420": 46035255296.0, - "6425": 46035255296.0, - "6430": 46035255296.0, - "6435": 46035255296.0, - "6440": 46035255296.0, - "6445": 46035255296.0, - "6450": 46035255296.0, - "6455": 46035255296.0, - "6460": 46035255296.0, - "6465": 46035255296.0, - "6470": 46035255296.0, - "6475": 46035255296.0, - "6480": 46035255296.0, - "6485": 46035255296.0, - "6490": 46035255296.0, - "6495": 46035255296.0, - "6500": 46035255296.0, - "6505": 46064041984.0, - "6510": 46064041984.0, - "6515": 46064041984.0, - "6520": 46064041984.0, - "6525": 46064041984.0, - "6530": 46064041984.0, - "6535": 46064041984.0, - "6540": 46064041984.0, - "6545": 46064041984.0, - "6550": 46064041984.0, - "6555": 46064041984.0, - "6560": 46064041984.0, - "6565": 46064041984.0, - "6570": 46064041984.0, - "6575": 46064041984.0, - "6580": 46064041984.0, - "6585": 46064041984.0, - "6590": 46064041984.0, - "6595": 46064041984.0, - "6600": 46064041984.0, - "6605": 46064041984.0, - "6610": 46064041984.0, - "6615": 46064041984.0, - "6620": 46064041984.0, - "6625": 46064041984.0, - "6630": 46064041984.0, - "6635": 46064041984.0, - "6640": 46064041984.0, - "6645": 46064041984.0, - "6650": 46064041984.0, - "6655": 46064041984.0, - "6660": 46064041984.0, - "6665": 46064041984.0, - "6670": 46064041984.0, - "6675": 46064041984.0, 
- "6680": 46064041984.0, - "6685": 46064041984.0, - "6690": 46064041984.0, - "6695": 46064041984.0, - "6700": 46064041984.0, - "6705": 46064041984.0, - "6710": 46064041984.0, - "6715": 46064041984.0, - "6720": 46064041984.0, - "6725": 46064041984.0, - "6730": 46064041984.0, - "6735": 46064041984.0, - "6740": 46064041984.0, - "6745": 46064041984.0, - "6750": 46064041984.0, - "6755": 46064041984.0, - "6760": 46064041984.0, - "6765": 46064041984.0, - "6770": 46064041984.0, - "6775": 46064041984.0, - "6780": 46064041984.0, - "6785": 46064041984.0, - "6790": 46064041984.0, - "6795": 46064041984.0, - "6800": 46064041984.0, - "6805": 46064041984.0, - "6810": 46064041984.0, - "6815": 46064041984.0, - "6820": 46064041984.0, - "6825": 46064041984.0, - "6830": 46064041984.0, - "6835": 46064041984.0, - "6840": 46064041984.0, - "6845": 46064041984.0, - "6850": 46064041984.0, - "6855": 46064041984.0, - "6860": 46064041984.0, - "6865": 46064041984.0, - "6870": 46064041984.0, - "6875": 46064041984.0, - "6880": 46064041984.0, - "6885": 46064041984.0, - "6890": 46064041984.0, - "6895": 46064041984.0, - "6900": 46064041984.0, - "6905": 46064041984.0, - "6910": 46064041984.0, - "6915": 46064041984.0, - "6920": 46064041984.0, - "6925": 46064041984.0, - "6930": 46064041984.0, - "6935": 46064041984.0, - "6940": 46064041984.0, - "6945": 46064041984.0, - "6950": 46064041984.0, - "6955": 46064041984.0, - "6960": 46064041984.0, - "6965": 46064041984.0, - "6970": 46064041984.0, - "6975": 46064041984.0, - "6980": 46064041984.0, - "6985": 46064041984.0, - "6990": 46064041984.0, - "6995": 46064041984.0, - "7000": 46064041984.0, - "7005": 46064041984.0, - "7010": 46064041984.0, - "7015": 46064041984.0, - "7020": 46064041984.0, - "7025": 46064041984.0, - "7030": 46108979200.0, - "7035": 46108979200.0, - "7040": 46108979200.0, - "7045": 46108979200.0, - "7050": 46065532928.0, - "7055": 46065532928.0, - "7060": 46065532928.0, - "7065": 46065532928.0, - "7070": 46065532928.0, - "7075": 46065532928.0, 
- "7080": 46065532928.0, - "7085": 46065532928.0, - "7090": 46065532928.0, - "7095": 46065532928.0, - "7100": 46065532928.0, - "7105": 46065532928.0, - "7110": 46065532928.0, - "7115": 46065532928.0, - "7120": 46065532928.0, - "7125": 46065532928.0, - "7130": 46065532928.0, - "7135": 46065532928.0, - "7140": 46065532928.0, - "7145": 46065532928.0, - "7150": 46065532928.0, - "7155": 46065532928.0, - "7160": 46065532928.0, - "7165": 46065532928.0, - "7170": 46065532928.0, - "7175": 46065532928.0, - "7180": 46065532928.0, - "7185": 46065532928.0, - "7190": 46065532928.0, - "7195": 46065532928.0, - "7200": 46065532928.0, - "7205": 46065532928.0, - "7210": 46065532928.0, - "7215": 46065532928.0, - "7220": 46065532928.0, - "7225": 46065532928.0, - "7230": 46065532928.0, - "7235": 46065532928.0, - "7240": 46065532928.0, - "7245": 46065532928.0, - "7250": 46065532928.0, - "7255": 46065532928.0, - "7260": 46065532928.0, - "7265": 46065532928.0, - "7270": 46065532928.0, - "7275": 46065532928.0, - "7280": 46065532928.0, - "7285": 46065532928.0, - "7290": 46065532928.0, - "7295": 46065532928.0, - "7300": 46065532928.0, - "7305": 46065532928.0, - "7310": 46065532928.0, - "7315": 46065532928.0, - "7320": 46065532928.0, - "7325": 46065532928.0, - "7330": 46065532928.0, - "7335": 46065532928.0, - "7340": 46065532928.0, - "7345": 46065532928.0, - "7350": 46065532928.0, - "7355": 46065532928.0, - "7360": 46065532928.0, - "7365": 46065532928.0, - "7370": 46065532928.0, - "7375": 46065532928.0, - "7380": 46065532928.0, - "7385": 46065532928.0, - "7390": 46065532928.0, - "7395": 46065532928.0, - "7400": 46065532928.0, - "7405": 46065532928.0, - "7410": 46065532928.0, - "7415": 46065532928.0, - "7420": 46065532928.0, - "7425": 46065532928.0, - "7430": 46065532928.0, - "7435": 46065532928.0, - "7440": 46065532928.0, - "7445": 46065532928.0, - "7450": 46065532928.0, - "7455": 46065532928.0, - "7460": 46065532928.0, - "7465": 46065532928.0, - "7470": 46065532928.0, - "7475": 46065532928.0, 
- "7480": 46065532928.0, - "7485": 46065532928.0, - "7490": 46065532928.0, - "7495": 46065532928.0, - "7500": 46065532928.0, - "7505": 46065532928.0, - "7510": 46065532928.0, - "7515": 46065532928.0, - "7520": 45618061312.0, - "7525": 45747933184.0, - "7530": 45825024000.0, - "7535": 45825024000.0, - "7540": 45825024000.0, - "7545": 45910597632.0, - "7550": 45910597632.0, - "7555": 45910597632.0, - "7560": 45910597632.0, - "7565": 45910597632.0, - "7570": 45910597632.0, - "7575": 45910597632.0, - "7580": 45910597632.0, - "7585": 45910597632.0, - "7590": 45910597632.0, - "7595": 45916950528.0, - "7600": 45924253696.0, - "7605": 45924253696.0, - "7610": 45924253696.0, - "7615": 45924253696.0, - "7620": 45924253696.0, - "7625": 45924253696.0, - "7630": 45924253696.0, - "7635": 45924253696.0, - "7640": 45924253696.0, - "7645": 45944950784.0, - "7650": 45944950784.0, - "7655": 45944950784.0, - "7660": 45944950784.0, - "7665": 45944950784.0, - "7670": 45944950784.0, - "7675": 45944950784.0, - "7680": 45944950784.0, - "7685": 45944950784.0, - "7690": 45944950784.0, - "7695": 45944950784.0, - "7700": 45944950784.0, - "7705": 45944950784.0, - "7710": 45944950784.0, - "7715": 45944950784.0, - "7720": 45944950784.0, - "7725": 45944950784.0, - "7730": 45944950784.0, - "7735": 45944950784.0, - "7740": 45944950784.0, - "7745": 45944950784.0, - "7750": 45944950784.0, - "7755": 45944950784.0, - "7760": 45944950784.0, - "7765": 45944950784.0, - "7770": 45944950784.0, - "7775": 45944950784.0, - "7780": 45944950784.0, - "7785": 45944950784.0, - "7790": 45944950784.0, - "7795": 45944950784.0, - "7800": 45944950784.0, - "7805": 45944950784.0, - "7810": 45944950784.0, - "7815": 45944950784.0, - "7820": 45944950784.0, - "7825": 45944950784.0, - "7830": 45944950784.0, - "7835": 45944950784.0, - "7840": 45973135360.0, - "7845": 45973135360.0, - "7850": 46089904128.0, - "7855": 46089904128.0, - "7860": 46089904128.0, - "7865": 46089904128.0, - "7870": 46089904128.0, - "7875": 46089904128.0, 
- "7880": 46089904128.0, - "7885": 46089904128.0, - "7890": 46089904128.0, - "7895": 46089904128.0, - "7900": 46089904128.0, - "7905": 46089904128.0, - "7910": 46089904128.0, - "7915": 46089904128.0, - "7920": 46089904128.0, - "7925": 46089904128.0, - "7930": 46089904128.0, - "7935": 46089904128.0, - "7940": 46089904128.0, - "7945": 46089904128.0, - "7950": 46089904128.0, - "7955": 46089904128.0, - "7960": 46089904128.0, - "7965": 46089904128.0, - "7970": 46089904128.0, - "7975": 46089904128.0, - "7980": 46089904128.0, - "7985": 46089904128.0, - "7990": 46089904128.0, - "7995": 46089904128.0, - "8000": 46089904128.0, - "8005": 46089904128.0, - "8010": 46089904128.0, - "8015": 46089904128.0, - "8020": 46089904128.0, - "8025": 46089904128.0, - "8030": 46089904128.0, - "8035": 46089904128.0, - "8040": 46089904128.0, - "8045": 46089904128.0, - "8050": 46089904128.0, - "8055": 46089904128.0, - "8060": 46089904128.0, - "8065": 46089904128.0, - "8070": 46089904128.0, - "8075": 46089904128.0, - "8080": 46089904128.0, - "8085": 46089904128.0, - "8090": 46089904128.0, - "8095": 46089904128.0, - "8100": 46089904128.0, - "8105": 46089904128.0, - "8110": 46089904128.0, - "8115": 46089904128.0, - "8120": 46089904128.0, - "8125": 46089904128.0, - "8130": 46089904128.0, - "8135": 46089904128.0, - "8140": 46089904128.0, - "8145": 46089904128.0, - "8150": 46089904128.0, - "8155": 46089904128.0, - "8160": 46089904128.0, - "8165": 46089904128.0, - "8170": 46089904128.0, - "8175": 46089904128.0, - "8180": 46089904128.0, - "8185": 46089904128.0, - "8190": 46089904128.0, - "8195": 46089904128.0, - "8200": 46089904128.0, - "8205": 46089904128.0, - "8210": 46089904128.0, - "8215": 46089904128.0, - "8220": 46089904128.0, - "8225": 46089904128.0, - "8230": 46089904128.0, - "8235": 46089904128.0, - "8240": 46089904128.0, - "8245": 46089904128.0, - "8250": 46089904128.0, - "8255": 46089904128.0, - "8260": 46089904128.0, - "8265": 46089904128.0, - "8270": 46089904128.0, - "8275": 46089904128.0, 
- "8280": 46089904128.0, - "8285": 46089904128.0, - "8290": 46089904128.0, - "8295": 46089904128.0, - "8300": 46089904128.0, - "8305": 46089904128.0, - "8310": 46089904128.0, - "8315": 46089904128.0, - "8320": 46089904128.0, - "8325": 46089904128.0, - "8330": 46089904128.0, - "8335": 46089904128.0, - "8340": 46089904128.0, - "8345": 46089904128.0, - "8350": 46089904128.0, - "8355": 46089904128.0, - "8360": 46089904128.0, - "8365": 46089904128.0, - "8370": 46089904128.0, - "8375": 46089904128.0, - "8380": 46089904128.0, - "8385": 46089904128.0, - "8390": 46089904128.0, - "8395": 46089904128.0, - "8400": 46089904128.0, - "8405": 46089904128.0, - "8410": 46089904128.0, - "8415": 46089904128.0, - "8420": 46089904128.0, - "8425": 46089904128.0, - "8430": 46089904128.0, - "8435": 46089904128.0, - "8440": 46089904128.0, - "8445": 46089904128.0, - "8450": 46089904128.0, - "8455": 46089904128.0, - "8460": 46089904128.0, - "8465": 46089904128.0, - "8470": 46089904128.0, - "8475": 46089904128.0, - "8480": 46089904128.0, - "8485": 46089904128.0, - "8490": 46089904128.0, - "8495": 46089904128.0, - "8500": 46089904128.0, - "8505": 46089904128.0, - "8510": 46089904128.0, - "8515": 46089904128.0, - "8520": 46089904128.0, - "8525": 46089904128.0, - "8530": 45938114560.0, - "8535": 45938114560.0, - "8540": 45938114560.0, - "8545": 45938114560.0, - "8550": 45938114560.0, - "8555": 45938114560.0, - "8560": 45938114560.0, - "8565": 45938114560.0, - "8570": 45938114560.0, - "8575": 45938114560.0, - "8580": 45938114560.0, - "8585": 45938114560.0, - "8590": 45950377984.0, - "8595": 45950377984.0, - "8600": 45950377984.0, - "8605": 45950377984.0, - "8610": 45950377984.0, - "8615": 45950377984.0, - "8620": 45950377984.0, - "8625": 45950377984.0, - "8630": 45950377984.0, - "8635": 45950377984.0, - "8640": 45950377984.0, - "8645": 45950377984.0, - "8650": 45950377984.0, - "8655": 45950377984.0, - "8660": 45950377984.0, - "8665": 45950377984.0, - "8670": 45955510272.0, - "8675": 45955510272.0, 
- "8680": 45955510272.0, - "8685": 45955510272.0, - "8690": 45991550976.0, - "8695": 45991550976.0, - "8700": 45991550976.0, - "8705": 45991550976.0, - "8710": 45991550976.0, - "8715": 45991550976.0, - "8720": 45991550976.0, - "8725": 45991550976.0, - "8730": 45991550976.0, - "8735": 45991550976.0, - "8740": 46068584448.0, - "8745": 46068584448.0, - "8750": 46068584448.0, - "8755": 46068584448.0, - "8760": 46068584448.0, - "8765": 46068584448.0, - "8770": 46068584448.0, - "8775": 46068584448.0, - "8780": 46068584448.0, - "8785": 46068584448.0, - "8790": 46068584448.0, - "8795": 46068584448.0, - "8800": 46068584448.0, - "8805": 46068584448.0, - "8810": 46068584448.0, - "8815": 46068584448.0, - "8820": 46068584448.0, - "8825": 46068584448.0, - "8830": 46068584448.0, - "8835": 46068584448.0, - "8840": 46068584448.0, - "8845": 46068584448.0, - "8850": 46068584448.0, - "8855": 46184767488.0, - "8860": 46184767488.0, - "8865": 46184767488.0, - "8870": 46184767488.0, - "8875": 46184767488.0, - "8880": 46184767488.0, - "8885": 46184767488.0, - "8890": 46184767488.0, - "8895": 46184767488.0, - "8900": 46184767488.0, - "8905": 46184767488.0, - "8910": 46184767488.0, - "8915": 46184767488.0, - "8920": 46184767488.0, - "8925": 46184767488.0, - "8930": 46184767488.0, - "8935": 46184767488.0, - "8940": 46184767488.0, - "8945": 46184767488.0, - "8950": 46184767488.0, - "8955": 46184767488.0, - "8960": 46184767488.0, - "8965": 46184767488.0, - "8970": 46184767488.0, - "8975": 46184767488.0, - "8980": 46184767488.0, - "8985": 46184767488.0, - "8990": 46184767488.0, - "8995": 46184767488.0, - "9000": 46184767488.0, - "9005": 46184767488.0, - "9010": 46184767488.0, - "9015": 46184767488.0, - "9020": 46184767488.0, - "9025": 46184767488.0, - "9030": 46184767488.0, - "9035": 46184767488.0, - "9040": 46184767488.0, - "9045": 46184767488.0, - "9050": 46184767488.0, - "9055": 46184767488.0, - "9060": 46184767488.0, - "9065": 46184767488.0, - "9070": 46184767488.0, - "9075": 46184767488.0, 
- "9080": 46184767488.0, - "9085": 46184767488.0, - "9090": 46184767488.0, - "9095": 46184767488.0, - "9100": 46184767488.0, - "9105": 46184767488.0, - "9110": 46184767488.0, - "9115": 46184767488.0, - "9120": 46184767488.0, - "9125": 46184767488.0, - "9130": 46184767488.0, - "9135": 46184767488.0, - "9140": 46184767488.0, - "9145": 46184767488.0, - "9150": 46184767488.0, - "9155": 46184767488.0, - "9160": 46184767488.0, - "9165": 46184767488.0, - "9170": 46184767488.0, - "9175": 46184767488.0, - "9180": 46184767488.0, - "9185": 46184767488.0, - "9190": 46184767488.0, - "9195": 46184767488.0, - "9200": 46184767488.0, - "9205": 46184767488.0, - "9210": 46184767488.0, - "9215": 46184767488.0, - "9220": 46184767488.0, - "9225": 46184767488.0, - "9230": 46184767488.0, - "9235": 46184767488.0, - "9240": 46184767488.0, - "9245": 46184767488.0, - "9250": 46184767488.0, - "9255": 46184767488.0, - "9260": 46184767488.0, - "9265": 46184767488.0, - "9270": 46184767488.0, - "9275": 46184767488.0, - "9280": 46184767488.0, - "9285": 46184767488.0, - "9290": 46184767488.0, - "9295": 46184767488.0, - "9300": 46184767488.0, - "9305": 46184767488.0, - "9310": 46184767488.0, - "9315": 46184767488.0, - "9320": 46184767488.0, - "9325": 46184767488.0, - "9330": 46184767488.0, - "9335": 46184767488.0, - "9340": 46184767488.0, - "9345": 46184767488.0, - "9350": 46184767488.0, - "9355": 46184767488.0, - "9360": 46184767488.0, - "9365": 46184767488.0, - "9370": 46184767488.0, - "9375": 46184767488.0, - "9380": 46184767488.0, - "9385": 46184767488.0, - "9390": 46184767488.0, - "9395": 46184767488.0, - "9400": 46184767488.0, - "9405": 46184767488.0, - "9410": 46184767488.0, - "9415": 46184767488.0, - "9420": 46184767488.0, - "9425": 46184767488.0, - "9430": 46184767488.0, - "9435": 46184767488.0, - "9440": 46184767488.0, - "9445": 46184767488.0, - "9450": 46184767488.0, - "9455": 46184767488.0, - "9460": 46184767488.0, - "9465": 46184767488.0, - "9470": 46184767488.0, - "9475": 46184767488.0, 
- "9480": 46184767488.0, - "9485": 46184767488.0, - "9490": 46184767488.0, - "9495": 46184767488.0, - "9500": 46184767488.0, - "9505": 46184767488.0, - "9510": 46184767488.0, - "9515": 46184767488.0, - "9520": 46184767488.0, - "9525": 46184767488.0, - "9530": 46184767488.0, - "9535": 46184767488.0 - } - }, - "mtp_1 loss": { - "start_step": 1, - "end_step": 9535, - "step_interval": 5, - "values": { - "1": 13.88878, - "5": 13.88979, - "10": 13.88767, - "15": 13.88576, - "20": 13.88068, - "25": 13.87774, - "30": 13.85566, - "35": 13.84855, - "40": 13.84546, - "45": 13.82693, - "50": 13.74828, - "55": 13.7249, - "60": 13.70841, - "65": 13.67571, - "70": 13.63981, - "75": 13.44327, - "80": 13.36054, - "85": 13.2835, - "90": 13.18641, - "95": 13.0505, - "100": 12.90733, - "105": 12.74689, - "110": 12.48525, - "115": 12.26801, - "120": 12.04358, - "125": 11.87011, - "130": 11.74911, - "135": 11.5841, - "140": 11.3494, - "145": 11.26997, - "150": 11.11919, - "155": 11.0211, - "160": 10.88133, - "165": 10.75162, - "170": 10.65694, - "175": 10.59566, - "180": 10.43546, - "185": 10.42441, - "190": 10.27183, - "195": 10.2539, - "200": 10.12718, - "205": 9.97472, - "210": 9.94271, - "215": 9.92122, - "220": 9.78944, - "225": 9.77014, - "230": 9.73, - "235": 9.64372, - "240": 9.57366, - "245": 9.50499, - "250": 9.43776, - "255": 9.37037, - "260": 9.29579, - "265": 9.2411, - "270": 9.15629, - "275": 9.12851, - "280": 9.10516, - "285": 9.09815, - "290": 9.01068, - "295": 8.94828, - "300": 8.83207, - "305": 8.80663, - "310": 8.74389, - "315": 8.71813, - "320": 8.68425, - "325": 8.58706, - "330": 8.56208, - "335": 8.53307, - "340": 8.52937, - "345": 8.41091, - "350": 8.39973, - "355": 8.29759, - "360": 8.38348, - "365": 8.28981, - "370": 8.2833, - "375": 8.22588, - "380": 8.18359, - "385": 8.16998, - "390": 8.1467, - "395": 8.09789, - "400": 8.01583, - "405": 8.01349, - "410": 8.00377, - "415": 7.95012, - "420": 7.93109, - "425": 7.88677, - "430": 7.81895, - "435": 7.82989, - "440": 
7.77278, - "445": 7.7493, - "450": 7.67877, - "455": 7.7063, - "460": 7.6532, - "465": 7.6329, - "470": 7.59885, - "475": 7.61277, - "480": 7.48436, - "485": 7.53153, - "490": 7.48574, - "495": 7.4714, - "500": 7.41282, - "505": 7.41932, - "510": 7.38698, - "515": 7.35645, - "520": 7.35102, - "525": 7.32559, - "530": 7.32588, - "535": 7.30357, - "540": 7.2179, - "545": 7.24022, - "550": 7.27618, - "555": 7.30238, - "560": 7.23984, - "565": 7.16321, - "570": 7.17228, - "575": 7.18898, - "580": 7.11497, - "585": 7.11901, - "590": 7.06121, - "595": 7.04317, - "600": 7.06682, - "605": 7.06137, - "610": 7.01939, - "615": 7.078, - "620": 6.98113, - "625": 6.95612, - "630": 6.96104, - "635": 6.98871, - "640": 6.96819, - "645": 6.95817, - "650": 7.00625, - "655": 7.00242, - "660": 6.89823, - "665": 6.88159, - "670": 6.84888, - "675": 6.93827, - "680": 6.89638, - "685": 6.85679, - "690": 6.83445, - "695": 6.79719, - "700": 6.79183, - "705": 6.78625, - "710": 6.82275, - "715": 6.82665, - "720": 6.71137, - "725": 6.76643, - "730": 6.75579, - "735": 6.75515, - "740": 6.70045, - "745": 6.67565, - "750": 6.73564, - "755": 6.65767, - "760": 6.66496, - "765": 6.65951, - "770": 6.68075, - "775": 6.65453, - "780": 6.62427, - "785": 6.64321, - "790": 6.59399, - "795": 6.59812, - "800": 6.5878, - "805": 6.65391, - "810": 6.51946, - "815": 6.5419, - "820": 6.55134, - "825": 6.55855, - "830": 6.57041, - "835": 6.52603, - "840": 6.49033, - "845": 6.54438, - "850": 6.49874, - "855": 6.49335, - "860": 6.49024, - "865": 6.49642, - "870": 6.46222, - "875": 6.51054, - "880": 6.4748, - "885": 6.43786, - "890": 6.51246, - "895": 6.39629, - "900": 6.41895, - "905": 6.44341, - "910": 6.40617, - "915": 6.38978, - "920": 6.38772, - "925": 6.37391, - "930": 6.40825, - "935": 6.39755, - "940": 6.34172, - "945": 6.36869, - "950": 6.3953, - "955": 6.34893, - "960": 6.35406, - "965": 6.25416, - "970": 6.32381, - "975": 6.31262, - "980": 6.28797, - "985": 6.29222, - "990": 6.34527, - "995": 6.26326, - 
"1000": 6.28434, - "1005": 6.23155, - "1010": 6.26712, - "1015": 6.29352, - "1020": 6.20454, - "1025": 6.21082, - "1030": 6.20913, - "1035": 6.29924, - "1040": 6.22531, - "1045": 6.19943, - "1050": 6.2267, - "1055": 6.21777, - "1060": 6.1673, - "1065": 6.15758, - "1070": 6.19281, - "1075": 6.19093, - "1080": 6.19319, - "1085": 6.19606, - "1090": 6.17796, - "1095": 6.181, - "1100": 6.1397, - "1105": 6.11513, - "1110": 6.17787, - "1115": 6.11231, - "1120": 6.05286, - "1125": 6.08699, - "1130": 6.14167, - "1135": 6.09531, - "1140": 6.08221, - "1145": 6.06731, - "1150": 6.09458, - "1155": 6.06298, - "1160": 6.04607, - "1165": 6.09676, - "1170": 6.07336, - "1175": 6.04568, - "1180": 6.05058, - "1185": 6.04124, - "1190": 6.04961, - "1195": 6.02949, - "1200": 5.97329, - "1205": 6.07601, - "1210": 5.93751, - "1215": 5.98403, - "1220": 6.06306, - "1225": 5.95152, - "1230": 5.99877, - "1235": 5.95912, - "1240": 5.99322, - "1245": 5.97187, - "1250": 5.95299, - "1255": 5.94742, - "1260": 5.95227, - "1265": 5.93352, - "1270": 5.90818, - "1275": 5.96805, - "1280": 5.90416, - "1285": 5.92308, - "1290": 5.90725, - "1295": 5.92, - "1300": 5.9267, - "1305": 5.90057, - "1310": 5.83908, - "1315": 5.8992, - "1320": 5.89614, - "1325": 5.8271, - "1330": 5.88462, - "1335": 5.8531, - "1340": 5.91994, - "1345": 5.86667, - "1350": 5.84738, - "1355": 5.84415, - "1360": 5.85216, - "1365": 5.84478, - "1370": 5.79663, - "1375": 5.80667, - "1380": 5.86219, - "1385": 5.81826, - "1390": 5.81231, - "1395": 5.8299, - "1400": 5.83135, - "1405": 5.82032, - "1410": 5.78518, - "1415": 5.77017, - "1420": 5.8049, - "1425": 5.79565, - "1430": 5.83189, - "1435": 5.74562, - "1440": 5.76408, - "1445": 5.8071, - "1450": 5.78859, - "1455": 5.80534, - "1460": 5.75975, - "1465": 5.76379, - "1470": 5.8044, - "1475": 5.76985, - "1480": 5.77563, - "1485": 5.72396, - "1490": 5.72354, - "1495": 5.74538, - "1500": 5.75109, - "1505": 5.72321, - "1510": 5.74832, - "1515": 5.67052, - "1520": 5.70302, - "1525": 5.67385, - 
"1530": 5.69497, - "1535": 5.68565, - "1540": 5.672, - "1545": 5.7178, - "1550": 5.72274, - "1555": 5.70942, - "1560": 5.65211, - "1565": 5.69926, - "1570": 5.71179, - "1575": 5.6613, - "1580": 5.69275, - "1585": 5.67221, - "1590": 5.66087, - "1595": 5.63673, - "1600": 5.70849, - "1605": 5.64113, - "1610": 5.64353, - "1615": 5.63334, - "1620": 5.65496, - "1625": 5.64982, - "1630": 5.62727, - "1635": 5.67706, - "1640": 5.62761, - "1645": 5.6449, - "1650": 5.63803, - "1655": 5.62499, - "1660": 5.61278, - "1665": 5.60116, - "1670": 5.61214, - "1675": 5.62193, - "1680": 5.56155, - "1685": 5.57098, - "1690": 5.55098, - "1695": 5.55521, - "1700": 5.60178, - "1705": 5.57706, - "1710": 5.58407, - "1715": 5.54721, - "1720": 5.52704, - "1725": 5.56718, - "1730": 5.53148, - "1735": 5.58307, - "1740": 5.52337, - "1745": 5.55772, - "1750": 5.53213, - "1755": 5.5301, - "1760": 5.55304, - "1765": 5.5132, - "1770": 5.522, - "1775": 5.52704, - "1780": 5.53997, - "1785": 5.48896, - "1790": 5.52187, - "1795": 5.52448, - "1800": 5.4698, - "1805": 5.46326, - "1810": 5.47869, - "1815": 5.48464, - "1820": 5.48466, - "1825": 5.48352, - "1830": 5.46909, - "1835": 5.46355, - "1840": 5.46633, - "1845": 5.44723, - "1850": 5.42996, - "1855": 5.4834, - "1860": 5.43502, - "1865": 5.44302, - "1870": 5.43258, - "1875": 5.42823, - "1880": 5.491, - "1885": 5.45039, - "1890": 5.44132, - "1895": 5.38084, - "1900": 5.42123, - "1905": 5.41299, - "1910": 5.43539, - "1915": 5.4013, - "1920": 5.37729, - "1925": 5.4085, - "1930": 5.37579, - "1935": 5.39731, - "1940": 5.3727, - "1945": 5.4174, - "1950": 5.45899, - "1955": 5.39197, - "1960": 5.39342, - "1965": 5.34213, - "1970": 5.34023, - "1975": 5.40413, - "1980": 5.35398, - "1985": 5.37376, - "1990": 5.39658, - "1995": 5.37398, - "2000": 5.38469, - "2005": 5.42838, - "2010": 5.32884, - "2015": 5.32047, - "2020": 5.32991, - "2025": 5.37403, - "2030": 5.31228, - "2035": 5.33119, - "2040": 5.29466, - "2045": 5.38332, - "2050": 5.35716, - "2055": 5.33062, - 
"2060": 5.32903, - "2065": 5.29751, - "2070": 5.29985, - "2075": 5.32708, - "2080": 5.29709, - "2085": 5.32918, - "2090": 5.24905, - "2095": 5.29587, - "2100": 5.25777, - "2105": 5.28625, - "2110": 5.28042, - "2115": 5.28102, - "2120": 5.2839, - "2125": 5.24699, - "2130": 5.25602, - "2135": 5.25599, - "2140": 5.26607, - "2145": 5.22772, - "2150": 5.24774, - "2155": 5.22588, - "2160": 5.24123, - "2165": 5.22937, - "2170": 5.26626, - "2175": 5.2603, - "2180": 5.24294, - "2185": 5.24675, - "2190": 5.22691, - "2195": 5.20127, - "2200": 5.20409, - "2205": 5.2127, - "2210": 5.25738, - "2215": 5.30103, - "2220": 5.24446, - "2225": 5.2194, - "2230": 5.21789, - "2235": 5.25766, - "2240": 5.16329, - "2245": 5.1607, - "2250": 5.18607, - "2255": 5.19635, - "2260": 5.13701, - "2265": 5.21276, - "2270": 5.14278, - "2275": 5.19722, - "2280": 5.17159, - "2285": 5.18798, - "2290": 5.17456, - "2295": 5.18141, - "2300": 5.17912, - "2305": 5.15551, - "2310": 5.1834, - "2315": 5.12144, - "2320": 5.17039, - "2325": 5.14984, - "2330": 5.15156, - "2335": 5.13195, - "2340": 5.13852, - "2345": 5.18732, - "2350": 5.12945, - "2355": 5.11891, - "2360": 5.10445, - "2365": 5.11898, - "2370": 5.10258, - "2375": 5.11122, - "2380": 5.05395, - "2385": 5.09747, - "2390": 5.11702, - "2395": 5.1322, - "2400": 5.07944, - "2405": 5.06236, - "2410": 5.11554, - "2415": 5.09106, - "2420": 5.10878, - "2425": 5.06863, - "2430": 5.09273, - "2435": 5.08666, - "2440": 5.07515, - "2445": 5.08608, - "2450": 5.04943, - "2455": 5.09523, - "2460": 5.04536, - "2465": 5.08334, - "2470": 5.07644, - "2475": 5.11246, - "2480": 5.02872, - "2485": 5.05906, - "2490": 5.05297, - "2495": 5.04377, - "2500": 5.04447, - "2505": 5.05124, - "2510": 5.0909, - "2515": 5.08005, - "2520": 5.02414, - "2525": 5.03617, - "2530": 5.05281, - "2535": 5.04127, - "2540": 5.04342, - "2545": 5.05498, - "2550": 4.99288, - "2555": 5.05988, - "2560": 5.03403, - "2565": 5.00279, - "2570": 5.02524, - "2575": 4.98811, - "2580": 5.00235, - "2585": 
4.98259, - "2590": 5.00195, - "2595": 4.95577, - "2600": 4.99616, - "2605": 5.01565, - "2610": 5.00846, - "2615": 4.9779, - "2620": 4.96, - "2625": 4.99167, - "2630": 4.92069, - "2635": 5.00179, - "2640": 5.00217, - "2645": 4.95857, - "2650": 4.98056, - "2655": 4.97276, - "2660": 4.91658, - "2665": 5.00931, - "2670": 4.95271, - "2675": 4.92627, - "2680": 4.95939, - "2685": 4.9606, - "2690": 4.92299, - "2695": 4.99925, - "2700": 4.90798, - "2705": 4.92161, - "2710": 4.9625, - "2715": 4.94083, - "2720": 4.97062, - "2725": 4.91977, - "2730": 4.9445, - "2735": 4.9369, - "2740": 4.92939, - "2745": 4.89678, - "2750": 4.93832, - "2755": 4.94144, - "2760": 4.94244, - "2765": 4.91315, - "2770": 4.95527, - "2775": 4.90029, - "2780": 4.93753, - "2785": 4.91159, - "2790": 4.93952, - "2795": 4.89812, - "2800": 4.84327, - "2805": 4.89103, - "2810": 4.88284, - "2815": 4.89434, - "2820": 4.93504, - "2825": 4.92479, - "2830": 4.90086, - "2835": 4.90451, - "2840": 4.89553, - "2845": 4.87238, - "2850": 4.90777, - "2855": 4.83628, - "2860": 4.89239, - "2865": 4.90134, - "2870": 4.89048, - "2875": 4.90822, - "2880": 4.82774, - "2885": 4.8758, - "2890": 4.84909, - "2895": 4.88906, - "2900": 4.84436, - "2905": 4.85096, - "2910": 4.84745, - "2915": 4.89554, - "2920": 4.87192, - "2925": 4.84408, - "2930": 4.83304, - "2935": 4.83856, - "2940": 4.8364, - "2945": 4.80087, - "2950": 4.79094, - "2955": 4.79257, - "2960": 4.81394, - "2965": 4.82244, - "2970": 4.83033, - "2975": 4.843, - "2980": 4.78708, - "2985": 4.83546, - "2990": 4.84632, - "2995": 4.79479, - "3000": 4.79957, - "3005": 4.7852, - "3010": 4.81747, - "3015": 4.77707, - "3020": 4.79613, - "3025": 4.80689, - "3030": 4.81521, - "3035": 4.81107, - "3040": 4.83014, - "3045": 4.81253, - "3050": 4.78854, - "3055": 4.79109, - "3060": 4.77291, - "3065": 4.80026, - "3070": 4.82011, - "3075": 4.75177, - "3080": 4.78059, - "3085": 4.7825, - "3090": 4.76596, - "3095": 4.80833, - "3100": 4.79656, - "3105": 4.77177, - "3110": 4.76085, - "3115": 
4.71609, - "3120": 4.78235, - "3125": 4.74714, - "3130": 4.75497, - "3135": 4.75435, - "3140": 4.7318, - "3145": 4.71606, - "3150": 4.74842, - "3155": 4.78313, - "3160": 4.765, - "3165": 4.75911, - "3170": 4.7541, - "3175": 4.746, - "3180": 4.73371, - "3185": 4.70655, - "3190": 4.70906, - "3195": 4.70876, - "3200": 4.67795, - "3205": 4.72527, - "3210": 4.67973, - "3215": 4.71138, - "3220": 4.67941, - "3225": 4.71501, - "3230": 4.698, - "3235": 4.73415, - "3240": 4.68214, - "3245": 4.6954, - "3250": 4.64543, - "3255": 4.69551, - "3260": 4.67926, - "3265": 4.72582, - "3270": 4.70744, - "3275": 4.65457, - "3280": 4.68021, - "3285": 4.69583, - "3290": 4.66845, - "3295": 4.67202, - "3300": 4.66858, - "3305": 4.67172, - "3310": 4.66314, - "3315": 4.70829, - "3320": 4.64885, - "3325": 4.65812, - "3330": 4.64245, - "3335": 4.65293, - "3340": 4.62608, - "3345": 4.64548, - "3350": 4.65071, - "3355": 4.65765, - "3360": 4.64823, - "3365": 4.66194, - "3370": 4.63984, - "3375": 4.67722, - "3380": 4.61449, - "3385": 4.62869, - "3390": 4.60608, - "3395": 4.6967, - "3400": 4.64188, - "3405": 4.6721, - "3410": 4.60581, - "3415": 4.55337, - "3420": 4.61467, - "3425": 4.63228, - "3430": 4.66874, - "3435": 4.63419, - "3440": 4.65338, - "3445": 4.60093, - "3450": 4.59889, - "3455": 4.62429, - "3460": 4.58089, - "3465": 4.57689, - "3470": 4.59454, - "3475": 4.60079, - "3480": 4.59374, - "3485": 4.62356, - "3490": 4.60917, - "3495": 4.63221, - "3500": 4.59027, - "3505": 4.59844, - "3510": 4.59797, - "3515": 4.648, - "3520": 4.62554, - "3525": 4.57245, - "3530": 4.58587, - "3535": 4.58174, - "3540": 4.63653, - "3545": 4.56212, - "3550": 4.62056, - "3555": 4.55332, - "3560": 4.62414, - "3565": 4.55473, - "3570": 4.56696, - "3575": 4.53468, - "3580": 4.59878, - "3585": 4.58068, - "3590": 4.51872, - "3595": 4.58848, - "3600": 4.55395, - "3605": 4.53571, - "3610": 4.54008, - "3615": 4.56874, - "3620": 4.61691, - "3625": 4.55023, - "3630": 4.59867, - "3635": 4.50879, - "3640": 4.52782, - 
"3645": 4.56947, - "3650": 4.53552, - "3655": 4.54665, - "3660": 4.55228, - "3665": 4.58643, - "3670": 4.54047, - "3675": 4.55594, - "3680": 4.57348, - "3685": 4.49418, - "3690": 4.54299, - "3695": 4.49297, - "3700": 4.52866, - "3705": 4.50654, - "3710": 4.51966, - "3715": 4.53, - "3720": 4.50118, - "3725": 4.47886, - "3730": 4.4879, - "3735": 4.50546, - "3740": 4.49399, - "3745": 4.48041, - "3750": 4.51288, - "3755": 4.48915, - "3760": 4.50004, - "3765": 4.47669, - "3770": 4.48984, - "3775": 4.46969, - "3780": 4.45476, - "3785": 4.50898, - "3790": 4.42336, - "3795": 4.4846, - "3800": 4.46028, - "3805": 4.46023, - "3810": 4.42629, - "3815": 4.4806, - "3820": 4.4736, - "3825": 4.4803, - "3830": 4.46747, - "3835": 4.42638, - "3840": 4.52349, - "3845": 4.48225, - "3850": 4.42266, - "3855": 4.46223, - "3860": 4.48001, - "3865": 4.44144, - "3870": 4.50523, - "3875": 4.41439, - "3880": 4.42672, - "3885": 4.44983, - "3890": 4.43819, - "3895": 4.38007, - "3900": 4.43434, - "3905": 4.41283, - "3910": 4.42081, - "3915": 4.42082, - "3920": 4.41329, - "3925": 4.39336, - "3930": 4.41243, - "3935": 4.41903, - "3940": 4.41848, - "3945": 4.39397, - "3950": 4.46098, - "3955": 4.39087, - "3960": 4.43851, - "3965": 4.44901, - "3970": 4.39272, - "3975": 4.40242, - "3980": 4.37236, - "3985": 4.40832, - "3990": 4.40208, - "3995": 4.44335, - "4000": 4.38322, - "4005": 4.37255, - "4010": 4.40982, - "4015": 4.39813, - "4020": 4.43488, - "4025": 4.39111, - "4030": 4.44761, - "4035": 4.40548, - "4040": 4.43553, - "4045": 4.41155, - "4050": 4.40643, - "4055": 4.41393, - "4060": 4.40665, - "4065": 4.41291, - "4070": 4.34904, - "4075": 4.37708, - "4080": 4.35797, - "4085": 4.39736, - "4090": 4.37437, - "4095": 4.35826, - "4100": 4.37323, - "4105": 4.36208, - "4110": 4.32609, - "4115": 4.39421, - "4120": 4.31057, - "4125": 4.31168, - "4130": 4.39302, - "4135": 4.37289, - "4140": 4.31616, - "4145": 4.32788, - "4150": 4.37558, - "4155": 4.29766, - "4160": 4.35633, - "4165": 4.38157, - "4170": 
4.32646, - "4175": 4.33285, - "4180": 4.32735, - "4185": 4.31953, - "4190": 4.31017, - "4195": 4.31525, - "4200": 4.31406, - "4205": 4.37, - "4210": 4.32695, - "4215": 4.3562, - "4220": 4.33701, - "4225": 4.32036, - "4230": 4.30579, - "4235": 4.35051, - "4240": 4.30872, - "4245": 4.31564, - "4250": 4.29999, - "4255": 4.31166, - "4260": 4.29019, - "4265": 4.30554, - "4270": 4.29954, - "4275": 4.36276, - "4280": 4.29798, - "4285": 4.33284, - "4290": 4.27741, - "4295": 4.30368, - "4300": 4.32594, - "4305": 4.29066, - "4310": 4.33408, - "4315": 4.3163, - "4320": 4.30571, - "4325": 4.32764, - "4330": 4.26525, - "4335": 4.30418, - "4340": 4.28838, - "4345": 4.23753, - "4350": 4.25927, - "4355": 4.33009, - "4360": 4.30543, - "4365": 4.30411, - "4370": 4.28149, - "4375": 4.24372, - "4380": 4.25559, - "4385": 4.23331, - "4390": 4.30895, - "4395": 4.27518, - "4400": 4.26254, - "4405": 4.23007, - "4410": 4.28048, - "4415": 4.26816, - "4420": 4.24916, - "4425": 4.29252, - "4430": 4.24244, - "4435": 4.29049, - "4440": 4.28601, - "4445": 4.24232, - "4450": 4.20719, - "4455": 4.26016, - "4460": 4.23459, - "4465": 4.25243, - "4470": 4.23841, - "4475": 4.2641, - "4480": 4.24909, - "4485": 4.23389, - "4490": 4.23593, - "4495": 4.17962, - "4500": 4.25444, - "4505": 4.22942, - "4510": 4.23965, - "4515": 4.19566, - "4520": 4.23113, - "4525": 4.19456, - "4530": 4.24001, - "4535": 4.20166, - "4540": 4.21127, - "4545": 4.23188, - "4550": 4.27088, - "4555": 4.2072, - "4560": 4.22378, - "4565": 4.15426, - "4570": 4.21606, - "4575": 4.1941, - "4580": 4.25747, - "4585": 4.22428, - "4590": 4.21266, - "4595": 4.17399, - "4600": 4.16313, - "4605": 4.2045, - "4610": 4.19939, - "4615": 4.24443, - "4620": 4.16447, - "4625": 4.19099, - "4630": 4.20991, - "4635": 4.18208, - "4640": 4.21078, - "4645": 4.20652, - "4650": 4.22758, - "4655": 4.19246, - "4660": 4.18248, - "4665": 4.193, - "4670": 4.23574, - "4675": 4.17989, - "4680": 4.20859, - "4685": 4.19688, - "4690": 4.1723, - "4695": 4.18485, - 
"4700": 4.16546, - "4705": 4.14067, - "4710": 4.20305, - "4715": 4.19002, - "4720": 4.14737, - "4725": 4.12216, - "4730": 4.17809, - "4735": 4.10178, - "4740": 4.14697, - "4745": 4.18779, - "4750": 4.13615, - "4755": 4.19424, - "4760": 4.1984, - "4765": 4.1461, - "4770": 4.14849, - "4775": 4.14773, - "4780": 4.15523, - "4785": 4.13664, - "4790": 4.19224, - "4795": 4.17628, - "4800": 4.13942, - "4805": 4.17839, - "4810": 4.1375, - "4815": 4.17167, - "4820": 4.12226, - "4825": 4.17474, - "4830": 4.16985, - "4835": 4.14976, - "4840": 4.15298, - "4845": 4.10968, - "4850": 4.17354, - "4855": 4.17639, - "4860": 4.11236, - "4865": 4.13759, - "4870": 4.13215, - "4875": 4.17643, - "4880": 4.1702, - "4885": 4.13029, - "4890": 4.1249, - "4895": 4.12403, - "4900": 4.09958, - "4905": 4.09173, - "4910": 4.09074, - "4915": 4.14665, - "4920": 4.12021, - "4925": 4.08814, - "4930": 4.09778, - "4935": 4.12094, - "4940": 4.04981, - "4945": 4.13369, - "4950": 4.07708, - "4955": 4.15684, - "4960": 4.11652, - "4965": 4.1151, - "4970": 4.09971, - "4975": 4.11736, - "4980": 4.12585, - "4985": 4.12754, - "4990": 4.09005, - "4995": 4.12916, - "5000": 4.05682, - "5005": 4.11701, - "5010": 4.10942, - "5015": 4.07584, - "5020": 4.05201, - "5025": 4.06082, - "5030": 4.10005, - "5035": 4.08177, - "5040": 4.0418, - "5045": 4.11064, - "5050": 4.06425, - "5055": 4.08995, - "5060": 4.03143, - "5065": 4.09666, - "5070": 4.07056, - "5075": 4.12386, - "5080": 4.07795, - "5085": 4.09595, - "5090": 4.07748, - "5095": 4.0424, - "5100": 4.0782, - "5105": 4.0809, - "5110": 4.08612, - "5115": 4.07663, - "5120": 4.09438, - "5125": 4.05976, - "5130": 4.06327, - "5135": 4.0488, - "5140": 4.06922, - "5145": 4.05942, - "5150": 4.07092, - "5155": 4.07553, - "5160": 4.05549, - "5165": 4.09766, - "5170": 3.96642, - "5175": 4.07515, - "5180": 4.03531, - "5185": 4.05861, - "5190": 4.08092, - "5195": 4.04601, - "5200": 4.06577, - "5205": 4.09747, - "5210": 4.01055, - "5215": 4.02373, - "5220": 4.02621, - "5225": 
4.02349, - "5230": 4.06271, - "5235": 4.03585, - "5240": 4.02422, - "5245": 4.04177, - "5250": 4.04544, - "5255": 4.03173, - "5260": 4.04798, - "5265": 4.01495, - "5270": 3.98673, - "5275": 4.00519, - "5280": 4.02024, - "5285": 4.04277, - "5290": 4.00304, - "5295": 4.00093, - "5300": 4.02323, - "5305": 4.01012, - "5310": 4.0478, - "5315": 3.99571, - "5320": 4.03864, - "5325": 4.06497, - "5330": 3.99981, - "5335": 4.02122, - "5340": 3.9739, - "5345": 4.01424, - "5350": 4.0246, - "5355": 4.01714, - "5360": 3.9668, - "5365": 3.98455, - "5370": 4.02892, - "5375": 3.99384, - "5380": 3.98952, - "5385": 4.00787, - "5390": 3.99585, - "5395": 3.932, - "5400": 4.02192, - "5405": 3.94401, - "5410": 4.03103, - "5415": 3.94954, - "5420": 3.98108, - "5425": 3.96619, - "5430": 3.97462, - "5435": 4.00917, - "5440": 3.96082, - "5445": 3.96843, - "5450": 3.98078, - "5455": 3.96312, - "5460": 3.97781, - "5465": 4.03343, - "5470": 3.99301, - "5475": 3.92634, - "5480": 4.0001, - "5485": 3.96789, - "5490": 3.99381, - "5495": 3.99755, - "5500": 3.95394, - "5505": 3.9702, - "5510": 4.00139, - "5515": 3.97886, - "5520": 3.95723, - "5525": 4.01089, - "5530": 3.95723, - "5535": 3.99058, - "5540": 3.95888, - "5545": 3.97704, - "5550": 3.97005, - "5555": 3.93134, - "5560": 3.94203, - "5565": 3.98688, - "5570": 3.94409, - "5575": 3.97691, - "5580": 3.95423, - "5585": 3.89232, - "5590": 3.96662, - "5595": 3.91996, - "5600": 3.97099, - "5605": 3.87423, - "5610": 3.96509, - "5615": 3.9629, - "5620": 3.97882, - "5625": 3.95843, - "5630": 3.94884, - "5635": 3.92989, - "5640": 3.95308, - "5645": 3.91537, - "5650": 3.88759, - "5655": 3.91914, - "5660": 3.9101, - "5665": 3.92739, - "5670": 3.91107, - "5675": 3.94487, - "5680": 3.91238, - "5685": 3.92365, - "5690": 3.92517, - "5695": 3.953, - "5700": 3.88996, - "5705": 3.88995, - "5710": 3.87532, - "5715": 3.99623, - "5720": 3.94505, - "5725": 3.89527, - "5730": 3.94792, - "5735": 3.92817, - "5740": 3.92171, - "5745": 3.89897, - "5750": 3.92176, - 
"5755": 3.94672, - "5760": 3.92632, - "5765": 3.92024, - "5770": 3.95286, - "5775": 3.86965, - "5780": 3.91041, - "5785": 3.91605, - "5790": 3.9236, - "5795": 3.93068, - "5800": 3.86954, - "5805": 3.8764, - "5810": 3.92692, - "5815": 3.89083, - "5820": 3.84021, - "5825": 3.89285, - "5830": 3.85163, - "5835": 3.88292, - "5840": 3.89361, - "5845": 3.91293, - "5850": 3.90508, - "5855": 3.84956, - "5860": 3.87018, - "5865": 3.8979, - "5870": 3.85816, - "5875": 3.89604, - "5880": 3.88075, - "5885": 3.89965, - "5890": 3.90395, - "5895": 3.92339, - "5900": 3.85618, - "5905": 3.92033, - "5910": 3.88782, - "5915": 3.85158, - "5920": 3.88999, - "5925": 3.82174, - "5930": 3.88478, - "5935": 3.86887, - "5940": 3.89924, - "5945": 3.90324, - "5950": 3.88472, - "5955": 3.83758, - "5960": 3.91077, - "5965": 3.85295, - "5970": 3.90592, - "5975": 3.87131, - "5980": 3.94635, - "5985": 3.81828, - "5990": 3.91445, - "5995": 3.82666, - "6000": 3.86389, - "6005": 3.82737, - "6010": 3.84638, - "6015": 3.82528, - "6020": 3.84213, - "6025": 3.8812, - "6030": 3.82864, - "6035": 3.87549, - "6040": 3.85371, - "6045": 3.88892, - "6050": 3.86125, - "6055": 3.84398, - "6060": 3.86538, - "6065": 3.8955, - "6070": 3.844, - "6075": 3.79156, - "6080": 3.86497, - "6085": 3.82767, - "6090": 3.86054, - "6095": 3.85995, - "6100": 3.82399, - "6105": 3.87238, - "6110": 3.80525, - "6115": 3.87931, - "6120": 3.85374, - "6125": 3.85469, - "6130": 3.85122, - "6135": 3.82709, - "6140": 3.8225, - "6145": 3.81264, - "6150": 3.85853, - "6155": 3.83605, - "6160": 3.80232, - "6165": 3.82292, - "6170": 3.81513, - "6175": 3.80691, - "6180": 3.8071, - "6185": 3.84448, - "6190": 3.81178, - "6195": 3.78014, - "6200": 3.80543, - "6205": 3.81219, - "6210": 3.77002, - "6215": 3.82559, - "6220": 3.822, - "6225": 3.82598, - "6230": 3.76955, - "6235": 3.8072, - "6240": 3.73374, - "6245": 3.84624, - "6250": 3.80845, - "6255": 3.8223, - "6260": 3.7948, - "6265": 3.82819, - "6270": 3.75673, - "6275": 3.78492, - "6280": 3.80313, - 
"6285": 3.78154, - "6290": 3.79976, - "6295": 3.80168, - "6300": 3.80756, - "6305": 3.88253, - "6310": 3.7702, - "6315": 3.7633, - "6320": 3.81817, - "6325": 3.75526, - "6330": 3.82862, - "6335": 3.81943, - "6340": 3.76721, - "6345": 3.82391, - "6350": 3.76718, - "6355": 3.77414, - "6360": 3.75111, - "6365": 3.80986, - "6370": 3.81014, - "6375": 3.78548, - "6380": 3.8065, - "6385": 3.82336, - "6390": 3.78289, - "6395": 3.75935, - "6400": 3.76038, - "6405": 3.83749, - "6410": 3.83127, - "6415": 3.7623, - "6420": 3.82306, - "6425": 3.83219, - "6430": 3.81048, - "6435": 3.77764, - "6440": 3.76108, - "6445": 3.80173, - "6450": 3.73884, - "6455": 3.75156, - "6460": 3.77352, - "6465": 3.80905, - "6470": 3.78701, - "6475": 3.78176, - "6480": 3.81548, - "6485": 3.76414, - "6490": 3.71291, - "6495": 3.81407, - "6500": 3.79809, - "6505": 3.72741, - "6510": 3.7976, - "6515": 3.81938, - "6520": 3.73166, - "6525": 3.80464, - "6530": 3.76853, - "6535": 3.76159, - "6540": 3.82675, - "6545": 3.76261, - "6550": 3.76963, - "6555": 3.75505, - "6560": 3.71108, - "6565": 3.70887, - "6570": 3.7465, - "6575": 3.69338, - "6580": 3.81517, - "6585": 3.76239, - "6590": 3.72546, - "6595": 3.74461, - "6600": 3.73687, - "6605": 3.71668, - "6610": 3.72679, - "6615": 3.76079, - "6620": 3.70966, - "6625": 3.72313, - "6630": 3.72114, - "6635": 3.76232, - "6640": 3.73374, - "6645": 3.75061, - "6650": 3.77922, - "6655": 3.70627, - "6660": 3.73531, - "6665": 3.7573, - "6670": 3.71979, - "6675": 3.74124, - "6680": 3.73477, - "6685": 3.76436, - "6690": 3.74256, - "6695": 3.75545, - "6700": 3.74559, - "6705": 3.72882, - "6710": 3.72913, - "6715": 3.69291, - "6720": 3.77736, - "6725": 3.75737, - "6730": 3.73993, - "6735": 3.74082, - "6740": 3.73806, - "6745": 3.72041, - "6750": 3.74412, - "6755": 3.69337, - "6760": 3.68122, - "6765": 3.74232, - "6770": 3.69625, - "6775": 3.74604, - "6780": 3.70485, - "6785": 3.70942, - "6790": 3.73683, - "6795": 3.69846, - "6800": 3.71752, - "6805": 3.72172, - "6810": 
3.73628, - "6815": 3.65876, - "6820": 3.70229, - "6825": 3.72745, - "6830": 3.70872, - "6835": 3.68623, - "6840": 3.67517, - "6845": 3.74818, - "6850": 3.70405, - "6855": 3.73713, - "6860": 3.6695, - "6865": 3.73585, - "6870": 3.6953, - "6875": 3.69781, - "6880": 3.70324, - "6885": 3.67727, - "6890": 3.69236, - "6895": 3.67848, - "6900": 3.68133, - "6905": 3.68771, - "6910": 3.72919, - "6915": 3.73359, - "6920": 3.68934, - "6925": 3.69022, - "6930": 3.68858, - "6935": 3.62056, - "6940": 3.68927, - "6945": 3.67777, - "6950": 3.68038, - "6955": 3.6771, - "6960": 3.68108, - "6965": 3.72225, - "6970": 3.64603, - "6975": 3.72781, - "6980": 3.68459, - "6985": 3.68985, - "6990": 3.7316, - "6995": 3.70495, - "7000": 3.63993, - "7005": 3.71744, - "7010": 3.69223, - "7015": 3.67561, - "7020": 3.72152, - "7025": 3.70969, - "7030": 3.70236, - "7035": 3.65723, - "7040": 3.61488, - "7045": 3.69518, - "7050": 3.71947, - "7055": 3.64991, - "7060": 3.69149, - "7065": 3.74261, - "7070": 3.67108, - "7075": 3.67419, - "7080": 3.71683, - "7085": 3.64191, - "7090": 3.66318, - "7095": 3.63818, - "7100": 3.68341, - "7105": 3.62024, - "7110": 3.68873, - "7115": 3.63797, - "7120": 3.68741, - "7125": 3.63499, - "7130": 3.65311, - "7135": 3.66196, - "7140": 3.66504, - "7145": 3.68183, - "7150": 3.62677, - "7155": 3.69052, - "7160": 3.62415, - "7165": 3.64241, - "7170": 3.68231, - "7175": 3.64603, - "7180": 3.67571, - "7185": 3.70721, - "7190": 3.663, - "7195": 3.66862, - "7200": 3.67265, - "7205": 3.65833, - "7210": 3.68834, - "7215": 3.67282, - "7220": 3.69117, - "7225": 3.66107, - "7230": 3.68593, - "7235": 3.64823, - "7240": 3.64663, - "7245": 3.66574, - "7250": 3.60447, - "7255": 3.62598, - "7260": 3.68023, - "7265": 3.60288, - "7270": 3.63936, - "7275": 3.64805, - "7280": 3.62623, - "7285": 3.65053, - "7290": 3.6735, - "7295": 3.66357, - "7300": 3.62393, - "7305": 3.62784, - "7310": 3.66312, - "7315": 3.67632, - "7320": 3.65015, - "7325": 3.65453, - "7330": 3.62344, - "7335": 3.62574, - 
"7340": 3.64422, - "7345": 3.60533, - "7350": 3.65727, - "7355": 3.64352, - "7360": 3.61779, - "7365": 3.63578, - "7370": 3.6188, - "7375": 3.59366, - "7380": 3.64743, - "7385": 3.67218, - "7390": 3.65876, - "7395": 3.60688, - "7400": 3.65695, - "7405": 3.64945, - "7410": 3.66151, - "7415": 3.64439, - "7420": 3.63591, - "7425": 3.6844, - "7430": 3.63181, - "7435": 3.61154, - "7440": 3.62564, - "7445": 3.60843, - "7450": 3.57301, - "7455": 3.64772, - "7460": 3.63452, - "7465": 3.63169, - "7470": 3.63744, - "7475": 3.64264, - "7480": 3.61171, - "7485": 3.57567, - "7490": 3.57599, - "7495": 3.5863, - "7500": 3.61565, - "7505": 3.59614, - "7510": 3.55707, - "7515": 3.61683, - "7520": 3.60991, - "7525": 3.56658, - "7530": 3.61196, - "7535": 3.62507, - "7540": 3.61046, - "7545": 3.64639, - "7550": 3.65882, - "7555": 3.58595, - "7560": 3.60212, - "7565": 3.59782, - "7570": 3.60603, - "7575": 3.57351, - "7580": 3.62111, - "7585": 3.60137, - "7590": 3.6026, - "7595": 3.66318, - "7600": 3.6076, - "7605": 3.59626, - "7610": 3.58483, - "7615": 3.58478, - "7620": 3.56787, - "7625": 3.62193, - "7630": 3.60469, - "7635": 3.5928, - "7640": 3.59019, - "7645": 3.62279, - "7650": 3.6259, - "7655": 3.66371, - "7660": 3.5305, - "7665": 3.60545, - "7670": 3.59796, - "7675": 3.58201, - "7680": 3.57701, - "7685": 3.64556, - "7690": 3.59102, - "7695": 3.57063, - "7700": 3.63352, - "7705": 3.58816, - "7710": 3.62048, - "7715": 3.5764, - "7720": 3.65561, - "7725": 3.55706, - "7730": 3.57614, - "7735": 3.61006, - "7740": 3.58168, - "7745": 3.58454, - "7750": 3.57422, - "7755": 3.59202, - "7760": 3.56089, - "7765": 3.58551, - "7770": 3.60104, - "7775": 3.57103, - "7780": 3.55457, - "7785": 3.57713, - "7790": 3.57042, - "7795": 3.58792, - "7800": 3.57997, - "7805": 3.58361, - "7810": 3.60683, - "7815": 3.57773, - "7820": 3.57578, - "7825": 3.61835, - "7830": 3.59192, - "7835": 3.52632, - "7840": 3.6194, - "7845": 3.55538, - "7850": 3.51354, - "7855": 3.56599, - "7860": 3.54645, - "7865": 
3.60369, - "7870": 3.54114, - "7875": 3.55695, - "7880": 3.572, - "7885": 3.56229, - "7890": 3.60585, - "7895": 3.59334, - "7900": 3.60641, - "7905": 3.56339, - "7910": 3.58203, - "7915": 3.58298, - "7920": 3.59012, - "7925": 3.5681, - "7930": 3.59927, - "7935": 3.56169, - "7940": 3.60948, - "7945": 3.62723, - "7950": 3.53708, - "7955": 3.54481, - "7960": 3.53124, - "7965": 3.51862, - "7970": 3.52486, - "7975": 3.55975, - "7980": 3.56722, - "7985": 3.54114, - "7990": 3.54399, - "7995": 3.5186, - "8000": 3.57756, - "8005": 3.54643, - "8010": 3.53705, - "8015": 3.53445, - "8020": 3.53111, - "8025": 3.51514, - "8030": 3.54148, - "8035": 3.53478, - "8040": 3.52163, - "8045": 3.57586, - "8050": 3.57789, - "8055": 3.54866, - "8060": 3.5712, - "8065": 3.54757, - "8070": 3.53654, - "8075": 3.52629, - "8080": 3.57467, - "8085": 3.52928, - "8090": 3.53424, - "8095": 3.56313, - "8100": 3.51543, - "8105": 3.54752, - "8110": 3.5453, - "8115": 3.51645, - "8120": 3.52703, - "8125": 3.56437, - "8130": 3.52567, - "8135": 3.53994, - "8140": 3.52104, - "8145": 3.50389, - "8150": 3.52394, - "8155": 3.51178, - "8160": 3.56129, - "8165": 3.54328, - "8170": 3.5116, - "8175": 3.5057, - "8180": 3.57245, - "8185": 3.54733, - "8190": 3.58207, - "8195": 3.55001, - "8200": 3.52156, - "8205": 3.52888, - "8210": 3.53558, - "8215": 3.55713, - "8220": 3.5201, - "8225": 3.51201, - "8230": 3.53756, - "8235": 3.55814, - "8240": 3.54052, - "8245": 3.53652, - "8250": 3.5692, - "8255": 3.51844, - "8260": 3.52912, - "8265": 3.52072, - "8270": 3.52843, - "8275": 3.51526, - "8280": 3.50321, - "8285": 3.52669, - "8290": 3.5272, - "8295": 3.49645, - "8300": 3.51721, - "8305": 3.53958, - "8310": 3.5351, - "8315": 3.50396, - "8320": 3.53046, - "8325": 3.47885, - "8330": 3.44388, - "8335": 3.51457, - "8340": 3.54076, - "8345": 3.49873, - "8350": 3.51134, - "8355": 3.54342, - "8360": 3.51607, - "8365": 3.53716, - "8370": 3.53127, - "8375": 3.48696, - "8380": 3.4848, - "8385": 3.52879, - "8390": 3.49474, - 
"8395": 3.52721, - "8400": 3.49636, - "8405": 3.51685, - "8410": 3.57651, - "8415": 3.48228, - "8420": 3.45216, - "8425": 3.53401, - "8430": 3.53787, - "8435": 3.47534, - "8440": 3.55163, - "8445": 3.53658, - "8450": 3.50995, - "8455": 3.52875, - "8460": 3.53463, - "8465": 3.4708, - "8470": 3.4929, - "8475": 3.55004, - "8480": 3.47555, - "8485": 3.49487, - "8490": 3.48489, - "8495": 3.48023, - "8500": 3.52888, - "8505": 3.46749, - "8510": 3.54064, - "8515": 3.48982, - "8520": 3.49184, - "8525": 3.42254, - "8530": 3.50181, - "8535": 3.52351, - "8540": 3.47484, - "8545": 3.49944, - "8550": 3.46881, - "8555": 3.53517, - "8560": 3.5346, - "8565": 3.48792, - "8570": 3.48883, - "8575": 3.46414, - "8580": 3.50837, - "8585": 3.52994, - "8590": 3.51956, - "8595": 3.52409, - "8600": 3.50319, - "8605": 3.49079, - "8610": 3.49584, - "8615": 3.49483, - "8620": 3.46525, - "8625": 3.4875, - "8630": 3.49269, - "8635": 3.47742, - "8640": 3.46288, - "8645": 3.52844, - "8650": 3.45936, - "8655": 3.50294, - "8660": 3.51093, - "8665": 3.48996, - "8670": 3.50547, - "8675": 3.47414, - "8680": 3.4685, - "8685": 3.48029, - "8690": 3.51264, - "8695": 3.51367, - "8700": 3.48324, - "8705": 3.45351, - "8710": 3.50031, - "8715": 3.45042, - "8720": 3.52876, - "8725": 3.48819, - "8730": 3.47981, - "8735": 3.51018, - "8740": 3.46013, - "8745": 3.50108, - "8750": 3.50543, - "8755": 3.46564, - "8760": 3.48373, - "8765": 3.43955, - "8770": 3.50951, - "8775": 3.47313, - "8780": 3.45782, - "8785": 3.47628, - "8790": 3.4608, - "8795": 3.49675, - "8800": 3.46402, - "8805": 3.43267, - "8810": 3.45044, - "8815": 3.47281, - "8820": 3.43586, - "8825": 3.46906, - "8830": 3.44494, - "8835": 3.42402, - "8840": 3.4361, - "8845": 3.45772, - "8850": 3.48143, - "8855": 3.46505, - "8860": 3.53187, - "8865": 3.46882, - "8870": 3.44869, - "8875": 3.45286, - "8880": 3.45584, - "8885": 3.44986, - "8890": 3.47298, - "8895": 3.45131, - "8900": 3.47879, - "8905": 3.46796, - "8910": 3.45421, - "8915": 3.44293, - "8920": 
3.43345, - "8925": 3.50917, - "8930": 3.49052, - "8935": 3.50073, - "8940": 3.47584, - "8945": 3.47848, - "8950": 3.45717, - "8955": 3.44615, - "8960": 3.43965, - "8965": 3.45818, - "8970": 3.47179, - "8975": 3.42177, - "8980": 3.42266, - "8985": 3.44671, - "8990": 3.50075, - "8995": 3.47255, - "9000": 3.41954, - "9005": 3.46563, - "9010": 3.51573, - "9015": 3.4185, - "9020": 3.43896, - "9025": 3.44768, - "9030": 3.4718, - "9035": 3.37943, - "9040": 3.45501, - "9045": 3.45466, - "9050": 3.49179, - "9055": 3.40312, - "9060": 3.49477, - "9065": 3.51349, - "9070": 3.44713, - "9075": 3.47746, - "9080": 3.47127, - "9085": 3.47459, - "9090": 3.46668, - "9095": 3.42167, - "9100": 3.4227, - "9105": 3.41261, - "9110": 3.45663, - "9115": 3.46481, - "9120": 3.51949, - "9125": 3.44245, - "9130": 3.43654, - "9135": 3.46008, - "9140": 3.47929, - "9145": 3.42408, - "9150": 3.44307, - "9155": 3.45089, - "9160": 3.44998, - "9165": 3.45651, - "9170": 3.47508, - "9175": 3.41133, - "9180": 3.45323, - "9185": 3.41086, - "9190": 3.46875, - "9195": 3.43315, - "9200": 3.44758, - "9205": 3.42373, - "9210": 3.45572, - "9215": 3.39585, - "9220": 3.42327, - "9225": 3.44665, - "9230": 3.37357, - "9235": 3.39456, - "9240": 3.42282, - "9245": 3.40683, - "9250": 3.40791, - "9255": 3.42077, - "9260": 3.39755, - "9265": 3.44216, - "9270": 3.40754, - "9275": 3.42864, - "9280": 3.44334, - "9285": 3.44087, - "9290": 3.45563, - "9295": 3.44456, - "9300": 3.39522, - "9305": 3.42638, - "9310": 3.41593, - "9315": 3.38278, - "9320": 3.3797, - "9325": 3.42046, - "9330": 3.47853, - "9335": 3.38962, - "9340": 3.4706, - "9345": 3.46224, - "9350": 3.42735, - "9355": 3.39326, - "9360": 3.4165, - "9365": 3.41212, - "9370": 3.46155, - "9375": 3.42622, - "9380": 3.36413, - "9385": 3.43469, - "9390": 3.44403, - "9395": 3.45465, - "9400": 3.41582, - "9405": 3.40031, - "9410": 3.43744, - "9415": 3.42574, - "9420": 3.40295, - "9425": 3.42063, - "9430": 3.3935, - "9435": 3.41529, - "9440": 3.40125, - "9445": 3.39961, - 
"9450": 3.39469, - "9455": 3.4008, - "9460": 3.46489, - "9465": 3.46303, - "9470": 3.40478, - "9475": 3.45335, - "9480": 3.40789, - "9485": 3.3998, - "9490": 3.41154, - "9495": 3.44387, - "9500": 3.40535, - "9505": 3.37735, - "9510": 3.41645, - "9515": 3.41113, - "9520": 3.43045, - "9525": 3.40102, - "9530": 3.40027, - "9535": 3.42216 - } - }, - "iteration-time": { - "start_step": 1, - "end_step": 9535, - "step_interval": 5, - "values": { - "1": 241.22832, - "5": 11.6467, - "10": 11.59177, - "15": 11.54982, - "20": 11.50554, - "25": 11.48401, - "30": 11.47019, - "35": 11.4638, - "40": 11.44621, - "45": 11.45505, - "50": 11.48551, - "55": 11.47505, - "60": 11.46559, - "65": 11.69276, - "70": 11.51491, - "75": 11.58841, - "80": 11.59402, - "85": 11.55505, - "90": 11.57827, - "95": 11.6084, - "100": 11.72328, - "105": 11.84735, - "110": 11.81445, - "115": 12.01469, - "120": 12.27052, - "125": 12.40894, - "130": 12.32306, - "135": 12.6537, - "140": 12.87941, - "145": 12.87274, - "150": 13.17646, - "155": 13.42132, - "160": 13.29203, - "165": 13.33468, - "170": 13.38365, - "175": 13.29143, - "180": 13.37704, - "185": 13.17491, - "190": 13.2207, - "195": 13.0407, - "200": 13.03378, - "205": 12.93499, - "210": 12.93302, - "215": 12.83429, - "220": 12.77504, - "225": 12.71437, - "230": 12.67462, - "235": 12.7241, - "240": 12.78341, - "245": 12.61372, - "250": 12.60968, - "255": 12.49502, - "260": 12.38655, - "265": 12.35372, - "270": 12.32939, - "275": 12.25213, - "280": 12.23412, - "285": 12.25047, - "290": 12.1386, - "295": 12.11066, - "300": 12.11487, - "305": 12.08746, - "310": 12.06842, - "315": 12.13334, - "320": 12.12044, - "325": 12.01351, - "330": 11.97276, - "335": 11.951, - "340": 11.97582, - "345": 11.94178, - "350": 11.90942, - "355": 11.9474, - "360": 11.94231, - "365": 11.91539, - "370": 11.89051, - "375": 11.87871, - "380": 11.8539, - "385": 11.81422, - "390": 11.82072, - "395": 11.85516, - "400": 11.8322, - "405": 11.81286, - "410": 11.81008, - "415": 
11.76854, - "420": 11.7721, - "425": 11.7287, - "430": 11.80281, - "435": 11.76948, - "440": 11.78237, - "445": 11.81223, - "450": 11.76024, - "455": 11.83905, - "460": 11.86797, - "465": 11.88193, - "470": 11.94544, - "475": 12.03403, - "480": 11.8718, - "485": 11.96463, - "490": 11.9543, - "495": 11.99738, - "500": 12.06608, - "505": 12.04813, - "510": 12.09706, - "515": 12.14335, - "520": 12.36581, - "525": 12.19115, - "530": 12.1887, - "535": 12.25354, - "540": 12.27902, - "545": 12.32347, - "550": 12.44366, - "555": 12.25807, - "560": 12.22369, - "565": 12.28956, - "570": 12.31572, - "575": 12.28835, - "580": 12.33571, - "585": 12.26567, - "590": 12.30079, - "595": 12.29151, - "600": 12.30023, - "605": 12.45501, - "610": 12.27373, - "615": 12.217, - "620": 12.22334, - "625": 12.21274, - "630": 12.21904, - "635": 12.20277, - "640": 12.25538, - "645": 12.19988, - "650": 12.14026, - "655": 12.14302, - "660": 12.14678, - "665": 12.13972, - "670": 12.11485, - "675": 12.0282, - "680": 12.01901, - "685": 11.98462, - "690": 11.98742, - "695": 11.95917, - "700": 11.92521, - "705": 18.38779, - "710": 11.92438, - "715": 11.8274, - "720": 11.90138, - "725": 11.84998, - "730": 11.83009, - "735": 11.89248, - "740": 11.82364, - "745": 11.91839, - "750": 11.9577, - "755": 11.85056, - "760": 11.90523, - "765": 11.9116, - "770": 11.83717, - "775": 12.05864, - "780": 11.84895, - "785": 11.84375, - "790": 11.86493, - "795": 11.85763, - "800": 11.94365, - "805": 11.86899, - "810": 11.86748, - "815": 11.86393, - "820": 11.87992, - "825": 11.85259, - "830": 11.86886, - "835": 11.8517, - "840": 11.86254, - "845": 11.89508, - "850": 11.85613, - "855": 11.87434, - "860": 11.90703, - "865": 11.83224, - "870": 11.88246, - "875": 11.9305, - "880": 11.96022, - "885": 11.81651, - "890": 12.06642, - "895": 11.92653, - "900": 11.86469, - "905": 12.01767, - "910": 11.89635, - "915": 11.8254, - "920": 11.86106, - "925": 11.88434, - "930": 11.97059, - "935": 12.03718, - "940": 11.87698, - "945": 
11.88008, - "950": 12.02071, - "955": 11.84843, - "960": 244.37245, - "965": 12.32084, - "970": 11.86341, - "975": 12.01988, - "980": 11.92166, - "985": 11.85411, - "990": 11.87753, - "995": 11.84786, - "1000": 11.89892, - "1005": 11.99759, - "1010": 11.91045, - "1015": 11.87038, - "1020": 11.85674, - "1025": 11.85567, - "1030": 11.86674, - "1035": 11.92499, - "1040": 11.85969, - "1045": 12.04929, - "1050": 11.82341, - "1055": 11.83111, - "1060": 11.87567, - "1065": 11.84584, - "1070": 11.93603, - "1075": 11.87121, - "1080": 11.85935, - "1085": 11.88667, - "1090": 11.86058, - "1095": 11.86482, - "1100": 11.82375, - "1105": 11.86482, - "1110": 11.89668, - "1115": 11.94941, - "1120": 11.84941, - "1125": 11.94466, - "1130": 11.90846, - "1135": 11.8602, - "1140": 11.86926, - "1145": 11.90365, - "1150": 11.88788, - "1155": 11.81781, - "1160": 11.88464, - "1165": 11.85344, - "1170": 11.8865, - "1175": 11.93361, - "1180": 11.89647, - "1185": 11.9031, - "1190": 11.89287, - "1195": 11.88683, - "1200": 11.85927, - "1205": 11.92471, - "1210": 11.85592, - "1215": 17.4276, - "1220": 11.87359, - "1225": 11.9296, - "1230": 11.95025, - "1235": 11.90738, - "1240": 11.86621, - "1245": 11.98001, - "1250": 12.003, - "1255": 11.91396, - "1260": 11.92279, - "1265": 11.85195, - "1270": 11.87463, - "1275": 11.90307, - "1280": 11.84637, - "1285": 11.95883, - "1290": 11.88039, - "1295": 11.8399, - "1300": 11.81976, - "1305": 11.89766, - "1310": 11.91584, - "1315": 12.12571, - "1320": 12.05556, - "1325": 11.84679, - "1330": 11.94985, - "1335": 11.94039, - "1340": 12.00572, - "1345": 11.98268, - "1350": 12.15927, - "1355": 12.04312, - "1360": 11.98816, - "1365": 11.95737, - "1370": 11.92395, - "1375": 11.89595, - "1380": 11.88635, - "1385": 11.96617, - "1390": 11.87421, - "1395": 12.02833, - "1400": 11.87415, - "1405": 11.85875, - "1410": 11.85419, - "1415": 11.8978, - "1420": 11.86309, - "1425": 11.87505, - "1430": 12.10339, - "1435": 11.88151, - "1440": 12.15068, - "1445": 11.98493, - 
"1450": 11.95438, - "1455": 12.03808, - "1460": 11.85293, - "1465": 11.93176, - "1470": 11.92246, - "1475": 11.90448, - "1480": 11.98959, - "1485": 11.93685, - "1490": 11.92389, - "1495": 11.95047, - "1500": 11.94526, - "1505": 11.9086, - "1510": 11.95225, - "1515": 11.87405, - "1520": 11.87975, - "1525": 11.88264, - "1530": 12.04989, - "1535": 12.02942, - "1540": 11.93089, - "1545": 11.89376, - "1550": 11.88596, - "1555": 11.95001, - "1560": 11.90239, - "1565": 11.89699, - "1570": 11.91441, - "1575": 11.87813, - "1580": 11.86939, - "1585": 11.8566, - "1590": 11.8665, - "1595": 11.90861, - "1600": 11.90425, - "1605": 11.82248, - "1610": 11.86531, - "1615": 11.8796, - "1620": 11.87587, - "1625": 11.88944, - "1630": 11.88839, - "1635": 11.8307, - "1640": 11.87082, - "1645": 11.84687, - "1650": 11.87887, - "1655": 11.85709, - "1660": 11.85167, - "1665": 11.90284, - "1670": 11.85205, - "1675": 12.00742, - "1680": 11.90754, - "1685": 11.97458, - "1690": 11.97016, - "1695": 11.9189, - "1700": 11.89709, - "1705": 11.88042, - "1710": 11.87879, - "1715": 12.06779, - "1720": 11.98631, - "1725": 12.01044, - "1730": 11.9924, - "1735": 11.87648, - "1740": 11.87455, - "1745": 11.93461, - "1750": 11.90235, - "1755": 11.97053, - "1760": 11.89545, - "1765": 11.8564, - "1770": 11.92635, - "1775": 11.91815, - "1780": 11.91235, - "1785": 11.85546, - "1790": 11.93087, - "1795": 11.91138, - "1800": 11.95901, - "1805": 12.0529, - "1810": 11.98858, - "1815": 12.13997, - "1820": 11.94798, - "1825": 11.97682, - "1830": 11.91244, - "1835": 11.94888, - "1840": 11.93666, - "1845": 11.87312, - "1850": 11.86327, - "1855": 11.94769, - "1860": 12.00187, - "1865": 12.06916, - "1870": 11.99528, - "1875": 11.89416, - "1880": 12.02292, - "1885": 12.04249, - "1890": 11.94094, - "1895": 11.93619, - "1900": 11.95301, - "1905": 11.85793, - "1910": 11.96264, - "1915": 11.92826, - "1920": 11.94216, - "1925": 12.01307, - "1930": 11.98891, - "1935": 11.95834, - "1940": 11.92143, - "1945": 11.98459, - "1950": 
16.97099, - "1955": 11.89147, - "1960": 11.94643, - "1965": 11.92486, - "1970": 11.91542, - "1975": 13.09741, - "1980": 12.02148, - "1985": 11.92812, - "1990": 12.01102, - "1995": 11.94891, - "2000": 12.06741, - "2005": 11.94166, - "2010": 11.95871, - "2015": 12.00042, - "2020": 11.99101, - "2025": 11.95463, - "2030": 12.36755, - "2035": 11.96199, - "2040": 11.97863, - "2045": 12.01033, - "2050": 12.0643, - "2055": 11.96928, - "2060": 11.98383, - "2065": 11.92648, - "2070": 11.92379, - "2075": 11.97669, - "2080": 11.95508, - "2085": 11.94472, - "2090": 11.9663, - "2095": 11.93695, - "2100": 11.97178, - "2105": 11.98764, - "2110": 11.9516, - "2115": 11.9215, - "2120": 11.95207, - "2125": 11.95947, - "2130": 11.96722, - "2135": 11.97924, - "2140": 11.88777, - "2145": 11.95546, - "2150": 11.90266, - "2155": 11.97573, - "2160": 11.93275, - "2165": 11.98593, - "2170": 11.9842, - "2175": 12.00145, - "2180": 11.99219, - "2185": 11.96424, - "2190": 11.94313, - "2195": 11.93489, - "2200": 11.94356, - "2205": 12.00157, - "2210": 11.97153, - "2215": 11.9563, - "2220": 12.14117, - "2225": 11.97066, - "2230": 12.00037, - "2235": 11.95279, - "2240": 11.9544, - "2245": 11.97031, - "2250": 11.92229, - "2255": 11.98097, - "2260": 11.96529, - "2265": 11.98619, - "2270": 12.02117, - "2275": 11.94865, - "2280": 12.02569, - "2285": 11.98203, - "2290": 12.10479, - "2295": 11.95346, - "2300": 11.99961, - "2305": 11.96025, - "2310": 11.98746, - "2315": 11.95209, - "2320": 12.02644, - "2325": 11.95369, - "2330": 11.91985, - "2335": 11.93244, - "2340": 11.97061, - "2345": 11.90115, - "2350": 11.99136, - "2355": 12.0541, - "2360": 12.03728, - "2365": 11.95319, - "2370": 11.8917, - "2375": 11.94629, - "2380": 11.9087, - "2385": 11.91696, - "2390": 11.90123, - "2395": 11.87998, - "2400": 12.02954, - "2405": 11.97917, - "2410": 11.98456, - "2415": 11.9575, - "2420": 11.95917, - "2425": 11.95788, - "2430": 11.99944, - "2435": 12.00043, - "2440": 11.91339, - "2445": 11.97889, - "2450": 11.93997, 
- "2455": 11.91834, - "2460": 11.98321, - "2465": 11.94509, - "2470": 11.93387, - "2475": 11.9562, - "2480": 11.93148, - "2485": 11.94432, - "2490": 11.95477, - "2495": 11.94334, - "2500": 11.9284, - "2505": 11.93757, - "2510": 11.92289, - "2515": 11.97869, - "2520": 11.94858, - "2525": 11.96606, - "2530": 11.90894, - "2535": 11.95425, - "2540": 11.89136, - "2545": 11.94553, - "2550": 11.98026, - "2555": 11.93376, - "2560": 11.94866, - "2565": 11.92767, - "2570": 11.93583, - "2575": 11.97284, - "2580": 11.98911, - "2585": 11.95484, - "2590": 11.96399, - "2595": 11.96211, - "2600": 11.93906, - "2605": 11.9733, - "2610": 12.01872, - "2615": 11.99897, - "2620": 11.90926, - "2625": 11.93248, - "2630": 11.92842, - "2635": 11.94338, - "2640": 11.94678, - "2645": 11.95901, - "2650": 11.9296, - "2655": 12.02405, - "2660": 12.0166, - "2665": 12.01166, - "2670": 11.90595, - "2675": 11.98569, - "2680": 12.0118, - "2685": 11.92029, - "2690": 11.93111, - "2695": 12.00369, - "2700": 11.94818, - "2705": 11.99119, - "2710": 11.93978, - "2715": 11.9296, - "2720": 11.93044, - "2725": 11.94343, - "2730": 12.02248, - "2735": 11.95389, - "2740": 11.94611, - "2745": 11.92776, - "2750": 11.91647, - "2755": 11.9522, - "2760": 11.95012, - "2765": 11.96707, - "2770": 11.94892, - "2775": 11.9867, - "2780": 11.96897, - "2785": 11.97268, - "2790": 12.01936, - "2795": 11.97259, - "2800": 12.01028, - "2805": 11.94892, - "2810": 12.04828, - "2815": 11.93469, - "2820": 11.94568, - "2825": 11.92529, - "2830": 11.97458, - "2835": 11.99475, - "2840": 11.94984, - "2845": 11.93356, - "2850": 12.05796, - "2855": 11.99065, - "2860": 11.96077, - "2865": 11.9377, - "2870": 11.97627, - "2875": 11.97986, - "2880": 11.97201, - "2885": 11.91879, - "2890": 11.93586, - "2895": 12.00661, - "2900": 11.94616, - "2905": 11.94376, - "2910": 11.94168, - "2915": 11.94867, - "2920": 11.99355, - "2925": 11.94779, - "2930": 11.97133, - "2935": 11.96256, - "2940": 11.97787, - "2945": 11.93759, - "2950": 11.91863, - "2955": 
11.98973, - "2960": 12.00486, - "2965": 11.91623, - "2970": 11.94846, - "2975": 11.91534, - "2980": 11.97787, - "2985": 12.385, - "2990": 11.88498, - "2995": 11.92173, - "3000": 11.90561, - "3005": 11.86795, - "3010": 11.88075, - "3015": 11.87833, - "3020": 11.98777, - "3025": 11.90078, - "3030": 11.98251, - "3035": 11.92211, - "3040": 11.91067, - "3045": 12.04371, - "3050": 11.91886, - "3055": 11.952, - "3060": 11.90649, - "3065": 11.86917, - "3070": 11.86601, - "3075": 11.92435, - "3080": 11.98092, - "3085": 11.94809, - "3090": 12.20304, - "3095": 11.87329, - "3100": 11.92696, - "3105": 11.85799, - "3110": 11.84125, - "3115": 11.82558, - "3120": 11.87566, - "3125": 11.89426, - "3130": 11.85869, - "3135": 11.92893, - "3140": 11.97022, - "3145": 11.84939, - "3150": 11.9785, - "3155": 11.92499, - "3160": 11.8889, - "3165": 11.87938, - "3170": 11.95555, - "3175": 11.91883, - "3180": 11.85842, - "3185": 11.9325, - "3190": 11.86061, - "3195": 11.90479, - "3200": 11.85963, - "3205": 11.91214, - "3210": 11.9243, - "3215": 11.8472, - "3220": 11.86665, - "3225": 11.89836, - "3230": 11.86299, - "3235": 11.89396, - "3240": 11.87482, - "3245": 11.86774, - "3250": 11.86673, - "3255": 11.88133, - "3260": 11.9014, - "3265": 11.92289, - "3270": 11.98401, - "3275": 11.95198, - "3280": 11.87392, - "3285": 11.89268, - "3290": 11.88963, - "3295": 11.91043, - "3300": 11.89803, - "3305": 11.87011, - "3310": 11.84465, - "3315": 11.84015, - "3320": 11.88334, - "3325": 11.93368, - "3330": 11.83472, - "3335": 11.86862, - "3340": 11.87575, - "3345": 11.94875, - "3350": 11.93528, - "3355": 11.81967, - "3360": 11.95954, - "3365": 11.88024, - "3370": 11.88333, - "3375": 11.85751, - "3380": 11.88742, - "3385": 11.9179, - "3390": 11.83242, - "3395": 11.96084, - "3400": 11.88213, - "3405": 11.86112, - "3410": 11.8407, - "3415": 11.92255, - "3420": 11.91997, - "3425": 11.88372, - "3430": 11.8672, - "3435": 11.85235, - "3440": 11.84935, - "3445": 11.93228, - "3450": 11.85166, - "3455": 11.9026, - 
"3460": 11.99596, - "3465": 11.88838, - "3470": 11.90065, - "3475": 11.92033, - "3480": 11.87265, - "3485": 11.89235, - "3490": 11.89267, - "3495": 11.97544, - "3500": 11.92819, - "3505": 11.82459, - "3510": 11.90756, - "3515": 11.92021, - "3520": 11.88124, - "3525": 11.86983, - "3530": 11.90548, - "3535": 11.94666, - "3540": 11.93322, - "3545": 11.90904, - "3550": 11.85224, - "3555": 11.886, - "3560": 11.93583, - "3565": 11.87294, - "3570": 11.86107, - "3575": 11.83618, - "3580": 11.94649, - "3585": 11.8886, - "3590": 12.01796, - "3595": 11.86065, - "3600": 11.96008, - "3605": 11.94154, - "3610": 11.91928, - "3615": 11.88551, - "3620": 11.8865, - "3625": 11.86807, - "3630": 11.98152, - "3635": 11.87685, - "3640": 11.89995, - "3645": 11.86485, - "3650": 11.94291, - "3655": 11.86472, - "3660": 11.84946, - "3665": 11.90789, - "3670": 11.86396, - "3675": 12.07226, - "3680": 11.8654, - "3685": 11.90154, - "3690": 11.87282, - "3695": 11.84993, - "3700": 11.92847, - "3705": 11.85848, - "3710": 11.86691, - "3715": 11.93176, - "3720": 11.86996, - "3725": 11.92665, - "3730": 11.90876, - "3735": 11.83597, - "3740": 11.8819, - "3745": 11.90119, - "3750": 11.90765, - "3755": 11.89791, - "3760": 11.91124, - "3765": 11.95606, - "3770": 11.93789, - "3775": 11.87152, - "3780": 11.89754, - "3785": 11.8704, - "3790": 11.88079, - "3795": 11.89363, - "3800": 11.88641, - "3805": 11.87724, - "3810": 11.86303, - "3815": 11.96793, - "3820": 11.97071, - "3825": 11.90678, - "3830": 11.84478, - "3835": 11.86339, - "3840": 11.84359, - "3845": 11.85381, - "3850": 11.89843, - "3855": 11.83659, - "3860": 11.8253, - "3865": 11.82796, - "3870": 11.93815, - "3875": 11.87584, - "3880": 11.85716, - "3885": 11.85848, - "3890": 11.84472, - "3895": 11.85001, - "3900": 11.90416, - "3905": 11.87723, - "3910": 11.90409, - "3915": 11.88375, - "3920": 11.9526, - "3925": 11.8796, - "3930": 11.92607, - "3935": 12.02111, - "3940": 11.89989, - "3945": 11.96829, - "3950": 11.92362, - "3955": 11.91298, - "3960": 
11.93391, - "3965": 11.9977, - "3970": 11.91134, - "3975": 11.87698, - "3980": 11.84039, - "3985": 11.8296, - "3990": 11.8824, - "3995": 12.03103, - "4000": 12.53061, - "4005": 11.99032, - "4010": 11.94569, - "4015": 12.02459, - "4020": 12.05098, - "4025": 11.9408, - "4030": 11.9872, - "4035": 11.91882, - "4040": 11.91053, - "4045": 11.94764, - "4050": 11.96252, - "4055": 11.92924, - "4060": 11.95584, - "4065": 11.96477, - "4070": 11.95333, - "4075": 11.95009, - "4080": 11.94196, - "4085": 11.96679, - "4090": 12.09863, - "4095": 12.09521, - "4100": 11.99854, - "4105": 12.05345, - "4110": 11.99127, - "4115": 12.05731, - "4120": 11.95072, - "4125": 12.09249, - "4130": 12.04972, - "4135": 11.892, - "4140": 11.93048, - "4145": 11.92862, - "4150": 12.00088, - "4155": 11.95542, - "4160": 12.01499, - "4165": 11.90691, - "4170": 11.99204, - "4175": 12.02661, - "4180": 12.08762, - "4185": 11.93626, - "4190": 11.96513, - "4195": 11.9247, - "4200": 11.89449, - "4205": 11.95353, - "4210": 11.90984, - "4215": 11.92857, - "4220": 11.99809, - "4225": 12.01358, - "4230": 12.00065, - "4235": 11.95146, - "4240": 12.12674, - "4245": 11.99718, - "4250": 11.98808, - "4255": 11.95388, - "4260": 11.91437, - "4265": 11.97358, - "4270": 11.99013, - "4275": 11.95746, - "4280": 11.9273, - "4285": 11.92873, - "4290": 11.94103, - "4295": 11.93054, - "4300": 11.92986, - "4305": 12.11627, - "4310": 11.95471, - "4315": 11.96985, - "4320": 12.03911, - "4325": 12.01041, - "4330": 11.93084, - "4335": 11.95171, - "4340": 12.03209, - "4345": 11.94503, - "4350": 11.95426, - "4355": 12.08714, - "4360": 12.18212, - "4365": 11.94575, - "4370": 11.96598, - "4375": 12.00939, - "4380": 12.08808, - "4385": 11.9772, - "4390": 12.02704, - "4395": 12.01062, - "4400": 11.94619, - "4405": 11.98609, - "4410": 11.98025, - "4415": 11.99156, - "4420": 11.96913, - "4425": 12.02991, - "4430": 11.98417, - "4435": 12.07654, - "4440": 12.09429, - "4445": 11.9962, - "4450": 11.91032, - "4455": 11.99724, - "4460": 11.94549, 
- "4465": 11.92313, - "4470": 11.98709, - "4475": 11.9946, - "4480": 12.041, - "4485": 11.98684, - "4490": 12.00793, - "4495": 11.96519, - "4500": 11.91768, - "4505": 11.93855, - "4510": 11.96344, - "4515": 11.93266, - "4520": 11.99772, - "4525": 12.00265, - "4530": 12.00144, - "4535": 11.93099, - "4540": 11.9976, - "4545": 12.04415, - "4550": 11.92104, - "4555": 11.97762, - "4560": 12.05513, - "4565": 12.08413, - "4570": 12.00561, - "4575": 12.03402, - "4580": 12.07435, - "4585": 11.91157, - "4590": 11.93266, - "4595": 12.00575, - "4600": 11.98764, - "4605": 12.07608, - "4610": 11.98608, - "4615": 12.23058, - "4620": 11.96992, - "4625": 11.98931, - "4630": 11.92725, - "4635": 11.94909, - "4640": 11.94336, - "4645": 11.95955, - "4650": 11.99978, - "4655": 11.95199, - "4660": 11.97643, - "4665": 12.03686, - "4670": 12.0499, - "4675": 11.98439, - "4680": 12.00394, - "4685": 11.97515, - "4690": 11.95102, - "4695": 12.07552, - "4700": 11.9222, - "4705": 11.97387, - "4710": 11.99203, - "4715": 11.93004, - "4720": 11.97237, - "4725": 12.00277, - "4730": 12.00835, - "4735": 11.97435, - "4740": 11.98233, - "4745": 11.92423, - "4750": 11.95154, - "4755": 12.02084, - "4760": 11.94378, - "4765": 11.95313, - "4770": 11.92338, - "4775": 11.92352, - "4780": 12.00277, - "4785": 11.94768, - "4790": 11.97296, - "4795": 11.98757, - "4800": 12.26361, - "4805": 11.90736, - "4810": 11.9844, - "4815": 12.04212, - "4820": 11.98762, - "4825": 12.89959, - "4830": 11.9442, - "4835": 12.35106, - "4840": 11.93828, - "4845": 11.92418, - "4850": 11.96443, - "4855": 12.03431, - "4860": 12.04422, - "4865": 11.9646, - "4870": 11.91857, - "4875": 11.95672, - "4880": 11.9198, - "4885": 11.96783, - "4890": 11.94953, - "4895": 11.96692, - "4900": 12.04475, - "4905": 12.05877, - "4910": 12.15039, - "4915": 12.15039, - "4920": 11.95008, - "4925": 11.96843, - "4930": 11.958, - "4935": 11.98531, - "4940": 11.90874, - "4945": 11.95752, - "4950": 12.01284, - "4955": 11.97799, - "4960": 11.99989, - "4965": 
11.9277, - "4970": 12.06095, - "4975": 11.95713, - "4980": 12.02719, - "4985": 11.96446, - "4990": 11.92043, - "4995": 11.99522, - "5000": 12.0792, - "5005": 11.95462, - "5010": 18.30939, - "5015": 12.57034, - "5020": 12.13652, - "5025": 11.95064, - "5030": 11.93538, - "5035": 12.01779, - "5040": 11.8639, - "5045": 11.89312, - "5050": 11.93054, - "5055": 11.89904, - "5060": 11.88635, - "5065": 11.89505, - "5070": 11.95957, - "5075": 11.96591, - "5080": 11.85594, - "5085": 11.87343, - "5090": 11.89162, - "5095": 11.9231, - "5100": 11.9213, - "5105": 11.9793, - "5110": 11.92942, - "5115": 11.87025, - "5120": 11.84167, - "5125": 11.92967, - "5130": 11.90523, - "5135": 11.8727, - "5140": 11.95822, - "5145": 11.97795, - "5150": 11.90614, - "5155": 11.88276, - "5160": 11.94188, - "5165": 11.91373, - "5170": 12.01192, - "5175": 11.85511, - "5180": 11.84375, - "5185": 11.88965, - "5190": 11.88542, - "5195": 11.85346, - "5200": 11.94188, - "5205": 11.92082, - "5210": 11.8821, - "5215": 11.92239, - "5220": 11.90608, - "5225": 11.8947, - "5230": 11.88619, - "5235": 11.8948, - "5240": 11.89599, - "5245": 11.88662, - "5250": 11.95415, - "5255": 11.96527, - "5260": 11.89009, - "5265": 11.87997, - "5270": 11.94016, - "5275": 11.89138, - "5280": 11.90447, - "5285": 11.86453, - "5290": 11.90845, - "5295": 11.89373, - "5300": 11.96084, - "5305": 12.00505, - "5310": 11.87874, - "5315": 11.94047, - "5320": 11.90115, - "5325": 11.8657, - "5330": 11.98456, - "5335": 11.89142, - "5340": 11.94056, - "5345": 11.88326, - "5350": 12.02941, - "5355": 11.94937, - "5360": 11.84158, - "5365": 11.85236, - "5370": 11.89414, - "5375": 11.92681, - "5380": 11.89983, - "5385": 11.93247, - "5390": 11.88545, - "5395": 11.85963, - "5400": 11.87187, - "5405": 11.92558, - "5410": 11.94364, - "5415": 11.9087, - "5420": 11.86332, - "5425": 11.92767, - "5430": 11.87425, - "5435": 11.91049, - "5440": 11.87699, - "5445": 11.93171, - "5450": 11.90161, - "5455": 11.921, - "5460": 11.88038, - "5465": 11.91315, - 
"5470": 11.89728, - "5475": 11.95689, - "5480": 11.98965, - "5485": 11.91576, - "5490": 11.89757, - "5495": 11.93064, - "5500": 11.88252, - "5505": 11.96073, - "5510": 11.86654, - "5515": 11.87886, - "5520": 11.90936, - "5525": 12.03373, - "5530": 11.90318, - "5535": 11.92154, - "5540": 11.90086, - "5545": 11.89022, - "5550": 11.90225, - "5555": 11.83513, - "5560": 11.91062, - "5565": 11.87125, - "5570": 11.87145, - "5575": 11.86357, - "5580": 11.91841, - "5585": 11.92436, - "5590": 11.9023, - "5595": 11.86709, - "5600": 11.91375, - "5605": 11.90872, - "5610": 11.8916, - "5615": 11.95578, - "5620": 11.89294, - "5625": 11.90784, - "5630": 11.92391, - "5635": 11.89956, - "5640": 11.89869, - "5645": 11.91776, - "5650": 11.9431, - "5655": 11.89517, - "5660": 11.88968, - "5665": 11.89529, - "5670": 11.91051, - "5675": 11.91888, - "5680": 11.90991, - "5685": 11.93985, - "5690": 11.90708, - "5695": 11.8876, - "5700": 11.95923, - "5705": 11.93355, - "5710": 11.87364, - "5715": 11.9268, - "5720": 11.98226, - "5725": 11.87678, - "5730": 11.83368, - "5735": 11.89468, - "5740": 11.90674, - "5745": 11.88476, - "5750": 11.86646, - "5755": 11.88929, - "5760": 11.85649, - "5765": 11.85565, - "5770": 11.93646, - "5775": 11.90704, - "5780": 12.04897, - "5785": 11.91885, - "5790": 11.90414, - "5795": 11.92795, - "5800": 11.9484, - "5805": 11.9947, - "5810": 11.88562, - "5815": 11.89893, - "5820": 11.86069, - "5825": 11.85602, - "5830": 11.90577, - "5835": 11.90369, - "5840": 11.95291, - "5845": 11.93547, - "5850": 11.89776, - "5855": 11.89365, - "5860": 11.88809, - "5865": 11.89502, - "5870": 11.90093, - "5875": 11.89463, - "5880": 11.85877, - "5885": 11.91775, - "5890": 11.9362, - "5895": 11.90238, - "5900": 11.89416, - "5905": 11.9161, - "5910": 11.91617, - "5915": 11.89704, - "5920": 11.86193, - "5925": 11.94942, - "5930": 11.85147, - "5935": 11.87033, - "5940": 11.9311, - "5945": 11.96348, - "5950": 11.96932, - "5955": 11.90137, - "5960": 11.87563, - "5965": 11.86128, - "5970": 
11.99512, - "5975": 11.92846, - "5980": 11.83738, - "5985": 11.88075, - "5990": 11.89265, - "5995": 11.92537, - "6000": 11.88009, - "6005": 11.9523, - "6010": 11.93509, - "6015": 11.89766, - "6020": 11.88045, - "6025": 11.87641, - "6030": 246.60413, - "6035": 12.33879, - "6040": 11.91607, - "6045": 11.95709, - "6050": 11.93381, - "6055": 11.91355, - "6060": 11.91286, - "6065": 11.97819, - "6070": 11.93373, - "6075": 11.85049, - "6080": 11.96747, - "6085": 11.93318, - "6090": 11.93239, - "6095": 11.8622, - "6100": 11.88525, - "6105": 11.97899, - "6110": 11.91577, - "6115": 11.92755, - "6120": 11.92296, - "6125": 11.99725, - "6130": 11.97753, - "6135": 11.92108, - "6140": 11.91607, - "6145": 11.9071, - "6150": 11.92499, - "6155": 11.91611, - "6160": 12.01604, - "6165": 11.89838, - "6170": 11.90254, - "6175": 11.96493, - "6180": 11.84452, - "6185": 11.91052, - "6190": 11.8712, - "6195": 11.90582, - "6200": 11.90605, - "6205": 11.98397, - "6210": 11.92035, - "6215": 11.96579, - "6220": 11.99275, - "6225": 11.88749, - "6230": 11.89369, - "6235": 11.95748, - "6240": 11.93057, - "6245": 11.94912, - "6250": 11.9372, - "6255": 11.90439, - "6260": 11.92527, - "6265": 11.95201, - "6270": 11.9095, - "6275": 11.97821, - "6280": 11.94458, - "6285": 11.90287, - "6290": 11.89278, - "6295": 11.96073, - "6300": 11.90554, - "6305": 11.88653, - "6310": 11.8962, - "6315": 11.93036, - "6320": 11.95396, - "6325": 11.94894, - "6330": 12.04569, - "6335": 11.88055, - "6340": 11.91066, - "6345": 11.89024, - "6350": 11.89994, - "6355": 11.92221, - "6360": 11.92333, - "6365": 11.91761, - "6370": 11.97313, - "6375": 11.90689, - "6380": 12.08922, - "6385": 11.94942, - "6390": 11.91702, - "6395": 11.90139, - "6400": 11.89012, - "6405": 11.9541, - "6410": 12.00044, - "6415": 11.89967, - "6420": 11.86695, - "6425": 11.87294, - "6430": 11.89524, - "6435": 11.94881, - "6440": 11.91361, - "6445": 11.91243, - "6450": 11.90246, - "6455": 11.88301, - "6460": 11.94133, - "6465": 11.95353, - "6470": 
11.93545, - "6475": 11.91767, - "6480": 11.904, - "6485": 11.97366, - "6490": 11.9268, - "6495": 11.92497, - "6500": 12.05293, - "6505": 11.83715, - "6510": 11.86732, - "6515": 11.90038, - "6520": 11.86776, - "6525": 11.86971, - "6530": 11.85789, - "6535": 11.88616, - "6540": 11.85825, - "6545": 11.82803, - "6550": 11.89596, - "6555": 11.89246, - "6560": 11.87827, - "6565": 11.87369, - "6570": 11.88103, - "6575": 11.86696, - "6580": 11.90165, - "6585": 11.85113, - "6590": 11.85101, - "6595": 11.80896, - "6600": 11.90596, - "6605": 11.87406, - "6610": 11.8658, - "6615": 11.86475, - "6620": 11.88848, - "6625": 11.85675, - "6630": 11.84722, - "6635": 11.83752, - "6640": 11.8855, - "6645": 11.91332, - "6650": 11.86288, - "6655": 11.89588, - "6660": 11.8071, - "6665": 11.84093, - "6670": 11.88653, - "6675": 11.88047, - "6680": 11.87018, - "6685": 11.8411, - "6690": 11.82244, - "6695": 11.86596, - "6700": 11.85423, - "6705": 11.86228, - "6710": 11.86517, - "6715": 11.87189, - "6720": 11.84138, - "6725": 11.88097, - "6730": 11.90906, - "6735": 11.91578, - "6740": 11.88058, - "6745": 11.88169, - "6750": 12.03575, - "6755": 11.84511, - "6760": 11.84038, - "6765": 11.83499, - "6770": 11.87927, - "6775": 11.81349, - "6780": 13.01048, - "6785": 11.81032, - "6790": 11.93614, - "6795": 11.97801, - "6800": 11.86, - "6805": 11.83039, - "6810": 11.8441, - "6815": 11.89187, - "6820": 11.87841, - "6825": 11.86012, - "6830": 11.83442, - "6835": 11.85081, - "6840": 11.83799, - "6845": 11.82691, - "6850": 11.89092, - "6855": 11.82022, - "6860": 11.8279, - "6865": 11.79814, - "6870": 11.83217, - "6875": 11.90136, - "6880": 11.85295, - "6885": 11.84058, - "6890": 11.84482, - "6895": 11.82768, - "6900": 11.88337, - "6905": 11.84656, - "6910": 11.90272, - "6915": 11.8005, - "6920": 11.93804, - "6925": 12.00166, - "6930": 11.88293, - "6935": 11.9479, - "6940": 11.85228, - "6945": 11.86242, - "6950": 11.83582, - "6955": 11.81523, - "6960": 11.75894, - "6965": 11.81699, - "6970": 11.85282, - 
"6975": 11.84727, - "6980": 11.84729, - "6985": 12.01189, - "6990": 11.86887, - "6995": 11.88713, - "7000": 11.85612, - "7005": 11.86648, - "7010": 11.8888, - "7015": 11.84573, - "7020": 11.77395, - "7025": 11.85096, - "7030": 11.86323, - "7035": 11.84315, - "7040": 11.82293, - "7045": 11.81241, - "7050": 11.85808, - "7055": 11.86593, - "7060": 11.87475, - "7065": 11.90707, - "7070": 11.9358, - "7075": 11.84297, - "7080": 11.80853, - "7085": 11.88178, - "7090": 11.87836, - "7095": 11.85532, - "7100": 11.89414, - "7105": 11.85379, - "7110": 11.89642, - "7115": 11.85858, - "7120": 11.90327, - "7125": 11.89711, - "7130": 11.89177, - "7135": 11.88659, - "7140": 11.85757, - "7145": 11.87756, - "7150": 11.88577, - "7155": 11.86153, - "7160": 11.92297, - "7165": 11.88396, - "7170": 11.85778, - "7175": 11.91483, - "7180": 11.86232, - "7185": 11.87476, - "7190": 11.8982, - "7195": 11.88516, - "7200": 11.88158, - "7205": 11.88444, - "7210": 11.89206, - "7215": 11.87279, - "7220": 11.90742, - "7225": 11.85079, - "7230": 11.8483, - "7235": 11.90312, - "7240": 11.87181, - "7245": 11.91535, - "7250": 11.87908, - "7255": 11.92293, - "7260": 11.84549, - "7265": 11.8901, - "7270": 11.84322, - "7275": 11.848, - "7280": 11.8967, - "7285": 11.89986, - "7290": 11.95382, - "7295": 11.90753, - "7300": 11.86218, - "7305": 11.85436, - "7310": 11.85753, - "7315": 11.9134, - "7320": 11.90034, - "7325": 11.83407, - "7330": 11.85974, - "7335": 11.90032, - "7340": 11.88835, - "7345": 11.88443, - "7350": 11.85147, - "7355": 11.86003, - "7360": 11.88911, - "7365": 11.88721, - "7370": 11.94597, - "7375": 11.88507, - "7380": 11.8675, - "7385": 11.88615, - "7390": 11.85493, - "7395": 11.9078, - "7400": 11.89976, - "7405": 11.94755, - "7410": 11.86216, - "7415": 11.81832, - "7420": 11.89699, - "7425": 11.90201, - "7430": 11.88324, - "7435": 11.84242, - "7440": 11.89387, - "7445": 11.85554, - "7450": 11.927, - "7455": 11.89196, - "7460": 11.93241, - "7465": 11.89671, - "7470": 11.8633, - "7475": 
11.85785, - "7480": 11.86619, - "7485": 11.90047, - "7490": 11.93453, - "7495": 11.89595, - "7500": 11.92255, - "7505": 11.86705, - "7510": 11.86492, - "7515": 11.83778, - "7520": 12.43308, - "7525": 11.94046, - "7530": 12.11911, - "7535": 11.95645, - "7540": 12.01144, - "7545": 11.94459, - "7550": 12.00989, - "7555": 11.95308, - "7560": 12.02894, - "7565": 12.00926, - "7570": 11.88032, - "7575": 11.94986, - "7580": 11.94673, - "7585": 11.92777, - "7590": 11.96311, - "7595": 11.90291, - "7600": 11.96776, - "7605": 11.91009, - "7610": 11.98945, - "7615": 11.943, - "7620": 11.97203, - "7625": 11.87696, - "7630": 11.92313, - "7635": 11.9056, - "7640": 11.89922, - "7645": 11.93063, - "7650": 11.89735, - "7655": 11.93078, - "7660": 11.95494, - "7665": 11.91011, - "7670": 11.97093, - "7675": 11.97514, - "7680": 11.93177, - "7685": 11.8992, - "7690": 11.94571, - "7695": 11.92277, - "7700": 11.94906, - "7705": 11.92727, - "7710": 11.93604, - "7715": 11.92305, - "7720": 11.93766, - "7725": 11.95622, - "7730": 11.90603, - "7735": 11.91132, - "7740": 11.97695, - "7745": 11.96601, - "7750": 11.88967, - "7755": 11.93644, - "7760": 11.96688, - "7765": 11.92672, - "7770": 23.39259, - "7775": 23.06567, - "7780": 11.93112, - "7785": 11.93477, - "7790": 11.94106, - "7795": 11.94556, - "7800": 12.0002, - "7805": 11.97342, - "7810": 11.95163, - "7815": 11.96208, - "7820": 11.96513, - "7825": 11.93368, - "7830": 11.91708, - "7835": 11.89017, - "7840": 11.94549, - "7845": 11.96002, - "7850": 11.95829, - "7855": 11.92186, - "7860": 11.93832, - "7865": 11.889, - "7870": 11.96191, - "7875": 12.05703, - "7880": 11.97288, - "7885": 11.91666, - "7890": 11.93728, - "7895": 11.96047, - "7900": 11.9818, - "7905": 11.92242, - "7910": 11.97684, - "7915": 11.91154, - "7920": 11.96828, - "7925": 11.94506, - "7930": 11.93465, - "7935": 11.90216, - "7940": 11.91383, - "7945": 11.91481, - "7950": 11.96693, - "7955": 11.94446, - "7960": 11.92358, - "7965": 11.94155, - "7970": 11.95822, - "7975": 
12.03469, - "7980": 11.94102, - "7985": 11.94681, - "7990": 11.92459, - "7995": 11.92763, - "8000": 11.96299, - "8005": 11.9788, - "8010": 11.96826, - "8015": 12.02982, - "8020": 11.94329, - "8025": 11.98105, - "8030": 12.01501, - "8035": 11.96502, - "8040": 11.97586, - "8045": 11.96948, - "8050": 11.92611, - "8055": 11.93414, - "8060": 11.93961, - "8065": 11.9262, - "8070": 11.9178, - "8075": 11.90325, - "8080": 11.93833, - "8085": 11.97936, - "8090": 11.99724, - "8095": 11.94796, - "8100": 11.9625, - "8105": 11.94798, - "8110": 11.92353, - "8115": 11.96357, - "8120": 11.92451, - "8125": 11.89352, - "8130": 11.97563, - "8135": 11.97236, - "8140": 11.9723, - "8145": 11.92641, - "8150": 11.89834, - "8155": 11.94876, - "8160": 11.95465, - "8165": 11.95874, - "8170": 11.93402, - "8175": 11.96745, - "8180": 11.91172, - "8185": 11.91331, - "8190": 11.95504, - "8195": 11.94346, - "8200": 11.95192, - "8205": 11.9973, - "8210": 11.95023, - "8215": 12.03521, - "8220": 11.96486, - "8225": 11.95464, - "8230": 11.96151, - "8235": 11.95994, - "8240": 11.97909, - "8245": 11.92928, - "8250": 11.92518, - "8255": 11.94881, - "8260": 11.907, - "8265": 11.93185, - "8270": 11.9211, - "8275": 11.86366, - "8280": 12.00914, - "8285": 11.97086, - "8290": 11.98208, - "8295": 11.92309, - "8300": 11.94129, - "8305": 11.99302, - "8310": 11.97601, - "8315": 11.88862, - "8320": 11.96454, - "8325": 11.89961, - "8330": 11.99534, - "8335": 11.91687, - "8340": 11.96466, - "8345": 11.93152, - "8350": 11.94368, - "8355": 11.92235, - "8360": 11.99578, - "8365": 11.90045, - "8370": 11.91744, - "8375": 11.92667, - "8380": 11.90428, - "8385": 11.94828, - "8390": 11.93507, - "8395": 11.9473, - "8400": 11.94267, - "8405": 11.93414, - "8410": 11.90959, - "8415": 11.92941, - "8420": 11.91201, - "8425": 11.91625, - "8430": 11.9332, - "8435": 11.99456, - "8440": 11.8869, - "8445": 11.90729, - "8450": 11.93362, - "8455": 11.96619, - "8460": 12.01359, - "8465": 11.9429, - "8470": 11.99594, - "8475": 11.95465, - 
"8480": 11.92489, - "8485": 11.92415, - "8490": 11.97388, - "8495": 11.89913, - "8500": 11.95945, - "8505": 11.91567, - "8510": 11.91482, - "8515": 11.93548, - "8520": 11.95743, - "8525": 11.94743, - "8530": 12.42097, - "8535": 11.9272, - "8540": 12.09436, - "8545": 12.04967, - "8550": 11.9651, - "8555": 12.03857, - "8560": 11.97265, - "8565": 11.91082, - "8570": 11.95406, - "8575": 11.94802, - "8580": 11.9942, - "8585": 11.96288, - "8590": 11.95701, - "8595": 11.97786, - "8600": 11.89715, - "8605": 11.93644, - "8610": 11.98611, - "8615": 11.91557, - "8620": 11.92076, - "8625": 11.96113, - "8630": 11.99266, - "8635": 11.93916, - "8640": 12.02781, - "8645": 11.99006, - "8650": 11.91164, - "8655": 11.91924, - "8660": 11.95194, - "8665": 12.00021, - "8670": 11.90972, - "8675": 11.96086, - "8680": 11.95175, - "8685": 11.95495, - "8690": 12.00198, - "8695": 12.07659, - "8700": 11.96371, - "8705": 11.91845, - "8710": 11.97745, - "8715": 11.93805, - "8720": 11.9173, - "8725": 11.91035, - "8730": 12.01393, - "8735": 11.98447, - "8740": 11.97475, - "8745": 11.96291, - "8750": 11.9361, - "8755": 11.96838, - "8760": 11.93695, - "8765": 12.00162, - "8770": 11.92599, - "8775": 12.0012, - "8780": 12.03738, - "8785": 11.94909, - "8790": 11.90577, - "8795": 11.97012, - "8800": 11.93035, - "8805": 11.99893, - "8810": 11.94421, - "8815": 11.98191, - "8820": 11.99062, - "8825": 11.92267, - "8830": 11.95194, - "8835": 11.937, - "8840": 11.97075, - "8845": 11.95007, - "8850": 12.02522, - "8855": 11.94712, - "8860": 11.96728, - "8865": 11.89285, - "8870": 11.94189, - "8875": 11.92065, - "8880": 11.98822, - "8885": 11.98285, - "8890": 11.99582, - "8895": 11.96596, - "8900": 11.94354, - "8905": 11.95473, - "8910": 11.99259, - "8915": 11.96618, - "8920": 11.93587, - "8925": 11.99413, - "8930": 12.00638, - "8935": 11.93, - "8940": 11.95031, - "8945": 11.91928, - "8950": 11.9941, - "8955": 11.94031, - "8960": 11.96914, - "8965": 11.95062, - "8970": 11.95268, - "8975": 12.03161, - "8980": 
11.97245, - "8985": 12.01027, - "8990": 11.9446, - "8995": 11.96843, - "9000": 11.9429, - "9005": 11.94091, - "9010": 11.93667, - "9015": 11.95344, - "9020": 11.93207, - "9025": 11.91998, - "9030": 11.92651, - "9035": 11.97131, - "9040": 11.92008, - "9045": 11.9777, - "9050": 11.93287, - "9055": 11.96682, - "9060": 11.982, - "9065": 11.9763, - "9070": 11.92703, - "9075": 11.95149, - "9080": 11.94863, - "9085": 11.92217, - "9090": 11.92326, - "9095": 11.9586, - "9100": 11.93403, - "9105": 11.97708, - "9110": 11.97248, - "9115": 11.91899, - "9120": 11.98175, - "9125": 12.0043, - "9130": 11.98361, - "9135": 11.95811, - "9140": 11.89116, - "9145": 11.92833, - "9150": 11.96999, - "9155": 11.95682, - "9160": 11.93898, - "9165": 11.98676, - "9170": 11.96776, - "9175": 11.91735, - "9180": 11.96488, - "9185": 11.93801, - "9190": 11.93829, - "9195": 11.96444, - "9200": 11.91924, - "9205": 11.99554, - "9210": 11.91977, - "9215": 11.99739, - "9220": 11.92053, - "9225": 11.93702, - "9230": 11.95815, - "9235": 12.05346, - "9240": 11.9596, - "9245": 11.97173, - "9250": 11.94092, - "9255": 11.94632, - "9260": 12.00354, - "9265": 11.96854, - "9270": 11.91621, - "9275": 11.94709, - "9280": 11.93375, - "9285": 11.92465, - "9290": 11.93047, - "9295": 11.93184, - "9300": 11.95538, - "9305": 11.96102, - "9310": 11.93874, - "9315": 11.94123, - "9320": 11.95854, - "9325": 11.98961, - "9330": 11.87394, - "9335": 11.97986, - "9340": 12.02583, - "9345": 11.94202, - "9350": 12.00113, - "9355": 11.97405, - "9360": 11.96746, - "9365": 11.96018, - "9370": 11.9475, - "9375": 11.94327, - "9380": 11.92135, - "9385": 12.01574, - "9390": 11.95494, - "9395": 11.93529, - "9400": 11.96463, - "9405": 11.9807, - "9410": 11.92926, - "9415": 11.95919, - "9420": 11.94796, - "9425": 11.94261, - "9430": 11.94968, - "9435": 11.9655, - "9440": 11.94016, - "9445": 11.98541, - "9450": 11.94602, - "9455": 11.96365, - "9460": 11.9884, - "9465": 11.93962, - "9470": 11.93471, - "9475": 11.91073, - "9480": 11.92557, - 
"9485": 11.93537, - "9490": 11.97267, - "9495": 11.93521, - "9500": 11.92542, - "9505": 12.00627, - "9510": 11.9749, - "9515": 11.97511, - "9520": 11.88493, - "9525": 11.91739, - "9530": 11.92418, - "9535": 11.97024 - } - } -} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etc1cp1_gb_200_release_sm/model_config.yml b/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_gb_200_release_sm/model_config.yaml similarity index 100% rename from tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etc1cp1_gb_200_release_sm/model_config.yml rename to tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_gb_200_release_sm/model_config.yaml diff --git a/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etc1cp1_release_sm/model_config.yml b/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_release_sm/model_config.yaml similarity index 100% rename from tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etc1cp1_release_sm/model_config.yml rename to tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_release_sm/model_config.yaml From 2fadde8ac34de3f046f68af9d761249d7d38c74f Mon Sep 17 00:00:00 2001 From: Jimmy Zhang <133159885+jiemingz@users.noreply.github.com> Date: Sun, 1 Feb 2026 03:32:53 -0500 Subject: [PATCH 018/231] Fix missing argument in MoELayer.forward() (#3133) Signed-off-by: Jimmy Zhang --- megatron/core/transformer/moe/moe_layer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/core/transformer/moe/moe_layer.py b/megatron/core/transformer/moe/moe_layer.py index 990d13b98d9..5cfea1e8ae4 100644 --- a/megatron/core/transformer/moe/moe_layer.py +++ b/megatron/core/transformer/moe/moe_layer.py @@ -386,7 +386,7 @@ def forward( padding_mask = padding_mask.transpose(0, 1).bool() # MoE 
forward: route -> dispatch -> compute -> combine - def custom_forward(hidden_states, intermediate_tensors, padding_mask=None): + def custom_forward(hidden_states, intermediate_tensors=None, padding_mask=None): try: if "route" in self.fwd_execution_map: shared_expert_output = self.shared_experts_compute(hidden_states) From ae6707622e94c5083f72c87fd050b69a3a776618 Mon Sep 17 00:00:00 2001 From: tgkyrie <74066353+tgkyrie@users.noreply.github.com> Date: Sun, 1 Feb 2026 21:53:42 +0800 Subject: [PATCH 019/231] Fix H2D stream synchronization in optimizer offload (#3140) Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: Xin Yao --- .../cpu_offloading/hybrid_optimizer.py | 2 +- .../test_optimizer_cpu_offloading.py | 114 ++++++++++++++++++ 2 files changed, 115 insertions(+), 1 deletion(-) diff --git a/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py b/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py index 28487c3b367..c87ccd5ff31 100644 --- a/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +++ b/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py @@ -122,7 +122,7 @@ def param_copy_back_gpu_hook(optimizer, args, kwargs): for param in _param_generator(optimizer): gpu_param = self.cpu_copys_map_gpu_param[param] gpu_param.data.copy_(param.data, non_blocking=True) - self._d2h_stream.record_event().wait(torch.cuda.current_stream()) + self._h2d_stream.record_event().wait(torch.cuda.current_stream()) return param_copy_back_gpu_hook diff --git a/tests/unit_tests/test_optimizer_cpu_offloading.py b/tests/unit_tests/test_optimizer_cpu_offloading.py index 1c367100dab..33febbb3eb0 100644 --- a/tests/unit_tests/test_optimizer_cpu_offloading.py +++ b/tests/unit_tests/test_optimizer_cpu_offloading.py @@ -39,6 +39,28 @@ def forward(self, x): return x +class BigNet(nn.Module): + def __init__(self): + super().__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 
16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 2048) + self.fc2 = nn.Linear(2048, 8192) + self.fc3 = nn.Linear(8192, 2048) + self.fc4 = nn.Linear(2048, 100) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = torch.flatten(x, 1) # flatten all dimensions except batch + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = F.relu(self.fc3(x)) + x = self.fc4(x) + return x + + def setup_seed(seed): random.seed(seed) # Set Python's built-in random seed np.random.seed(seed) # Set NumPy's random seed @@ -139,3 +161,95 @@ def test_multi_device_hybrid_optimizer( assert torch.allclose( v, ref_params[k], atol=1e-03 ), f"Weight {k} value mismatch, max error: {(v - ref_params[k]).abs().max()}" + + +@pytest.mark.skipif( + torch.__version__ < '2.3.0', + reason=( + "Requires PyTorch 2.3.0 or higher, lower versions of pytorch have " + "misaligned optimizer accuracy for CPU and GPU." + ), +) +@pytest.mark.parametrize('n_steps', [1, 10]) +@pytest.mark.parametrize('offload_fraction', [1, 0.5, 0]) +@pytest.mark.parametrize('optimizer', ['adam', 'sgd']) +@pytest.mark.parametrize('with_param_groups', [False, True]) +def test_overlap_cpu_optimizer_d2h_h2d_sync_correctness( + with_param_groups, optimizer, offload_fraction, n_steps +): + setup_seed(42) + net1 = BigNet().cuda() + net2 = BigNet().cuda() + net2.load_state_dict(net1.state_dict()) + base_lr = 1e-3 + params = list(net1.parameters()) + ref_params = list(net2.parameters()) + if with_param_groups: + param_groups = [ + {"params": params[: len(params) // 2], "wd_mult": 1.0, "lr_mult": 1e-4}, + {"params": params[len(params) // 2 :], "wd_mult": 0.0, "lr_mult": 2e-4}, + ] + params = param_groups + ref_param_groups = [ + {"params": ref_params[: len(ref_params) // 2], "wd_mult": 1.0, "lr_mult": 1e-4}, + {"params": ref_params[len(ref_params) // 2 :], "wd_mult": 0.0, "lr_mult": 2e-4}, + ] + ref_params = ref_param_groups + + if optimizer == 'adam': + cls_kwargs = 
dict(cpu_optimizer_cls=Adam, gpu_optimizer_cls=GPUAdam) + else: + cls_kwargs = dict(cpu_optimizer_cls=SGD, gpu_optimizer_cls=GPUSGD) + + hdo = HybridDeviceOptimizer( + params, + offload_fraction=offload_fraction, + lr=base_lr, + overlap_cpu_optimizer_d2h_h2d=True, + **cls_kwargs, + ) + + ref_optimizer = cls_kwargs['gpu_optimizer_cls'](ref_params, lr=base_lr) + + # 1. run step on optimizer, make sure there is state generated + assert len(hdo.state_dict()["state"]) == 0 # state is empty + input = torch.randn(1, 3, 32, 32).cuda() + output = net1(input) + output.sum().backward() + hdo.step() + output = net2(input) + output.sum().backward() + ref_optimizer.step() + # PyTorch SGD will not generate state + if optimizer != 'sgd': + assert len(hdo.state_dict()["state"]) != 0 + + # 2. check the state is on right device + if optimizer == 'adam': + first_param_id = hdo.state_dict()["param_groups"][0]["params"][0] + last_param_id = hdo.state_dict()["param_groups"][-1]["params"][-1] + if offload_fraction > 0: + assert not hdo.state_dict()["state"][first_param_id]["exp_avg"].is_cuda + if offload_fraction < 1: + assert hdo.state_dict()["state"][last_param_id]["exp_avg"].is_cuda + + inputs = [torch.randn(1, 3, 32, 32).cuda() for _ in range(1, n_steps)] + for i in range(1, n_steps): + output = net1(inputs[i - 1]) + output.sum().backward() + hdo.step() + + for i in range(1, n_steps): + output = net2(inputs[i - 1]) + output.sum().backward() + ref_optimizer.step() + + params = net1.state_dict() + ref_params = net2.state_dict() + for k, v in params.items(): + assert (v.isnan() == ref_params[k].isnan()).all() + torch.nan_to_num_(v, 0) + torch.nan_to_num_(ref_params[k], 0) + assert torch.allclose( + v, ref_params[k], atol=1e-03 + ), f"Weight {k} value mismatch, max error: {(v - ref_params[k]).abs().max()}" From 300d1b6550b46b2ce572f78e1c45f5ac2acb7d7f Mon Sep 17 00:00:00 2001 From: rkarimimahab Date: Sun, 1 Feb 2026 22:48:22 +0100 Subject: [PATCH 020/231] Add MTP support for hybrid models 
(#2363) Co-authored-by: Rabeeh Mahabadi Co-authored-by: Sanjeev Satheesh Co-authored-by: Deepak Narayanan --- mamba_builders.py | 8 +- .../common/language_module/language_module.py | 30 +- .../common/model_chunk_schedule_plan.py | 2 +- .../core/models/gpt/fine_grained_callables.py | 4 +- megatron/core/models/gpt/gpt_layer_specs.py | 2 +- megatron/core/models/gpt/gpt_model.py | 112 ++----- .../core/models/mamba/mamba_layer_specs.py | 33 ++ megatron/core/models/mamba/mamba_model.py | 91 ++++- megatron/core/pipeline_parallel/schedules.py | 5 +- megatron/core/ssm/mamba_block.py | 28 +- .../core/ssm/mamba_hybrid_layer_allocation.py | 149 ++++++++- megatron/core/transformer/moe/moe_layer.py | 12 +- megatron/core/transformer/moe/router.py | 51 ++- .../transformer/multi_token_prediction.py | 314 +++++++++++++++--- .../core/transformer/transformer_config.py | 9 + .../core/transformer/transformer_layer.py | 18 +- megatron/training/arguments.py | 73 ++++ megatron/training/checkpointing.py | 6 + megatron/training/training.py | 23 +- pretrain_mamba.py | 1 + .../unit_tests/models/test_mamba_moe_model.py | 2 + .../ssm/test_mamba_hybrid_layer_allocation.py | 139 +++++++- .../test_multi_token_prediction.py | 263 ++++++++++++++- 23 files changed, 1170 insertions(+), 205 deletions(-) diff --git a/mamba_builders.py b/mamba_builders.py index 6a792ba6ea5..5d31af60475 100644 --- a/mamba_builders.py +++ b/mamba_builders.py @@ -8,6 +8,7 @@ from megatron.training.arguments import core_transformer_config_from_args from megatron.core.models.mamba.mamba_layer_specs import mamba_inference_stack_spec + def mamba_builder(args, pre_process, post_process, vp_stage=None, config=None, pg_collection=None): print_rank_0('building MAMBA model ...') if config is None: @@ -15,8 +16,10 @@ def mamba_builder(args, pre_process, post_process, vp_stage=None, config=None, p assert args.use_legacy_models is False, "Mamba only supported in Mcore!" 
if config.transformer_impl == "inference_optimized": - mamba_stack_spec = mamba_inference_stack_spec - assert not config.inference_fuse_tp_communication, "inference_fuse_tp_communication is not supported for Mamba" + mamba_stack_spec = mamba_inference_stack_spec + assert ( + not config.inference_fuse_tp_communication + ), "inference_fuse_tp_communication is not supported for Mamba" elif args.spec is not None: mamba_stack_spec = import_module(args.spec) else: @@ -39,6 +42,7 @@ def mamba_builder(args, pre_process, post_process, vp_stage=None, config=None, p rotary_percent=args.rotary_percent, rotary_base=args.rotary_base, pg_collection=pg_collection, + vp_stage=vp_stage, ) for l in range(model.decoder.num_layers_per_pipeline_rank): diff --git a/megatron/core/models/common/language_module/language_module.py b/megatron/core/models/common/language_module/language_module.py index b0fa6126b63..57975b2958b 100644 --- a/megatron/core/models/common/language_module/language_module.py +++ b/megatron/core/models/common/language_module/language_module.py @@ -23,6 +23,7 @@ from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.transformer.enums import AttnBackend, CudaGraphScope from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.multi_token_prediction import tie_word_embeddings_state_dict from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.utils import ensure_metadata_has_dp_cp_group from megatron.core.utils import ( @@ -255,12 +256,20 @@ def setup_embeddings_and_output_layer(self) -> None: LanguageModule.embedding_warning_printed = True def shared_embedding_or_output_weight(self) -> Tensor: - """Gets the emedding weight or output logit weights when share embedding and output weights set to True. + """Gets the embedding weight or output logit weights when share embedding and output weights set to True + or when use Multi-Token Prediction (MTP). 
Returns: - Tensor: During pre processing it returns the input embeddings weight while during post processing it returns the final output layers weight + Tensor: During pre processing or MTP process it returns the input embeddings weight while during post processing it returns the final output layers weight """ - if self.pre_process: + if self.pre_process or getattr(self, 'mtp_process', False): + # Multi-Token Prediction (MTP) need both embedding layer and output layer. + # So there will be both embedding layer and output layer in the mtp process stage. + # When share_embeddings_and_output_weights is True, the embedding weight is the + # canonical shared weight and is passed to the output layer during forward. + assert hasattr( + self, 'embedding' + ), f"embedding is needed in this pipeline stage, but it is not initialized." return self.embedding.word_embeddings.weight elif self.post_process: return self.output_layer.weight @@ -293,6 +302,21 @@ def sharded_state_dict( output_layer_weight_key = f'{prefix}output_layer.weight' output_layer_bias_key = f'{prefix}output_layer.bias' + # Multi-Token Prediction (MTP) needs embedding layer in mtp process stage. + # If MTP is not placed in the pre processing stage, we need to maintain a copy of + # embedding layer in the mtp process stage and tie it to the embedding in the pre + # processing stage. + # Note: MTP loss is computed at post_process stage, so the output_layer on mtp_process + # rank doesn't need special tying - it's not used for loss computation. 
+ if getattr(self, 'mtp_process', False) and not self.pre_process: + emb_weight = self.embedding.word_embeddings.weight + tie_word_embeddings_state_dict( + sharded_state_dict, + emb_weight, + first_stage_word_emb_key, + tp_group=self.tp_group, + dp_cp_group=metadata['dp_cp_group'], + ) if self.share_embeddings_and_output_weights: self.tie_embeddings_and_output_weights_state_dict( sharded_state_dict, output_layer_weight_key, first_stage_word_emb_key, metadata diff --git a/megatron/core/models/common/model_chunk_schedule_plan.py b/megatron/core/models/common/model_chunk_schedule_plan.py index 033e8e808f9..3b0e3a13b76 100644 --- a/megatron/core/models/common/model_chunk_schedule_plan.py +++ b/megatron/core/models/common/model_chunk_schedule_plan.py @@ -123,7 +123,7 @@ def _build_callable_nodes(self, event, comp_stream, comm_stream, extra_args): # get flags for latter use is_mtp = isinstance(self.layer, MultiTokenPredictionLayer) is_moe = ( - isinstance(self.layer.transformer_layer.mlp, MoELayer) + isinstance(self.layer.mtp_model_layer.mlp, MoELayer) if is_mtp else isinstance(self.layer.mlp, MoELayer) ) diff --git a/megatron/core/models/gpt/fine_grained_callables.py b/megatron/core/models/gpt/fine_grained_callables.py index 7cee9d2973c..e17ed0a5d40 100644 --- a/megatron/core/models/gpt/fine_grained_callables.py +++ b/megatron/core/models/gpt/fine_grained_callables.py @@ -613,9 +613,9 @@ def build_mtp_layer_callables(layer): multi-token prediction layer nodes (attention, MLP, etc.) """ - forward_funcs, backward_dw = build_transformer_layer_callables(layer.transformer_layer) + forward_funcs, backward_dw = build_transformer_layer_callables(layer.mtp_model_layer) attn_forward, dispatch_forward, mlp_forward, combine_forward, _ = forward_funcs - is_moe = isinstance(layer.transformer_layer.mlp, MoELayer) + is_moe = isinstance(layer.mtp_model_layer.mlp, MoELayer) assert is_moe, "MTP layer in a2a overlap only supports MoE layer for now." 
def submodule_mtp_attn_forward(node, hidden_states): diff --git a/megatron/core/models/gpt/gpt_layer_specs.py b/megatron/core/models/gpt/gpt_layer_specs.py index 49501ee54eb..bebb4350d27 100755 --- a/megatron/core/models/gpt/gpt_layer_specs.py +++ b/megatron/core/models/gpt/gpt_layer_specs.py @@ -704,7 +704,7 @@ def get_gpt_mtp_block_spec_for_backend( raise ValueError(f"Invalid spec: {spec}") mtp_layer_spec = get_mtp_layer_spec_for_backend( - transformer_layer_spec=transformer_layer_spec, backend=backend + mtp_model_layer_spec=transformer_layer_spec, backend=backend ) mtp_num_layers = config.mtp_num_layers if config.mtp_num_layers else 0 mtp_layer_specs = [mtp_layer_spec] * mtp_num_layers diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index e287344c13d..4b96465a31e 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -6,7 +6,7 @@ import torch from torch import Tensor -from megatron.core import parallel_state, tensor_parallel +from megatron.core import tensor_parallel from megatron.core.config_logger import has_config_logger_enabled, log_config_to_disk from megatron.core.dist_checkpointing.mapping import ShardedStateDict from megatron.core.inference.contexts import BaseInferenceContext @@ -26,11 +26,9 @@ from megatron.core.tensor_parallel import gather_from_sequence_parallel_region from megatron.core.transformer.enums import CudaGraphScope, ModelType from megatron.core.transformer.multi_token_prediction import ( - MTPLossAutoScaler, - MTPLossLoggingHelper, MultiTokenPredictionBlock, - roll_tensor, - tie_word_embeddings_state_dict, + mtp_on_this_rank, + process_mtp_loss, ) from megatron.core.transformer.spec_utils import ModuleSpec from megatron.core.transformer.transformer_block import TransformerBlock @@ -144,7 +142,9 @@ def __init__( self.rotary_base = rotary_base self.rotary_scaling = rope_scaling self.mtp_block_spec = mtp_block_spec - self.mtp_process = mtp_block_spec is not 
None + self.mtp_process = mtp_block_spec is not None and mtp_on_this_rank( + self.config, ignore_virtual=False, vp_stage=vp_stage + ) if self.pre_process or self.mtp_process: self.embedding = LanguageModelEmbedding( @@ -609,56 +609,19 @@ def _postprocess( return hidden_states if self.config.mtp_num_layers is not None: - mtp_labels = labels.clone() - hidden_states_list = torch.chunk(hidden_states, 1 + self.config.mtp_num_layers, dim=0) - hidden_states = hidden_states_list[0] - if loss_mask is None: - # if loss_mask is not provided, use all ones as loss_mask - loss_mask = torch.ones_like(mtp_labels) - for mtp_layer_number in range(self.config.mtp_num_layers): - # output - mtp_logits, _ = self.output_layer( - hidden_states_list[mtp_layer_number + 1], - weight=output_weight, - runtime_gather_output=runtime_gather_output, - ) - # Calc loss for the current Multi-Token Prediction (MTP) layers. - mtp_labels, _ = roll_tensor( - mtp_labels, - shifts=-1, - dims=-1, - cp_group=self.cp_group, - packed_seq_params=packed_seq_params, - ) - loss_mask, num_tokens = roll_tensor( - loss_mask, - shifts=-1, - dims=-1, - cp_group=self.cp_group, - packed_seq_params=packed_seq_params, - ) - mtp_loss = self.compute_language_model_loss(mtp_labels, mtp_logits) - mtp_loss = loss_mask * mtp_loss - if self.training: - # TODO(shifangx): remove the use of parallel_state here - # after moving loss logging to loss_func in pretrain_gpt.py - MTPLossLoggingHelper.save_loss_to_tracker( - torch.sum(mtp_loss) / num_tokens, - mtp_layer_number, - self.config.mtp_num_layers, - avg_group=parallel_state.get_data_parallel_group( - with_context_parallel=True - ), - ) - mtp_loss_scale = self.config.mtp_loss_scaling_factor / self.config.mtp_num_layers - if self.config.calculate_per_token_loss: - hidden_states = MTPLossAutoScaler.apply( - hidden_states, mtp_loss_scale * mtp_loss - ) - else: - hidden_states = MTPLossAutoScaler.apply( - hidden_states, mtp_loss_scale * mtp_loss / num_tokens - ) + hidden_states = 
process_mtp_loss( + hidden_states=hidden_states, + labels=labels, + loss_mask=loss_mask, + output_layer=self.output_layer, + output_weight=output_weight, + runtime_gather_output=runtime_gather_output, + is_training=self.training, + compute_language_model_loss=self.compute_language_model_loss, + config=self.config, + cp_group=self.pg_collection.cp, + packed_seq_params=packed_seq_params, + ) sequence_parallel_override = False if in_inference_mode and inference_context.materialize_only_last_token_logits: @@ -715,27 +678,6 @@ def _postprocess( return loss - def shared_embedding_or_output_weight(self) -> Tensor: - """Gets the embedding weight or output logit weights when share input embedding and - output weights set to True or when use Multi-Token Prediction (MTP) feature. - - Returns: - Tensor: During pre processing or MTP process it returns the input embeddings weight. - Otherwise, during post processing it returns the final output layers weight. - """ - if self.pre_process or self.mtp_process: - # Multi-Token Prediction (MTP) need both embedding layer and output layer. - # So there will be both embedding layer and output layer in the mtp process stage. - # In this case, if share_embeddings_and_output_weights is True, the shared weights - # will be stored in embedding layer, and output layer will not have any weight. - assert hasattr( - self, 'embedding' - ), f"embedding is needed in this pipeline stage, but it is not initialized." - return self.embedding.word_embeddings.weight - elif self.post_process: - return self.output_layer.weight - return None - def build_schedule_plan( self, input_ids: Tensor, @@ -826,20 +768,4 @@ def sharded_state_dict( output_extra_state and output_extra_state.data ), f'Expected output layer extra state to be empty, got: {output_extra_state}' - # Multi-Token Prediction (MTP) need embedding layer in mtp process stage. 
- # If MTP is not placed in the pre processing stage, we need to maintain a copy of - # embedding layer in the mtp process stage and tie it to the embedding in the pre - # processing stage. - # Now MTP loss is computed in post processing stage, so the output_layer is not needed. - if self.mtp_process and not self.pre_process: - emb_weight_key = f'{prefix}embedding.word_embeddings.weight' - emb_weight = self.embedding.word_embeddings.weight - tie_word_embeddings_state_dict( - sharded_state_dict, - emb_weight, - emb_weight_key, - tp_group=self.tp_group, - dp_cp_group=metadata['dp_cp_group'], - ) - return sharded_state_dict diff --git a/megatron/core/models/mamba/mamba_layer_specs.py b/megatron/core/models/mamba/mamba_layer_specs.py index b87124bab1d..6ca628475be 100755 --- a/megatron/core/models/mamba/mamba_layer_specs.py +++ b/megatron/core/models/mamba/mamba_layer_specs.py @@ -1,6 +1,7 @@ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. from megatron.core.extensions.transformer_engine import ( + TEColumnParallelLinear, TEDotProductAttention, TELayerNormColumnParallelLinear, TENorm, @@ -19,6 +20,12 @@ from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.mlp import MLP, MLPSubmodules +from megatron.core.transformer.multi_token_prediction import ( + MultiTokenPredictionBlock, + MultiTokenPredictionBlockSubmodules, + MultiTokenPredictionLayer, + MultiTokenPredictionLayerSubmodules, +) from megatron.core.transformer.spec_utils import ModuleSpec from megatron.core.transformer.transformer_layer import ( MoETransformerLayer, @@ -26,6 +33,7 @@ TransformerLayerSubmodules, ) +# This should be private and should not be used outside of this file. moe = get_moe_module_spec( use_te=True, num_experts=8, # Can be any positive integer (must not be None). 
@@ -33,6 +41,28 @@ moe_use_legacy_grouped_gemm=False, ) + +# MTP block spec for Mamba - provides norms and projection only. +# Inner layers are built by MultiTokenPredictionLayer using nested MambaStack +_mamba_mtp_block_spec = ModuleSpec( + module=MultiTokenPredictionBlock, + submodules=MultiTokenPredictionBlockSubmodules( + layer_specs=[ + ModuleSpec( + module=MultiTokenPredictionLayer, + submodules=MultiTokenPredictionLayerSubmodules( + enorm=TENorm, + hnorm=TENorm, + eh_proj=TEColumnParallelLinear, + mtp_model_layer=None, # Built via pattern + mamba_submodules + layer_norm=TENorm, + ), + ) + ] + ), +) + + mamba_stack_spec = ModuleSpec( module=MambaStack, submodules=MambaStackSubmodules( @@ -87,9 +117,11 @@ pre_mlp_layernorm=TENorm, mlp=moe, mlp_bda=get_bias_dropout_add ), ), + mtp_block_spec=_mamba_mtp_block_spec, ), ) + mamba_inference_stack_spec = ModuleSpec( module=MambaStack, submodules=MambaStackSubmodules( @@ -147,5 +179,6 @@ pre_mlp_layernorm=TENorm, mlp=moe, mlp_bda=get_bias_dropout_add ), ), + mtp_block_spec=_mamba_mtp_block_spec, ), ) diff --git a/megatron/core/models/mamba/mamba_model.py b/megatron/core/models/mamba/mamba_model.py index 8d45e1d0147..0a783391437 100644 --- a/megatron/core/models/mamba/mamba_model.py +++ b/megatron/core/models/mamba/mamba_model.py @@ -16,6 +16,11 @@ from megatron.core.tensor_parallel import gather_from_sequence_parallel_region from megatron.core.transformer import TransformerConfig from megatron.core.transformer.enums import ModelType +from megatron.core.transformer.multi_token_prediction import ( + MultiTokenPredictionBlock, + mtp_on_this_rank, + process_mtp_loss, +) from megatron.core.transformer.spec_utils import ModuleSpec, build_module from megatron.core.utils import ( WrappedTensor, @@ -38,7 +43,11 @@ class MambaModel(LanguageModule): hybrid_attention_ratio (float, optional): The target ratio of attention layers to total layers hybrid_mlp_ratio (float, optional): The target ratio of mlp layers to total layers - 
hybrid_override_pattern (str, optional): The hybrid layer pattern to override with + hybrid_override_pattern (str, optional): Unified hybrid layer pattern with optional MTP. + Format: "<main_pattern>/<mtp_pattern>/<mtp_pattern>/..." + Examples: + - "M*M*" -> main decoder only, no MTP + - "M*M*/MM/MM" -> main="M*M*", mtp="MM", 2 depths post_process (bool, optional): Include an output layer (used with pipeline parallelism). Defaults to True. fp16_lm_cross_entropy (bool, optional): Defaults to False. @@ -79,6 +88,7 @@ def __init__( scatter_embedding_sequence_parallel: bool = True, seq_len_interpolation_factor: Optional[float] = None, pg_collection: Optional[ProcessGroupCollection] = None, + vp_stage: Optional[int] = None, ) -> None: super().__init__(config=config, pg_collection=pg_collection) @@ -97,12 +107,27 @@ def __init__( self.parallel_output = parallel_output self.share_embeddings_and_output_weights = share_embeddings_and_output_weights self.position_embedding_type = position_embedding_type + self.vp_stage = vp_stage + + # Parse unified pattern to extract main and MTP components + from megatron.core.ssm.mamba_hybrid_layer_allocation import parse_hybrid_pattern + + parsed = parse_hybrid_pattern(hybrid_override_pattern) + self.mtp_pattern = parsed.mtp_pattern + self.mtp_num_depths = parsed.mtp_num_depths + + # Determine if MTP is needed (based on pattern parsing) + self.mtp_process = ( + self.mtp_pattern is not None + and self.mtp_num_depths > 0 + and mtp_on_this_rank(self.config, vp_stage=self.vp_stage) + ) # megatron core pipelining currently depends on model type # TODO: remove this dependency ?
self.model_type = ModelType.encoder_or_decoder - if self.pre_process: + if self.pre_process or self.mtp_process: self.embedding = LanguageModelEmbedding( config=self.config, vocab_size=self.vocab_size, @@ -128,14 +153,33 @@ def __init__( pre_process=self.pre_process, hybrid_attention_ratio=self.hybrid_attention_ratio, hybrid_mlp_ratio=self.hybrid_mlp_ratio, - hybrid_override_pattern=self.hybrid_override_pattern, + hybrid_override_pattern=parsed.main_pattern, post_process=self.post_process, dtype=config.params_dtype, pg_collection=self.pg_collection, ) + # MTP block - uses mtp_block_spec from mamba_stack_spec.submodules + if self.mtp_process: + mamba_submodules = mamba_stack_spec.submodules + mtp_block_spec = mamba_submodules.mtp_block_spec + assert mtp_block_spec is not None, ( + "MTP pattern specified but mtp_block_spec is None in mamba_stack_spec.submodules. " + "Ensure mamba_stack_spec includes mtp_block_spec for MTP support." + ) + + self.mtp = MultiTokenPredictionBlock( + config=self.config, + spec=mtp_block_spec, + pg_collection=self.pg_collection, + vp_stage=self.vp_stage, + mtp_layer_pattern=self.mtp_pattern, + mtp_num_depths=self.mtp_num_depths, + mamba_submodules=mamba_submodules, + ) + # Output - if post_process: + if post_process or self.mtp_process: self.output_layer = tensor_parallel.ColumnParallelLinear( config.hidden_size, self.vocab_size, @@ -149,7 +193,7 @@ def __init__( tp_group=self.pg_collection.tp, ) - if self.pre_process or self.post_process: + if self.pre_process or self.post_process or self.mtp_process: self.setup_embeddings_and_output_layer() for name, module in self.named_modules(): @@ -184,6 +228,7 @@ def forward( runtime_gather_output: Optional[bool] = None, *, inference_params: Optional[BaseInferenceContext] = None, + loss_mask: Optional[Tensor] = None, packed_seq_params: Optional[PackedSeqParams] = None, padding_mask: Optional[Tensor] = None, ) -> Tensor: @@ -258,14 +303,40 @@ def forward( padding_mask=padding_mask, ) - if not 
self.post_process: - return hidden_states - - # logits and loss output_weight = None if self.share_embeddings_and_output_weights: output_weight = self.shared_embedding_or_output_weight() + if self.mtp_process: + hidden_states = self.mtp( + input_ids=input_ids, + position_ids=position_ids, + hidden_states=hidden_states, + attention_mask=attention_mask, + inference_params=inference_params, + rotary_pos_emb=rotary_pos_emb, + packed_seq_params=packed_seq_params, + embedding=self.embedding, + ) + + if not self.post_process: + return hidden_states + + if self.config.mtp_num_layers is not None: + hidden_states = process_mtp_loss( + hidden_states=hidden_states, + labels=labels, + loss_mask=loss_mask, + output_layer=self.output_layer, + output_weight=output_weight, + runtime_gather_output=runtime_gather_output, + is_training=self.training, + compute_language_model_loss=self.compute_language_model_loss, + config=self.config, + cp_group=self.pg_collection.cp, + packed_seq_params=packed_seq_params, + ) + sequence_parallel_override = False if in_inference_mode and inference_context.materialize_only_last_token_logits: if inference_context.is_static_batching(): @@ -281,7 +352,7 @@ def forward( self.output_layer.sequence_parallel = False sequence_parallel_override = True - # Reshape [B, 1, H] to [1, B, H] → extract each sample’s true last‐token hidden + # Reshape [B, 1, H] to [1, B, H] → extract each sample's true last‐token hidden # state ([B, H]) → unsqueeze back to [B, 1, H] # (so that the output layer, which expects S×B×H, receives only the final token) hidden_states = inference_context.last_token_logits( diff --git a/megatron/core/pipeline_parallel/schedules.py b/megatron/core/pipeline_parallel/schedules.py index edca62be375..15c5adfc7a2 100644 --- a/megatron/core/pipeline_parallel/schedules.py +++ b/megatron/core/pipeline_parallel/schedules.py @@ -212,7 +212,10 @@ def set_current_microbatch(model, microbatch_id): layer.current_microbatch = microbatch_id if 
hasattr(model_with_decoder, 'mtp'): for layer in model_with_decoder.mtp.layers: - layer.transformer_layer.current_microbatch = microbatch_id + assert hasattr( + layer, 'mtp_model_layer' + ), f"MTP layer {layer} must have 'mtp_model_layer' attribute" + layer.mtp_model_layer.current_microbatch = microbatch_id def forward_step_calc_loss( diff --git a/megatron/core/ssm/mamba_block.py b/megatron/core/ssm/mamba_block.py index ef41faae143..ffb7b8f6fdb 100644 --- a/megatron/core/ssm/mamba_block.py +++ b/megatron/core/ssm/mamba_block.py @@ -42,6 +42,7 @@ class MambaStackSubmodules: attention_layer: Union[ModuleSpec, type] = IdentityOp mlp_layer: Union[ModuleSpec, type] = IdentityOp moe_layer: Union[ModuleSpec, type] = IdentityOp + mtp_block_spec: Optional[ModuleSpec] = None class MambaStack(MegatronModule): @@ -85,12 +86,14 @@ def __init__( device=None, dtype=None, pg_collection: ProcessGroupCollection = None, + is_mtp_layer: bool = False, ) -> None: super().__init__(config=config) self.residual_in_fp32 = residual_in_fp32 self.pre_process = pre_process self.post_layer_norm = post_layer_norm self.post_process = post_process + self.is_mtp_layer = is_mtp_layer assert pg_collection is not None, "pg_collection must be provided for MambaStack" @@ -103,20 +106,32 @@ def __init__( self.hybrid_attention_ratio = hybrid_attention_ratio self.hybrid_mlp_ratio = hybrid_mlp_ratio self.hybrid_override_pattern = hybrid_override_pattern + self.pg_collection = pg_collection + + # For MTP layers, always use pattern length (config.num_layers is for main decoder) + if self.is_mtp_layer: + num_layers_for_allocation = len(self.hybrid_override_pattern) + else: + num_layers_for_allocation = ( + self.config.num_layers + if self.config.num_layers is not None + else len(self.hybrid_override_pattern) + ) self.layer_type_list = allocate_layers( - self.config.num_layers, + num_layers_for_allocation, self.hybrid_attention_ratio, self.hybrid_mlp_ratio, self.hybrid_override_pattern, + 
silent=self.is_mtp_layer, ) pp_layer_offset = 0 - if self.pp_group.size() > 1: + if self.pp_group.size() > 1 and not self.is_mtp_layer: pp_layer_offset, self.layer_type_list = self._select_layers_for_pipeline_parallel( self.layer_type_list ) - + # Build main decoder layers using shared layer builder self.layers = nn.ModuleList() for i, layer_type in enumerate(self.layer_type_list): fp8_init_context = get_fp8_context(self.config, i + pp_layer_offset, is_init=True) @@ -137,9 +152,10 @@ def __init__( config=self.config, layer_number=i + 1, pg_collection=pg_collection, + is_mtp_layer=is_mtp_layer, ) elif layer_type == LayerSymbols.MLP: - # Transformer layers apply their own pp_layer_offset + # MLP layers apply their own pp_layer_offset layer = build_module( submodules.mlp_layer, config=self.config, @@ -147,7 +163,7 @@ def __init__( pg_collection=pg_collection, ) elif layer_type == LayerSymbols.MOE: - # Transformer layers apply their own pp_layer_offset + # MoE layers apply their own pp_layer_offset layer = build_module( submodules.moe_layer, config=self.config, @@ -316,7 +332,7 @@ def forward( # Ensure that the tensor passed between pipeline parallel stages is # viewless. See related notes in TransformerBlock and TransformerLayer - output = make_viewless_tensor( + hidden_states = make_viewless_tensor( inp=hidden_states, requires_grad=hidden_states.requires_grad, keep_graph=True ) diff --git a/megatron/core/ssm/mamba_hybrid_layer_allocation.py b/megatron/core/ssm/mamba_hybrid_layer_allocation.py index fe997e2249a..d7002b2915d 100644 --- a/megatron/core/ssm/mamba_hybrid_layer_allocation.py +++ b/megatron/core/ssm/mamba_hybrid_layer_allocation.py @@ -1,7 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
import logging -from typing import Dict, List, Tuple +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple if __name__ != "__main__": from megatron.core.utils import log_single_rank @@ -29,9 +30,129 @@ class Symbols: ATTENTION = "*" MLP = "-" MOE = 'E' + MTP_SEPARATOR = "/" VALID = {MAMBA, ATTENTION, MLP, MOE} +@dataclass +class ParsedHybridPattern: + """Result of parsing a unified hybrid pattern string. + + A unified pattern encodes both the main decoder pattern and the MTP pattern + in a single string using "/" as a separator. + + Format: "<main_pattern>/<mtp_pattern>/<mtp_pattern>/..." + + Examples: + - "M*M*" -> main="M*M*", mtp=None, depths=0 (no MTP) + - "M*M*/MM/MM" -> main="M*M*", mtp="MM", depths=2 + - "MMMM/*M/*M/*M" -> main="MMMM", mtp="*M", depths=3 + + The "/" symbol introduces MTP patterns. Each repeated pattern after the main + decoder represents one MTP prediction depth. + + Attributes: + main_pattern: The main decoder layer pattern (e.g., "M*M*") + mtp_pattern: The MTP layer pattern per depth (e.g., "MM"), or None if no MTP + mtp_num_depths: Number of MTP prediction depths (0 if no MTP) + """ + + main_pattern: Optional[str] + mtp_pattern: Optional[str] + mtp_num_depths: int + + +def parse_hybrid_pattern(pattern: Optional[str]) -> ParsedHybridPattern: + """Parse a unified hybrid pattern string into main and MTP components. + + The pattern uses "/" as a separator between the main decoder pattern and + MTP patterns. Each MTP pattern after the separator represents one prediction + depth. + + Format: "<main_pattern>/<mtp_pattern>/<mtp_pattern>/..."
+ + Args: + pattern: Unified pattern string, e.g., "M*M*/MM/MM" or just "M*M*" + + Returns: + ParsedHybridPattern with main_pattern, mtp_pattern, and mtp_num_depths + + Raises: + ValueError: If MTP patterns are inconsistent (all must be identical) + ValueError: If pattern contains invalid layer symbols + + Examples: + >>> parse_hybrid_pattern("M*M*") + ParsedHybridPattern(main_pattern="M*M*", mtp_pattern=None, mtp_num_depths=0) + + >>> parse_hybrid_pattern("M*M*/MM/MM") + ParsedHybridPattern(main_pattern="M*M*", mtp_pattern="MM", mtp_num_depths=2) + + >>> parse_hybrid_pattern("MMMM/*M/*M/*M") + ParsedHybridPattern(main_pattern="MMMM", mtp_pattern="*M", mtp_num_depths=3) + """ + if pattern is None: + return ParsedHybridPattern(main_pattern=None, mtp_pattern=None, mtp_num_depths=0) + + parts = pattern.split(Symbols.MTP_SEPARATOR) + + if len(parts) == 1: + # No MTP separator found - pattern is main decoder only + main_pattern = parts[0] + _validate_pattern(main_pattern, "main") + return ParsedHybridPattern(main_pattern=main_pattern, mtp_pattern=None, mtp_num_depths=0) + + # First part is main decoder pattern + main_pattern = parts[0] + if main_pattern: + _validate_pattern(main_pattern, "main") + + # Remaining parts are MTP patterns (one per depth) + mtp_parts = parts[1:] + + if not mtp_parts or all(p == "" for p in mtp_parts): + # No MTP patterns after separator + return ParsedHybridPattern( + main_pattern=main_pattern if main_pattern else None, mtp_pattern=None, mtp_num_depths=0 + ) + + # Validate all MTP patterns are identical + mtp_pattern = mtp_parts[0] + for i, part in enumerate(mtp_parts[1:], start=2): + if part != mtp_pattern: + raise ValueError( + f"All MTP patterns must be identical. " + f"Pattern 1 is '{mtp_pattern}', but pattern {i} is '{part}'. 
" + f"Full pattern: '{pattern}'" + ) + + _validate_pattern(mtp_pattern, "MTP") + + return ParsedHybridPattern( + main_pattern=main_pattern if main_pattern else None, + mtp_pattern=mtp_pattern, + mtp_num_depths=len(mtp_parts), + ) + + +def _validate_pattern(pattern: str, pattern_name: str) -> None: + """Validate that a pattern contains only valid layer symbols. + + Args: + pattern: Layer pattern string to validate + pattern_name: Name of pattern for error messages (e.g., "main" or "MTP") + + Raises: + ValueError: If pattern contains invalid symbols + """ + for char in pattern: + if char not in Symbols.VALID: + raise ValueError( + f"In {pattern_name} pattern, '{char}' is not a valid layer symbol. " + f"Valid symbols are: {Symbols.VALID}" + ) + + def _allocate_auto( total_layers_count: int, target_attention_ratio: float, target_mlp_ratio: float ) -> list: @@ -97,19 +218,21 @@ def allocate_layers( target_attention_ratio: float, target_mlp_ratio: float, override_pattern: str = None, + silent: bool = False, ) -> list: """Allocates layers according to the requested distribution of layer types.""" assert total_layers_count > 0 assert target_attention_ratio >= 0.0 and target_attention_ratio <= 1.0 assert target_mlp_ratio >= 0.0 and target_mlp_ratio <= 1.0 assert target_attention_ratio + target_mlp_ratio <= 1.0 + maybe_log_single_rank = (lambda *args, **kwargs: None) if silent else log_single_rank # Note: target_mamba_ratio = 1.0 - target_attention_ratio - target_mlp_ratio layer_type_list = _allocate_auto(total_layers_count, target_attention_ratio, target_mlp_ratio) if override_pattern is not None: layer_type_list_override = _allocate_override(total_layers_count, override_pattern) - log_single_rank(logger, logging.INFO, "Using hybrid override pattern") + maybe_log_single_rank(logger, logging.INFO, "Using hybrid override pattern") if (target_attention_ratio > 0.0 or target_mlp_ratio > 0.0) and not _layer_counts_match( layer_type_list_override, layer_type_list ): @@ -119,13 
+242,15 @@ def allocate_layers( "pattern." ) if layer_type_list_override == layer_type_list: - log_single_rank( + maybe_log_single_rank( logger, logging.INFO, "The override pattern matches the overridden pattern" ) else: - log_single_rank(logger, logging.INFO, "Warning: overriding pattern A with pattern B") - log_single_rank(logger, logging.INFO, f"A: {''.join(layer_type_list)}") - log_single_rank(logger, logging.INFO, f"B: {''.join(layer_type_list_override)}") + maybe_log_single_rank( + logger, logging.INFO, "Warning: overriding pattern A with pattern B" + ) + maybe_log_single_rank(logger, logging.INFO, f"A: {''.join(layer_type_list)}") + maybe_log_single_rank(logger, logging.INFO, f"B: {''.join(layer_type_list_override)}") layer_type_list = layer_type_list_override if target_attention_ratio > 0.0 or target_mlp_ratio > 0.0 or override_pattern is not None: @@ -134,32 +259,32 @@ def allocate_layers( actual_mlp_layers_count = layer_type_list.count(Symbols.MLP) actual_mlp_ratio = actual_mlp_layers_count / total_layers_count allocation_string = "".join(layer_type_list) - log_single_rank( + maybe_log_single_rank( logger, logging.INFO, f"Hybrid allocation ({Symbols.MAMBA} is mamba, " f"{Symbols.ATTENTION} is attention, " f"{Symbols.MLP} is mlp):", ) - log_single_rank(logger, logging.INFO, allocation_string) - log_single_rank( + maybe_log_single_rank(logger, logging.INFO, allocation_string) + maybe_log_single_rank( logger, logging.INFO, f"{actual_attention_layers_count} attention layers in " f"{total_layers_count} total layers.", ) - log_single_rank( + maybe_log_single_rank( logger, logging.INFO, f"Target attention ratio: {target_attention_ratio:.2f}. " f"Actual attention ratio: {actual_attention_ratio:.2f}.", ) - log_single_rank( + maybe_log_single_rank( logger, logging.INFO, f"{actual_mlp_layers_count} mlp layers in " f"{total_layers_count} total layers.", ) - log_single_rank( + maybe_log_single_rank( logger, logging.INFO, f"Target mlp ratio: {target_mlp_ratio:.2f}. 
" diff --git a/megatron/core/transformer/moe/moe_layer.py b/megatron/core/transformer/moe/moe_layer.py index 5cfea1e8ae4..e5decdfb29b 100644 --- a/megatron/core/transformer/moe/moe_layer.py +++ b/megatron/core/transformer/moe/moe_layer.py @@ -87,10 +87,12 @@ def __init__( config: TransformerConfig, layer_number: Optional[int] = None, pg_collection: Optional[ProcessGroupCollection] = None, + is_mtp_layer: bool = False, ): super(BaseMoELayer, self).__init__(config) self.config = config self.layer_number = layer_number + self.is_mtp_layer = is_mtp_layer self.ep_group = pg_collection.ep # use pg_collection.expt_tp_group as tensor parallel group in this module. self.attn_tp_group = pg_collection.tp @@ -140,6 +142,7 @@ def __init__( submodules: Optional[MoESubmodules] = None, layer_number: Optional[int] = None, pg_collection: Optional[ProcessGroupCollection] = None, + is_mtp_layer: bool = False, ): self.submodules = submodules # TODO(Hepteract): delete the usage of the global parallel_state. @@ -147,7 +150,10 @@ def __init__( if pg_collection is None: pg_collection = get_default_pg_collection() super(MoELayer, self).__init__( - config=config, layer_number=layer_number, pg_collection=pg_collection + config=config, + layer_number=layer_number, + pg_collection=pg_collection, + is_mtp_layer=is_mtp_layer, ) # If using mcore cudagraphs, recompute is handled by transformer_layer.MoETransformerLayer self.moe_layer_recompute = ( @@ -163,7 +169,9 @@ def __init__( self.tp_group = pg_collection.tp # Initialize router. - self.router = submodules.router(config=self.config, pg_collection=pg_collection) + self.router = submodules.router( + config=self.config, pg_collection=pg_collection, is_mtp_layer=is_mtp_layer + ) self.tp_group = pg_collection.tp # Initialize latent projections. 
diff --git a/megatron/core/transformer/moe/router.py b/megatron/core/transformer/moe/router.py index 4be97401748..e42fd1ca8aa 100644 --- a/megatron/core/transformer/moe/router.py +++ b/megatron/core/transformer/moe/router.py @@ -29,7 +29,10 @@ class Router(ABC, MegatronModule): """Base Router class""" def __init__( - self, config: TransformerConfig, pg_collection: Optional[ProcessGroupCollection] = None + self, + config: TransformerConfig, + pg_collection: Optional[ProcessGroupCollection] = None, + is_mtp_layer: bool = False, ) -> None: """ Initialize the Router module. @@ -37,12 +40,14 @@ def __init__( Args: config (TransformerConfig): Configuration object for the Transformer model. pg_collection (ProcessGroupCollection, optional): Process groups for MoE operations. + is_mtp_layer (bool): Flag indicating if this router is part of an MTP layer. """ super().__init__(config) self.config = config self.num_experts = self.config.num_moe_experts self.moe_aux_loss_func = None self.layer_number = None + self.is_mtp_layer = is_mtp_layer self.tp_group = pg_collection.tp self.cp_group = pg_collection.cp self.tp_cp_group = pg_collection.tp_cp @@ -145,15 +150,19 @@ class TopKRouter(Router): """ def __init__( - self, config: TransformerConfig, pg_collection: Optional[ProcessGroupCollection] = None + self, + config: TransformerConfig, + pg_collection: Optional[ProcessGroupCollection] = None, + is_mtp_layer: bool = False, ) -> None: """Initialize the zero token dropping router. Args: config (TransformerConfig): The configuration for the transformer model. pg_collection (ProcessGroupCollection, optional): Process groups for MoE operations. + is_mtp_layer (bool): Flag indicating if this router is part of an MTP layer. 
""" - super().__init__(config=config, pg_collection=pg_collection) + super().__init__(config=config, pg_collection=pg_collection, is_mtp_layer=is_mtp_layer) self.topk = self.config.moe_router_topk self.routing_type = self.config.moe_router_load_balancing_type self.score_function = self.config.moe_router_score_function @@ -438,6 +447,16 @@ def attach_and_log_load_balancing_loss( padding tokens. Can be a Python int or a torch.Tensor (typically 0-d tensor). If None, uses activation.shape[0]. Defaults to None. """ + # When using repeated MTP layers, the loss is counted "mtp_num_layers" times. + # To avoid accumulating the load balancing loss multiple times, we scale it by + # 1/mtp_num_layers so the total loss is correct. + if ( + self.is_mtp_layer + and self.config.mtp_use_repeated_layer + and self.config.mtp_num_layers is not None + ): + aux_loss = aux_loss / self.config.mtp_num_layers + # TODO (zijiey): fix the per_layer_logging for MTP, currently it will incorrectly # add the aux loss logging value to other layer's since it is difficult to get the # correct layer_number for MTP. It does not affect the correctness of the calculation @@ -445,10 +464,16 @@ def attach_and_log_load_balancing_loss( num_layers = self.config.num_layers if self.config.mtp_num_layers is not None: num_layers += self.config.mtp_num_layers + + if self.is_mtp_layer: + layer_number = self.layer_number + self.config.num_layers + else: + layer_number = self.layer_number + save_to_aux_losses_tracker( aux_loss_name, aux_loss / aux_loss_coeff, - self.layer_number, + layer_number, num_layers, reduce_group=reduce_group, reduce_group_has_dp=reduce_group_has_dp, @@ -499,11 +524,27 @@ def apply_z_loss(self, logits, padding_mask: Optional[torch.Tensor] = None): else: logits = MoEAuxLossAutoScaler.apply(logits, z_loss) + # When using repeated MTP layers, the same MTP layer is called mtp_num_layers times. 
+ # To avoid accumulating the z_loss multiple times, we scale it by 1/mtp_num_layers + # so the total loss is correct. + if ( + self.is_mtp_layer + and self.config.mtp_use_repeated_layer + and self.config.mtp_num_layers is not None + ): + z_loss = z_loss / self.config.mtp_num_layers + num_layers = self.config.num_layers if self.config.mtp_num_layers is not None: num_layers += self.config.mtp_num_layers + + if self.is_mtp_layer: + layer_number = self.layer_number + self.config.num_layers + else: + layer_number = self.layer_number + save_to_aux_losses_tracker( - "z_loss", z_loss / moe_z_loss_coeff, self.layer_number, num_layers + "z_loss", z_loss / moe_z_loss_coeff, layer_number, num_layers ) return logits diff --git a/megatron/core/transformer/multi_token_prediction.py b/megatron/core/transformer/multi_token_prediction.py index 2edb652bfc6..6432af36cde 100755 --- a/megatron/core/transformer/multi_token_prediction.py +++ b/megatron/core/transformer/multi_token_prediction.py @@ -14,6 +14,7 @@ from megatron.core.fp8_utils import get_fp8_context from megatron.core.models.backends import BackendSpecProvider, LocalSpecProvider from megatron.core.packed_seq_params import PackedSeqParams +from megatron.core.pipeline_parallel.utils import is_vp_last_stage from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.tensor_parallel import ( gather_from_tensor_model_parallel_region, @@ -24,7 +25,6 @@ from megatron.core.transformer.spec_utils import ModuleSpec, build_module from megatron.core.transformer.transformer_block import TransformerBlockSubmodules from megatron.core.transformer.transformer_config import TransformerConfig -from megatron.core.transformer.transformer_layer import get_transformer_layer_offset from megatron.core.utils import ( get_pg_rank, is_torch_min_version, @@ -369,7 +369,7 @@ def track_mtp_metrics(loss_scale, iteration, writer, wandb_writer=None, total_lo mtp_losses = tracker["values"] * loss_scale mtp_num_layers = 
mtp_losses.shape[0] for i in range(mtp_num_layers): - name = f"mtp_{i+1} loss" + name = f"mtp_{i + 1} loss" loss = mtp_losses[i] if total_loss_dict is not None: if name in total_loss_dict: @@ -396,19 +396,19 @@ class MultiTokenPredictionLayerSubmodules: embedding normalization to be applied. eh_proj (Union[ModuleSpec, type]): Specification or instance of the linear projection to be applied. - transformer_layer (Union[ModuleSpec, type]): Specification - or instance of the transformer block to be applied. + mtp_model_layer (Union[ModuleSpec, type]): Specification + or instance of the transformer or mamba block to be applied. """ enorm: Union[ModuleSpec, type] = None hnorm: Union[ModuleSpec, type] = None eh_proj: Union[ModuleSpec, type] = None - transformer_layer: Union[ModuleSpec, type] = None + mtp_model_layer: Union[ModuleSpec, type] = None layer_norm: Union[ModuleSpec, type] = None def get_mtp_layer_spec( - transformer_layer_spec: ModuleSpec, use_transformer_engine: bool + mtp_model_layer_spec: ModuleSpec, use_transformer_engine: bool ) -> ModuleSpec: """Get the MTP layer spec. @@ -416,13 +416,13 @@ def get_mtp_layer_spec( ModuleSpec: Module specification with TE modules """ return get_mtp_layer_spec_for_backend( - transformer_layer_spec, + mtp_model_layer_spec, backend=TESpecProvider() if use_transformer_engine else LocalSpecProvider(), ) def get_mtp_layer_spec_for_backend( - transformer_layer_spec: ModuleSpec, backend: BackendSpecProvider + mtp_model_layer_spec: ModuleSpec, backend: BackendSpecProvider ) -> ModuleSpec: """Get the MTP layer spec. 
@@ -437,7 +437,7 @@ def get_mtp_layer_spec_for_backend( enorm=layer_norm_impl, hnorm=layer_norm_impl, eh_proj=column_parallel_linear_impl, - transformer_layer=transformer_layer_spec, + mtp_model_layer=mtp_model_layer_spec, layer_norm=layer_norm_impl, ), ) @@ -586,6 +586,79 @@ def set_loss_scale(scale: torch.Tensor): MTPLossAutoScaler.main_loss_backward_scale = scale +def process_mtp_loss( + hidden_states: Tensor, + labels: Tensor, + loss_mask: Optional[Tensor], + output_layer: Callable, + output_weight: Optional[Tensor], + runtime_gather_output: Optional[bool], + is_training: bool, + compute_language_model_loss: Callable, + config: TransformerConfig, + cp_group: Optional[torch.distributed.ProcessGroup] = None, + packed_seq_params: Optional[PackedSeqParams] = None, +) -> Tensor: + """Process Multi-Token Prediction (MTP) loss computation. + + This is a standalone function that handles MTP loss computation. It's used on the + post_process rank to split concatenated hidden states and compute MTP losses. + + Args: + hidden_states (Tensor): Hidden states tensor (concatenated with MTP outputs). + labels (Tensor): Ground truth labels. + loss_mask (Optional[Tensor]): Mask for loss computation. If None, uses all ones. + output_layer (Callable): Output layer method to compute logits. + output_weight (Optional[Tensor]): Optional output weight for shared embeddings. + runtime_gather_output (Optional[bool]): Whether to gather output at runtime. + is_training (bool): Whether the model is in training mode. + compute_language_model_loss (Callable): Method to compute language model loss. + config (TransformerConfig): Model configuration containing mtp_num_layers etc. + cp_group (Optional[ProcessGroup]): Context parallelism process group. + packed_seq_params (Optional[PackedSeqParams]): Packed sequence parameters. + + Returns: + Tensor: Updated hidden states after MTP loss processing (first chunk only). 
+ """ + mtp_labels = labels.clone() + hidden_states_list = torch.chunk(hidden_states, 1 + config.mtp_num_layers, dim=0) + hidden_states = hidden_states_list[0] + + if loss_mask is None: + loss_mask = torch.ones_like(mtp_labels) + + for mtp_layer_number in range(config.mtp_num_layers): + mtp_logits, _ = output_layer( + hidden_states_list[mtp_layer_number + 1], + weight=output_weight, + runtime_gather_output=runtime_gather_output, + ) + mtp_labels, _ = roll_tensor( + mtp_labels, shifts=-1, dims=-1, cp_group=cp_group, packed_seq_params=packed_seq_params + ) + loss_mask, num_tokens = roll_tensor( + loss_mask, shifts=-1, dims=-1, cp_group=cp_group, packed_seq_params=packed_seq_params + ) + mtp_loss = compute_language_model_loss(mtp_labels, mtp_logits) + mtp_loss = loss_mask * mtp_loss + if is_training: + MTPLossLoggingHelper.save_loss_to_tracker( + torch.sum(mtp_loss) / num_tokens, + mtp_layer_number, + config.mtp_num_layers, + avg_group=parallel_state.get_data_parallel_group(with_context_parallel=True), + ) + mtp_loss_scale = config.mtp_loss_scaling_factor / config.mtp_num_layers + if config.calculate_per_token_loss: + hidden_states = MTPLossAutoScaler.apply(hidden_states, mtp_loss_scale * mtp_loss) + else: + hidden_states = MTPLossAutoScaler.apply( + hidden_states, mtp_loss_scale * mtp_loss / num_tokens + ) + + return hidden_states + + class MultiTokenPredictionLayer(MegatronModule): """The implementation for Multi-Token Prediction (MTP) which extends the prediction scope to multiple future tokens at each position. 
@@ -613,6 +686,9 @@ def __init__( layer_number: int = 1, vp_stage: Optional[int] = None, pg_collection: Optional[ProcessGroupCollection] = None, + # For Mamba path - pattern and submodules to build inner layers directly + mtp_layer_pattern: Optional[str] = None, + mamba_submodules: Optional["MambaStackSubmodules"] = None, ): super().__init__(config=config) self.sequence_parallel = config.sequence_parallel @@ -620,14 +696,31 @@ def __init__( self.layer_number = layer_number + get_mtp_layer_offset(self.config, vp_stage) self.vp_stage = vp_stage self.cp_group = pg_collection.cp + self.mtp_layer_pattern = mtp_layer_pattern - self_attention_spec = self.submodules.transformer_layer.submodules.self_attention - attn_mask_type = self_attention_spec.params.get('attn_mask_type', '') - assert attn_mask_type in SUPPORTED_ATTN_MASK, ( - f"Multi-Token Prediction (MTP) is not jet supported with " - + f"{attn_mask_type} attention mask type." - + f"The supported attention mask types are {SUPPORTED_ATTN_MASK}." - ) + # Validate attention mask type if using transformer-based inner layers + if self.submodules.mtp_model_layer is not None and hasattr( + self.submodules.mtp_model_layer, 'submodules' + ): + if hasattr(self.submodules.mtp_model_layer.submodules, 'attention_layer'): + self_attention_spec = self.submodules.mtp_model_layer.submodules.attention_layer + if self_attention_spec.submodules.self_attention is not None: + self_attention_spec = self_attention_spec.submodules.self_attention + attn_mask_type = self_attention_spec.params.get('attn_mask_type', '') + assert attn_mask_type in SUPPORTED_ATTN_MASK, ( + f"Multi-Token Prediction (MTP) is not yet supported with " + f"{attn_mask_type} attention mask type. " + f"The supported attention mask types are {SUPPORTED_ATTN_MASK}." 
+ ) + elif hasattr(self.submodules.mtp_model_layer.submodules, 'self_attention'): + self_attention_spec = self.submodules.mtp_model_layer.submodules.self_attention + if self_attention_spec is not None: + attn_mask_type = self_attention_spec.params.get('attn_mask_type', '') + assert attn_mask_type in SUPPORTED_ATTN_MASK, ( + f"Multi-Token Prediction (MTP) is not yet supported with " + f"{attn_mask_type} attention mask type. " + f"The supported attention mask types are {SUPPORTED_ATTN_MASK}." + ) self.enorm = build_module( self.submodules.enorm, @@ -658,17 +751,37 @@ def __init__( bias=False, skip_bias_add=False, is_expert=False, + tp_comm_buffer_name="mtp_eh_proj", ) - diff_transformer_layer_offset = self.config.num_layers - get_transformer_layer_offset( - self.config, vp_stage - ) - self.transformer_layer = build_module( - self.submodules.transformer_layer, - config=self.config, - vp_stage=vp_stage, - layer_number=self.layer_number + diff_transformer_layer_offset, - ) + # Build inner layers: two possible paths + # 1. Mamba path: use MambaStack for hybrid pattern support + # 2. GPT path: single TransformerLayer + if mtp_layer_pattern is not None and mamba_submodules is not None: + from megatron.core.ssm.mamba_block import MambaStack + + self.mtp_model_layer = MambaStack( + config=self.config, + submodules=mamba_submodules, + hybrid_override_pattern=mtp_layer_pattern, + pre_process=True, # Always receives input from eh_proj + post_layer_norm=False, # MTP has its own final_layernorm + post_process=True, # MTP layer is self-contained + pg_collection=pg_collection, + is_mtp_layer=True, + ) + elif self.config.mtp_num_layers is not None: + # GPT path: Uses the transformer block spec for MTP layer + # MTP inner layers use their own layer numbering (self.layer_number = 1, 2, etc.) + # rather than continuing from decoder layer numbers. This is consistent with the + # Mamba path and ensures proper aux loss tracking in router.py. 
+ self.mtp_model_layer = build_module( + self.submodules.mtp_model_layer, + config=self.config, + vp_stage=self.vp_stage, + layer_number=self.layer_number, + is_mtp_layer=True, + ) self.final_layernorm = build_module( self.submodules.layer_norm, @@ -779,7 +892,6 @@ def _proj_and_transformer_layer( transformer_layer_fp8_context = nullcontext() # TODO: currently ignoring FP4 in MTP layers because we need more numerical validation - with rng_context: with fp8_context: hidden_states = self._concat_embeddings(hidden_states, decoder_input) @@ -788,19 +900,29 @@ def _proj_and_transformer_layer( # transformer layer is cudagraphed, the FP8GlobalStateManager.is_first_fp8_module() is # True so that the fp8 weight caching can be triggered correctly. with transformer_layer_fp8_context: - hidden_states, _ = self.transformer_layer( - hidden_states=hidden_states, - attention_mask=attention_mask, - context=context, - context_mask=context_mask, - rotary_pos_emb=rotary_pos_emb, - rotary_pos_cos=rotary_pos_cos, - rotary_pos_sin=rotary_pos_sin, - attention_bias=attention_bias, - inference_params=inference_params, - packed_seq_params=packed_seq_params, - sequence_len_offset=sequence_len_offset, - ) + if self.mtp_layer_pattern is not None: + hidden_states = self.mtp_model_layer( + hidden_states=hidden_states, + attention_mask=attention_mask, + rotary_pos_emb=rotary_pos_emb, + inference_context=inference_params, + packed_seq_params=packed_seq_params, + ) + else: + # GPT path: single TransformerLayer + hidden_states, _ = self.mtp_model_layer( + hidden_states=hidden_states, + attention_mask=attention_mask, + context=context, + context_mask=context_mask, + rotary_pos_emb=rotary_pos_emb, + rotary_pos_cos=rotary_pos_cos, + rotary_pos_sin=rotary_pos_sin, + attention_bias=attention_bias, + inference_params=inference_params, + packed_seq_params=packed_seq_params, + sequence_len_offset=sequence_len_offset, + ) hidden_states = self._postprocess(hidden_states) @@ -897,8 +1019,7 @@ def forward( 
Union[Tensor, Tuple[Tensor, Tensor]]: The output hidden states tensor of shape [s, b, h], and optionally the updated context tensor if cross-attention is used. """ - assert context is None, f"multi token prediction + cross attention is not yet supported." - + assert context is None, "multi token prediction + cross attention is not yet supported." input_ids, position_ids, decoder_input, hidden_states = self._get_embeddings( input_ids=input_ids, position_ids=position_ids, @@ -907,6 +1028,15 @@ def forward( packed_seq_params=packed_seq_params, ) + # Roll RoPE to match rolled positions (position_ids were rolled in _get_embeddings) + # After rolling, index i should use RoPE for position i+1 + if rotary_pos_emb is not None: + rotary_pos_emb = torch.roll(rotary_pos_emb, shifts=-1, dims=0) + if rotary_pos_cos is not None: + rotary_pos_cos = torch.roll(rotary_pos_cos, shifts=-1, dims=0) + if rotary_pos_sin is not None: + rotary_pos_sin = torch.roll(rotary_pos_sin, shifts=-1, dims=0) + if self.config.recompute_granularity == 'full' and self.training: hidden_states = self._checkpointed_forward( self._proj_and_transformer_layer, @@ -1022,6 +1152,9 @@ class MultiTokenPredictionBlock(MegatronModule): the linear projection. The combined serves as the input of the Transformer block at the k-th depth to produce the output representation. + When `mtp_use_repeated_layer=True` in config, instead of creating N separate MTP layers, + only 1 layer is created and applied mtp_num_layers times. 
+ for more information, please refer to DeepSeek-V3 Technical Report https://github.com/deepseek-ai/DeepSeek-V3/blob/main/DeepSeek_V3.pdf """ @@ -1032,11 +1165,26 @@ def __init__( spec: Union[TransformerBlockSubmodules, ModuleSpec], vp_stage: Optional[int] = None, pg_collection: Optional[ProcessGroupCollection] = None, + # New: For Mamba path with unified pattern syntax + mtp_layer_pattern: Optional[str] = None, + mtp_num_depths: int = 0, + mamba_submodules: Optional["MambaStackSubmodules"] = None, ): super().__init__(config=config) self.submodules = _get_mtp_block_submodules(config, spec) self.mtp_loss_scaling_factor = config.mtp_loss_scaling_factor self.vp_stage = vp_stage + self.mtp_layer_pattern = mtp_layer_pattern + self.mtp_num_depths = mtp_num_depths + self.mamba_submodules = mamba_submodules + self.mtp_use_repeated_layer = self.config.mtp_use_repeated_layer + + vp_size = config.virtual_pipeline_model_parallel_size + assert is_vp_last_stage(vp_stage=vp_stage, vp_size=vp_size), ( + f"MTP layers must be placed on the last virtual pipeline stage. " + f"Got vp_stage={vp_stage} with vp_size={vp_size}. " + f"Placing MTP layers on different VPP stages is not currently supported." 
+ ) # Initialize Context Parallelism (CP) support for MTP # This enables MTP to work with CP > 1 by providing the CP process group @@ -1055,7 +1203,14 @@ def __init__( self.cp_group = pg_collection.cp def _build_layers(self, pg_collection): - def build_layer(layer_spec, layer_number): + # Determine number of depths to build + if self.mtp_num_depths > 0: + num_depths = self.mtp_num_depths + else: + num_depths = self.config.mtp_num_layers or len(self.submodules.layer_specs) + + def build_layer_legacy(layer_spec, layer_number): + """Build layer using legacy spec-based approach.""" fp8_init_context = get_fp8_context(self.config, is_init=True) with fp8_init_context: module = build_module( @@ -1064,15 +1219,71 @@ def build_layer(layer_spec, layer_number): layer_number=layer_number, vp_stage=self.vp_stage, pg_collection=pg_collection, + mtp_layer_pattern=self.mtp_layer_pattern, ) return module - self.layers = torch.nn.ModuleList( - [ - build_layer(layer_spec, i + 1) - for i, layer_spec in enumerate(self.submodules.layer_specs) - ] - ) + def build_layer_with_pattern(layer_spec, layer_number, mtp_layer_pattern, mamba_submodules): + """Build layer using pattern-based approach (new Mamba path).""" + fp8_init_context = get_fp8_context(self.config, is_init=True) + with fp8_init_context: + module = build_module( + layer_spec, + config=self.config, + layer_number=layer_number, + vp_stage=self.vp_stage, + pg_collection=pg_collection, + mtp_layer_pattern=mtp_layer_pattern, + mamba_submodules=mamba_submodules, + ) + return module + + # New Mamba path: use mtp_layer_pattern and mamba_submodules + if self.mtp_layer_pattern is not None and self.mamba_submodules is not None: + if self.mtp_use_repeated_layer: + # Shared/repeated layer: build one layer, use it for all depths + layer_spec = self.submodules.layer_specs[0] + shared_layer = build_layer_with_pattern( + layer_spec, + layer_number=1, + mtp_layer_pattern=self.mtp_layer_pattern, + mamba_submodules=self.mamba_submodules, + ) + 
self.layers = torch.nn.ModuleList([shared_layer]) + else: + # Non-shared: each depth gets its own layers + self.layers = torch.nn.ModuleList( + [ + build_layer_with_pattern( + self.submodules.layer_specs[ + min(i, len(self.submodules.layer_specs) - 1) + ], + layer_number=i + 1, + mtp_layer_pattern=self.mtp_layer_pattern, + mamba_submodules=self.mamba_submodules, + ) + for i in range(num_depths) + ] + ) + elif self.mtp_use_repeated_layer: + # Legacy repeated layer mode + if len(self.submodules.layer_specs) != 1: + warnings.warn( + "Repeated MTP mode expects exactly 1 layer spec, got " + f"{len(self.submodules.layer_specs)} instead. " + f"The first layer will be applied {self.config.mtp_num_layers} times." + ) + self.layers = torch.nn.ModuleList( + [build_layer_legacy(self.submodules.layer_specs[0], layer_number=1)] + ) + else: + # Legacy mode: build from layer_specs + self.layers = torch.nn.ModuleList( + [ + build_layer_legacy(layer_spec, i + 1) + for i, layer_spec in enumerate(self.submodules.layer_specs) + ] + ) def forward( self, @@ -1108,8 +1319,9 @@ def forward( offset = get_mtp_layer_offset(self.config, self.vp_stage) hidden_states_list = list(torch.chunk(hidden_states, 1 + offset, dim=0)) hidden_states = hidden_states_list[offset] - for layer_number in range(len(self.layers)): - (hidden_states, input_ids, position_ids) = self.layers[layer_number]( + for iteration in range(self.config.mtp_num_layers): + layer_idx = 0 if self.mtp_use_repeated_layer else iteration + (hidden_states, input_ids, position_ids) = self.layers[layer_idx]( input_ids=input_ids, position_ids=position_ids, hidden_states=hidden_states, @@ -1151,7 +1363,7 @@ def sharded_state_dict( layer_prefix = f'{prefix}layers.' for layer in self.layers: offset = get_mtp_layer_offset(self.config, self.vp_stage) - sharded_prefix = f'{layer_prefix}{layer.layer_number - 1 }.' + sharded_prefix = f'{layer_prefix}{layer.layer_number - 1}.' state_dict_prefix = f'{layer_prefix}{layer.layer_number - 1 - offset}.' 
sharded_pp_offset = [] diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index eaae585905e..96f7e9b8b95 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -59,6 +59,15 @@ class TransformerConfig(ModelParallelConfig): which serves as an additional training objective. """ + mtp_use_repeated_layer: bool = False + """Use a single MTP layer repeatedly instead of multiple separate layers.""" + + mtp_hybrid_override_pattern: Optional[str] = None + """DEPRECATED: Use unified hybrid_override_pattern instead. + Legacy argument for loading old checkpoints. + Force a specific hybrid layer pattern for MTP layers. + """ + num_layers_in_first_pipeline_stage: Optional[int] = None """Number of transformer layers on first pipeline stage. None implies equal layer division across PP ranks.""" diff --git a/megatron/core/transformer/transformer_layer.py b/megatron/core/transformer/transformer_layer.py index a5eaec92866..ae505f04fc6 100644 --- a/megatron/core/transformer/transformer_layer.py +++ b/megatron/core/transformer/transformer_layer.py @@ -268,6 +268,7 @@ def __init__( hidden_dropout: Optional[float] = None, pg_collection: Optional[ProcessGroupCollection] = None, vp_stage: Optional[int] = None, + is_mtp_layer: bool = False, ): self.submodules_config = submodules super().__init__(config=config, vp_stage=vp_stage) @@ -277,10 +278,18 @@ def __init__( self.pg_collection = pg_collection self.tp_group = pg_collection.tp - self.layer_number = layer_number + get_transformer_layer_offset( - self.config, vp_stage, get_pg_rank(pg_collection.pp) - ) + # MTP inner layers use their own layer numbering (starting from 1 within each MTP depth), + # so they should NOT add the decoder layer offset. The router.py handles MTP layer + # numbering separately by adding config.num_layers to distinguish MTP layers from decoder + # layers in the aux loss tracker. 
+ if is_mtp_layer: + self.layer_number = layer_number + else: + self.layer_number = layer_number + get_transformer_layer_offset( + self.config, vp_stage, get_pg_rank(pg_collection.pp) + ) self.hidden_dropout = config.hidden_dropout if hidden_dropout is None else hidden_dropout + self.is_mtp_layer = is_mtp_layer # [Module 1: Input Layernorm] Optional Layernorm on the input data # TODO: add pytorch only layernorm @@ -351,6 +360,9 @@ def __init__( if isinstance(submodules.mlp, ModuleSpec): if submodules.mlp.module in (MoELayer, GroupedMLP, TEGroupedMLP, SequentialMLP): additional_mlp_kwargs["pg_collection"] = pg_collection + # Pass is_mtp_layer flag to MoELayer to distinguish MTP MoE layers. + if submodules.mlp.module == MoELayer: + additional_mlp_kwargs["is_mtp_layer"] = self.is_mtp_layer elif submodules.mlp.module == MLP: assert hasattr( pg_collection, 'tp' diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 46f3c28b1da..51a123e78c8 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -499,6 +499,79 @@ def validate_args(args, defaults={}): print_rank_0('setting global batch size to {}'.format(args.global_batch_size)) assert args.global_batch_size > 0 + # === MTP validation === + # Deprecation warnings for legacy MTP arguments + if args.mtp_hybrid_override_pattern is not None: + warn_rank_0( + "--mtp-hybrid-override-pattern is deprecated. " + "For new hybrid models with MTP models, use unified --hybrid-override-pattern instead. " + "Example: 'M*M*/MM/MM' means main='M*M*', MTP pattern='MM' with 2 depths. 
" + "This argument is kept only for loading old checkpoints.", + args.rank, + ) + + # Backward compatibility: convert legacy mtp_hybrid_override_pattern to unified format + from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols, parse_hybrid_pattern + sep = Symbols.MTP_SEPARATOR + if ( + getattr(args, 'mtp_hybrid_override_pattern', None) is not None + and args.mtp_num_layers is not None + and args.mtp_num_layers > 0 + and (args.hybrid_override_pattern is None or sep not in args.hybrid_override_pattern) + ): + main_pattern = args.hybrid_override_pattern or '' + mtp_pattern = args.mtp_hybrid_override_pattern + args.hybrid_override_pattern = main_pattern + sep + sep.join([mtp_pattern] * args.mtp_num_layers) + args.mtp_hybrid_override_pattern = None + print_rank_0(f"Converted legacy MTP pattern to unified: {args.hybrid_override_pattern}") + + # Infer mtp_num_layers from unified pattern + if args.hybrid_override_pattern and sep in args.hybrid_override_pattern: + parsed = parse_hybrid_pattern(args.hybrid_override_pattern) + if parsed.mtp_pattern and parsed.mtp_num_depths > 0: + inferred_mtp_num_layers = parsed.mtp_num_depths + if args.mtp_num_layers is None: + args.mtp_num_layers = inferred_mtp_num_layers + elif args.mtp_num_layers != inferred_mtp_num_layers: + warn_rank_0( + f"--mtp-num-layers ({args.mtp_num_layers}) conflicts with " + f"MTP depth count ({inferred_mtp_num_layers}) in pattern '{args.hybrid_override_pattern}'. " + f"Using the inferred value ({inferred_mtp_num_layers}).", + args.rank + ) + args.mtp_num_layers = inferred_mtp_num_layers + + # MTP validation + if args.mtp_num_layers: + assert not args.use_legacy_models, "The legacy Megatron models does not support Multi-Token Prediction (MTP)." + assert args.position_embedding_type == "rope" or args.position_embedding_type == "none", ( + f"Multi-Token Prediction (MTP) is not supported with {args.position_embedding_type} position embedding type." 
+ + f"The supported position embedding types are rope and none." + ) + + # Validate MTP args for hybrid vs non-hybrid models + if args.is_hybrid_model: + # Mamba/hybrid model MTP validation + if args.mtp_num_layers and not (args.hybrid_override_pattern and sep in args.hybrid_override_pattern): + # Hybrid model wants MTP but no unified pattern - check for legacy args + if args.mtp_hybrid_override_pattern is None: + warn_rank_0( + "Hybrid model with --mtp-num-layers but no MTP pattern. " + "Use unified --hybrid-override-pattern with '/' separator (e.g., 'M*M*/MM/MM') " + "or legacy --mtp-hybrid-override-pattern for old checkpoints.", + args.rank + ) + else: + # Non-hybrid (GPT) model MTP validation + if args.mtp_hybrid_override_pattern is not None: + warn_rank_0( + "--mtp-hybrid-override-pattern is for Mamba/hybrid models only. " + "For GPT models, MTP replicates the main transformer layer structure. " + "This argument will be ignored.", + args.rank + ) + # === End of MTP validation === + # Uneven virtual pipeline parallelism assert ( int(args.num_layers_per_virtual_pipeline_stage is not None) diff --git a/megatron/training/checkpointing.py b/megatron/training/checkpointing.py index a3d307f1e30..f964b8dd32e 100644 --- a/megatron/training/checkpointing.py +++ b/megatron/training/checkpointing.py @@ -1418,6 +1418,12 @@ def _set_arg(arg_name, old_arg_name=None, force=False): _set_arg('hidden_dropout', force=True) _set_arg('hybrid_override_pattern', force=True) + + # Legacy MTP pattern for old checkpoints + _set_arg('mtp_hybrid_override_pattern', force=True) + _set_arg('mtp_num_layers', force=True) + _set_arg('mtp_use_repeated_layer', force=True) + _set_arg('spec', force=True) _set_arg('hybrid_attention_ratio', force=True) _set_arg('hybrid_mlp_ratio', force=True) diff --git a/megatron/training/training.py b/megatron/training/training.py index 87d9fe8b841..5206b526e18 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -220,10 +220,20 @@ def 
num_floating_point_operations(args, batch_size): def calculate_layer_counts(): """Calculate the number of attention, Mamba, and MLP layers.""" if args.hybrid_override_pattern: - counts = {'M': 0, '*': 0, '-': 0, 'E':0} - for layer_type in args.hybrid_override_pattern: - if layer_type in counts: - counts[layer_type] += 1 + from megatron.core.ssm.mamba_hybrid_layer_allocation import parse_hybrid_pattern + # Parse unified pattern to separate main and MTP components + parsed = parse_hybrid_pattern(args.hybrid_override_pattern) + counts = {'M': 0, '*': 0, '-': 0, 'E': 0} + # Count main decoder layers + if parsed.main_pattern: + for layer_type in parsed.main_pattern: + if layer_type in counts: + counts[layer_type] += 1 + # Count MTP layers (pattern repeated mtp_num_depths times) + if parsed.mtp_pattern and parsed.mtp_num_depths > 0: + for layer_type in parsed.mtp_pattern: + if layer_type in counts: + counts[layer_type] += parsed.mtp_num_depths return counts['*'], counts['M'], counts['-'], counts['E'] else: num_attn_layers = round(args.num_layers * args.hybrid_attention_ratio) @@ -300,7 +310,7 @@ def hybrid_flops(batch_size, seq_len, hidden_size, mlp_expansion=4.0, swiglu=False, moe_latent_size=None, moe_ffn_hidden_size=2048, shared_expert_ffn_hidden_size=2048, num_experts_routed_to=1, - vocab_size=256000): + vocab_size=256000, mtp_num_layers=0): """Calculate total FLOPs for the hybrid model.""" flops_fwd = ( num_attn_layers * attn_layer_flops(batch_size, seq_len, hidden_size, @@ -313,7 +323,7 @@ def hybrid_flops(batch_size, seq_len, hidden_size, num_moe_layers * moe_layer_flops(batch_size, seq_len, hidden_size, moe_ffn_hidden_size, shared_expert_ffn_hidden_size, num_experts_routed_to, moe_latent_size, swiglu) + - (2 * batch_size * seq_len * hidden_size * vocab_size) # logits computation + (2 * batch_size * seq_len * hidden_size * vocab_size * (1 + mtp_num_layers)) # logits computation ) return flops_fwd * 3 @@ -604,6 +614,7 @@ def transformer_flops(): else 
args.moe_shared_expert_intermediate_size), num_experts_routed_to=args.moe_router_topk, vocab_size=args.padded_vocab_size, + mtp_num_layers=args.mtp_num_layers, ) else: # Compute standard Transformer model FLOPs. diff --git a/pretrain_mamba.py b/pretrain_mamba.py index e1379be63e9..c41c485c866 100644 --- a/pretrain_mamba.py +++ b/pretrain_mamba.py @@ -257,6 +257,7 @@ def forward_step(data_iterator, model: MambaModel): attention_mask, labels=labels, packed_seq_params=packed_seq_params, + loss_mask=loss_mask ) # [ModelOpt]: model is needed to access ModelOpt distillation losses diff --git a/tests/unit_tests/models/test_mamba_moe_model.py b/tests/unit_tests/models/test_mamba_moe_model.py index 3c7ae93a17c..9c581ec6cb4 100644 --- a/tests/unit_tests/models/test_mamba_moe_model.py +++ b/tests/unit_tests/models/test_mamba_moe_model.py @@ -193,9 +193,11 @@ "moe_z_loss_coeff": None, "moe_enable_routing_replay": False, "mrope_section": None, + "mtp_hybrid_override_pattern": None, "mtp_loss_scaling_factor": 0.1, "mtp_num_layers": None, "mtp_standalone": False, + "mtp_use_repeated_layer": False, "multi_latent_attention": False, "no_rope_freq": None, "no_sync_func": None, diff --git a/tests/unit_tests/ssm/test_mamba_hybrid_layer_allocation.py b/tests/unit_tests/ssm/test_mamba_hybrid_layer_allocation.py index 77d02c69607..77c106c3bee 100644 --- a/tests/unit_tests/ssm/test_mamba_hybrid_layer_allocation.py +++ b/tests/unit_tests/ssm/test_mamba_hybrid_layer_allocation.py @@ -6,7 +6,12 @@ import pytest import torch -from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols, allocate_layers +from megatron.core.ssm.mamba_hybrid_layer_allocation import ( + ParsedHybridPattern, + Symbols, + allocate_layers, + parse_hybrid_pattern, +) @pytest.mark.internal @@ -75,3 +80,135 @@ def test_wrong_length_override_pattern(self): def test_wrong_number_of_layer_types_in_override_pattern(self): # This override_pattern has too many mlps and not enough attention layer_types = 
allocate_layers(8, 0.5, 0.25, "M*--M**-") + + +@pytest.mark.internal +class TestParseHybridPattern: + """Tests for parse_hybrid_pattern with unified pattern syntax.""" + + def test_none_pattern(self): + """Test that None pattern returns all None values.""" + result = parse_hybrid_pattern(None) + assert result.main_pattern is None + assert result.mtp_pattern is None + assert result.mtp_num_depths == 0 + + def test_main_pattern_only(self): + """Test patterns without MTP (no / separator).""" + test_cases = [ + ("M*M*", "M*M*"), + ("MMMM", "MMMM"), + ("*M*M", "*M*M"), + ("MM-*", "MM-*"), + ("E", "E"), + ] + for pattern, expected_main in test_cases: + result = parse_hybrid_pattern(pattern) + assert result.main_pattern == expected_main, f"Failed for pattern: {pattern}" + assert result.mtp_pattern is None + assert result.mtp_num_depths == 0 + + def test_main_with_single_mtp_depth(self): + """Test patterns with 1 MTP depth.""" + test_cases = [ + ("M*M*/MM", "M*M*", "MM", 1), + ("MMMM/*M", "MMMM", "*M", 1), + ("M/M", "M", "M", 1), + ] + for pattern, expected_main, expected_mtp, expected_depths in test_cases: + result = parse_hybrid_pattern(pattern) + assert result.main_pattern == expected_main, f"Failed for pattern: {pattern}" + assert result.mtp_pattern == expected_mtp, f"Failed for pattern: {pattern}" + assert result.mtp_num_depths == expected_depths, f"Failed for pattern: {pattern}" + + def test_main_with_multiple_mtp_depths(self): + """Test patterns with multiple MTP depths.""" + test_cases = [ + ("M*M*/MM/MM", "M*M*", "MM", 2), + ("M*M*/MM/MM/MM", "M*M*", "MM", 3), + ("MMMM/*M/*M/*M", "MMMM", "*M", 3), + ("M*/*/*/*", "M*", "*", 3), + ("M/M/M/M/M", "M", "M", 4), + ] + for pattern, expected_main, expected_mtp, expected_depths in test_cases: + result = parse_hybrid_pattern(pattern) + assert result.main_pattern == expected_main, f"Failed for pattern: {pattern}" + assert result.mtp_pattern == expected_mtp, f"Failed for pattern: {pattern}" + assert result.mtp_num_depths == 
expected_depths, f"Failed for pattern: {pattern}" + + def test_mtp_patterns_must_be_identical(self): + """Test that mismatched MTP patterns raise ValueError.""" + invalid_patterns = [ + "M*M*/MM/M*", # MM != M* + "M*M*/MM/MM/M", # MM != M + "MMMM/*M/M*", # *M != M* + ] + for pattern in invalid_patterns: + with pytest.raises(ValueError, match="All MTP patterns must be identical"): + parse_hybrid_pattern(pattern) + + def test_invalid_symbols_in_main_pattern(self): + """Test that invalid symbols in main pattern raise ValueError.""" + invalid_patterns = [ + "M*X*", # X is not valid + "MaMM", # a is not valid + "M*M*1", # 1 is not valid + ] + for pattern in invalid_patterns: + with pytest.raises(ValueError, match="not a valid layer symbol"): + parse_hybrid_pattern(pattern) + + def test_invalid_symbols_in_mtp_pattern(self): + """Test that invalid symbols in MTP pattern raise ValueError.""" + # Single MTP depth with invalid symbol - should raise "not a valid layer symbol" + with pytest.raises(ValueError, match="not a valid layer symbol"): + parse_hybrid_pattern("M*M*/MX") # X is not valid + + # Multiple MTP depths with invalid symbol and matching patterns + with pytest.raises(ValueError, match="not a valid layer symbol"): + parse_hybrid_pattern("M*M*/Ma/Ma") # a is not valid + + # Multiple MTP depths with invalid symbol but mismatched patterns + # This raises "All MTP patterns must be identical" before checking symbols + with pytest.raises(ValueError, match="All MTP patterns must be identical"): + parse_hybrid_pattern("M*M*/MM/Ma") + + def test_empty_main_pattern_with_mtp(self): + """Test pattern that starts with / (empty main pattern).""" + result = parse_hybrid_pattern("/MM/MM") + assert result.main_pattern is None + assert result.mtp_pattern == "MM" + assert result.mtp_num_depths == 2 + + def test_trailing_separator(self): + """Test patterns with trailing separator.""" + # "M*M*/" means main="M*M*", one empty MTP pattern + result = parse_hybrid_pattern("M*M*/") + 
assert result.main_pattern == "M*M*" + # Empty string after separator means no valid MTP pattern + assert result.mtp_pattern is None + assert result.mtp_num_depths == 0 + + def test_complex_patterns(self): + """Test more complex realistic patterns.""" + test_cases = [ + # Main decoder with attention, MTP with mamba only + ("M*M*M*M*/MMM/MMM", "M*M*M*M*", "MMM", 2), + # Main decoder with MLP, MTP with attention+mamba + ("MM-MM-/*M/*M", "MM-MM-", "*M", 2), + # All attention main, mamba MTP + ("*****/M/M/M/M", "*****", "M", 4), + # MoE in main pattern + ("MEME/MM/MM", "MEME", "MM", 2), + ] + for pattern, expected_main, expected_mtp, expected_depths in test_cases: + result = parse_hybrid_pattern(pattern) + assert result.main_pattern == expected_main, f"Failed for pattern: {pattern}" + assert result.mtp_pattern == expected_mtp, f"Failed for pattern: {pattern}" + assert result.mtp_num_depths == expected_depths, f"Failed for pattern: {pattern}" + + def test_dataclass_equality(self): + """Test that ParsedHybridPattern supports equality comparison.""" + p1 = parse_hybrid_pattern("M*M*/MM/MM") + p2 = ParsedHybridPattern(main_pattern="M*M*", mtp_pattern="MM", mtp_num_depths=2) + assert p1 == p2 diff --git a/tests/unit_tests/transformer/test_multi_token_prediction.py b/tests/unit_tests/transformer/test_multi_token_prediction.py index 05fb2c4fe63..ec72d713eb1 100644 --- a/tests/unit_tests/transformer/test_multi_token_prediction.py +++ b/tests/unit_tests/transformer/test_multi_token_prediction.py @@ -13,6 +13,8 @@ get_gpt_mtp_block_spec, ) from megatron.core.models.gpt.gpt_model import GPTModel +from megatron.core.models.mamba.mamba_layer_specs import mamba_stack_spec +from megatron.core.models.mamba.mamba_model import MambaModel from megatron.core.num_microbatches_calculator import destroy_num_microbatches_calculator from megatron.core.packed_seq_params import PackedSeqParams from megatron.core.parallel_state import get_context_parallel_group @@ -94,7 +96,7 @@ def 
test_constructor_local(self, tp): assert mtp.layers[i].hnorm.weight.shape[0] == config.hidden_size assert mtp.layers[i].eh_proj.weight.shape[0] == config.hidden_size / tp assert mtp.layers[i].eh_proj.weight.shape[1] == config.hidden_size * 2 - assert mtp.layers[i].transformer_layer is not None + assert mtp.layers[i].mtp_model_layer is not None num_weights = sum([p.numel() for p in mtp.parameters()]) if tp == 1: assert num_weights == 58560 * config.mtp_num_layers @@ -120,7 +122,7 @@ def test_constructor_ues_te(self, tp, cp): assert mtp.layers[i].hnorm.weight.shape[0] == config.hidden_size assert mtp.layers[i].eh_proj.weight.shape[0] == config.hidden_size / tp assert mtp.layers[i].eh_proj.weight.shape[1] == config.hidden_size * 2 - assert mtp.layers[i].transformer_layer is not None + assert mtp.layers[i].mtp_model_layer is not None num_weights = sum([p.numel() for p in mtp.parameters()]) if tp == 1: assert num_weights == 58560 * config.mtp_num_layers @@ -162,7 +164,7 @@ def model_provider( config=config, transformer_layer_spec=transformer_layer_spec, mtp_block_spec=mtp_block_spec, - vocab_size=args.vocal_size, + vocab_size=args.vocab_size, max_sequence_length=args.max_position_embeddings, pre_process=pre_process, post_process=post_process, @@ -186,7 +188,7 @@ def create_test_args( args.num_layers = 2 args.mtp_num_layers = 2 args.mtp_loss_scaling_factor = 0.1 - args.vocal_size = 128800 + args.vocab_size = 128800 args.hidden_size = 128 args.num_attention_heads = 8 args.max_position_embeddings = 256 @@ -677,10 +679,259 @@ def log(self, metrics, iteration): # Verify total_loss_dict is populated for i in range(num_layers): - assert f"mtp_{i+1} loss" in total_loss_dict - assert total_loss_dict[f"mtp_{i+1} loss"] == loss * loss_scale + assert f"mtp_{i + 1} loss" in total_loss_dict + assert total_loss_dict[f"mtp_{i + 1} loss"] == loss * loss_scale # Verify tracker is cleaned assert torch.all(MTPLossLoggingHelper.tracker["values"] == 0) assert 
MTPLossLoggingHelper.tracker["reduce_group"] is None assert MTPLossLoggingHelper.tracker["avg_group"] is None + + +class TestMultiTokenPredictionMamba: + """Test Multi-Token Prediction with Mamba hybrid models.""" + + def setup_method(self, method): + self.seq_length = 32 + self.micro_batch_size = 2 + os.environ['CUDA_DEVICE_MAX_CONNECTIONS'] = '1' + + def teardown_method(self, method): + Utils.destroy_model_parallel() + destroy_global_vars() + destroy_num_microbatches_calculator() + MTPLossLoggingHelper.tracker = {} + + def model_provider(self, pre_process=True, post_process=True, **config_kwargs): + """Model provider for Mamba hybrid models with MTP. + + Uses the unified pattern syntax where MTP is configured via hybrid_override_pattern: + Format: "///..." + Example: "M*M*/M*/M*" = main decoder "M*M*", MTP pattern "M*" with 2 depths + """ + model_parallel_cuda_manual_seed(_SEED) + args = get_args() + config = core_transformer_config_from_args(args) + + # MTP is configured via unified pattern in hybrid_override_pattern + # MambaModel creates the MTP block internally based on the parsed pattern + model = MambaModel( + config=config, + mamba_stack_spec=mamba_stack_spec, + vocab_size=args.vocab_size, + max_sequence_length=args.max_position_embeddings, + pre_process=pre_process, + post_process=post_process, + hybrid_attention_ratio=args.hybrid_attention_ratio, + hybrid_mlp_ratio=args.hybrid_mlp_ratio, + hybrid_override_pattern=args.hybrid_override_pattern, + fp16_lm_cross_entropy=args.fp16_lm_cross_entropy, + parallel_output=True, + share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights, + position_embedding_type=args.position_embedding_type, + rotary_percent=args.rotary_percent, + ) + return model + + def create_test_args( + self, tp, cp, sequence_length, micro_batch_size, fp8=None, full_recompute=False + ): + destroy_global_vars() + destroy_num_microbatches_calculator() + + sys.argv = ['test_multi_token_prediction_mamba.py'] + args = 
parse_args() + args.num_layers = 4 + args.mtp_num_layers = 2 + args.mtp_loss_scaling_factor = 0.1 + args.vocab_size = 128800 + args.hidden_size = 128 + args.num_attention_heads = 8 + args.num_query_groups = 8 + args.mamba_num_groups = 4 + args.max_position_embeddings = 256 + args.micro_batch_size = micro_batch_size + args.create_attention_mask_in_dataloader = True + args.seq_length = sequence_length + args.tensor_model_parallel_size = tp + args.sequence_parallel = True if tp > 1 else False + args.context_parallel_size = cp + args.position_embedding_type = 'rope' + args.train_iters = 1 + args.ckpt_format = 'torch_dist' + args.lr = 3e-5 + args.attention_dropout = 0.0 + args.hidden_dropout = 0.0 + args.async_tensor_model_parallel_allreduce = False + args.no_save_optim = True + args.no_load_optim = True + args.no_load_rng = True + args.bf16 = True + args.hybrid_attention_ratio = 0.5 + args.hybrid_mlp_ratio = 0.0 + # Unified pattern: "main/mtp/mtp" - main decoder "M*M*", MTP pattern "M*" with 2 depths + args.hybrid_override_pattern = "M*M*/M*/M*" + args.spec = "megatron.core.models.mamba.mamba_layer_specs.mamba_stack_spec" + + if fp8 is not None: + args.fp8 = 'e4m3' + if full_recompute: + args.recompute_granularity = 'full' + args.recompute_method = 'uniform' + args.recompute_num_layers = 1 + else: + args.recompute_granularity = None + args.add_bias_linear = False + args.swiglu = True + + validate_args(args) + set_global_variables(args, False) + return args + + def get_batch(self, seq_length, micro_batch_size): + data = list(range(seq_length)) + input_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() + labels = 1 + torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() + position_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() + attention_mask = torch.ones( + (micro_batch_size, 1, seq_length, seq_length), dtype=bool + ).cuda() + loss_mask = 
torch.ones(seq_length).repeat((micro_batch_size, 1)).cuda() + batch = { + 'tokens': input_ids, + 'labels': labels, + 'loss_mask': loss_mask, + 'attention_mask': attention_mask, + 'position_ids': position_ids, + } + return batch + + @pytest.mark.skipif(not HAVE_TE, reason="transformer_engine not available") + @pytest.mark.parametrize(("tp", "cp"), [(1, 1), (2, 1)]) + def test_sharded_state_dict_mamba(self, tp, cp): + """Test MTP with Mamba hybrid model - sharded state dict.""" + args = self.create_test_args(tp, cp, self.seq_length, self.micro_batch_size) + set_args(args) + torch.manual_seed(_SEED) + Utils.initialize_model_parallel(tensor_model_parallel_size=tp, context_parallel_size=cp) + mamba_model = get_model(self.model_provider, ModelType.encoder_or_decoder) + mamba_model = unwrap_model(mamba_model) + sharded_state_dict = mamba_model[0].sharded_state_dict() + + # Verify MTP layers are in the state dict + for i in range(args.mtp_num_layers): + assert f"mtp.layers.{i}.enorm.weight" in sharded_state_dict.keys() + assert f"mtp.layers.{i}.hnorm.weight" in sharded_state_dict.keys() + assert f"mtp.layers.{i}.eh_proj.weight" in sharded_state_dict.keys() + + @pytest.mark.skipif(not HAVE_TE, reason="transformer_engine not available") + @pytest.mark.parametrize(("tp", "cp"), [(1, 1), (2, 1)]) + def test_forward_backward_mamba(self, tmp_path_dist_ckpt, tp, cp): + """Test MTP forward and backward with Mamba hybrid model.""" + tp_ref = 1 + cp_ref = 1 + args = self.create_test_args(tp_ref, cp_ref, self.seq_length, self.micro_batch_size) + set_args(args) + torch.manual_seed(_SEED) + Utils.initialize_model_parallel( + tensor_model_parallel_size=tp_ref, context_parallel_size=cp_ref + ) + batch = self.get_batch(self.seq_length, self.micro_batch_size) + tokens, labels, loss_mask, attention_mask, position_ids = batch.values() + + mamba_model_ref, optimizer, opt_param_scheduler = setup_model_and_optimizer( + self.model_provider, ModelType.encoder_or_decoder + ) + + output_ref = 
mamba_model_ref[0].forward( + input_ids=tokens, + position_ids=position_ids, + attention_mask=attention_mask, + labels=labels, + loss_mask=loss_mask, + ) + tracker = MTPLossLoggingHelper.tracker + mtp_loss_ref = None + assert "values" in tracker + mtp_loss_ref = tracker['values'].clone() + MTPLossLoggingHelper.clean_loss_in_tracker() + + iteration = 123 + num_floating_point_operations_so_far = 456 + + def set_ckpt_path(ckpt_path): + args.save = ckpt_path + args.load = ckpt_path + + with TempNamedDir(tmp_path_dist_ckpt / 'test_mtp_mamba_model_reconfiguration') as ckpt_dir: + set_ckpt_path(ckpt_dir) + save_checkpoint( + iteration, + mamba_model_ref, + optimizer, + opt_param_scheduler, + num_floating_point_operations_so_far, + ) + + expected_ckpt_path = args.save / "iter_0000123" / ".metadata" + assert os.path.exists(expected_ckpt_path) + + Utils.destroy_model_parallel() + args = self.create_test_args(tp, cp, self.seq_length, self.micro_batch_size) + set_args(args) + set_ckpt_path(ckpt_dir) + torch.manual_seed(_SEED) + Utils.initialize_model_parallel(tensor_model_parallel_size=tp, context_parallel_size=cp) + mamba_model, optimizer, opt_param_scheduler = setup_model_and_optimizer( + self.model_provider, ModelType.encoder_or_decoder + ) + load_checkpoint(mamba_model, optimizer, opt_param_scheduler, strict=False) + + batch["output_ref"] = output_ref + batch = get_batch_on_this_cp_rank(batch) + tokens, labels, loss_mask, attention_mask, position_ids, output_ref = batch.values() + output = mamba_model[0].forward( + input_ids=tokens, + position_ids=position_ids, + attention_mask=attention_mask, + labels=labels, + loss_mask=loss_mask, + ) + tracker = MTPLossLoggingHelper.tracker + assert "values" in tracker + mtp_loss = tracker['values'].clone() + pg_collection = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['cp']) + torch.distributed.all_reduce( + mtp_loss, group=pg_collection.cp, op=torch.distributed.ReduceOp.AVG + ) + 
MTPLossLoggingHelper.clean_loss_in_tracker() + assert torch.allclose(output_ref, output, rtol=1e-03, atol=1e-03) + assert torch.allclose(mtp_loss, mtp_loss_ref, rtol=1e-02, atol=1e-02) + + assert output.shape[0] == self.micro_batch_size + assert output.shape[1] == self.seq_length / cp + + loss = output.mean() + loss.backward() + for name, param in mamba_model[0].named_parameters(): + assert param.main_grad is not None + + @pytest.mark.skipif(not HAVE_TE, reason="transformer_engine not available") + def test_attention_mask_validation_mamba(self): + """Test that attention mask type validation works for Mamba hybrid models.""" + tp = 1 + cp = 1 + args = self.create_test_args(tp, cp, self.seq_length, self.micro_batch_size) + set_args(args) + torch.manual_seed(_SEED) + Utils.initialize_model_parallel(tensor_model_parallel_size=tp, context_parallel_size=cp) + try: + mamba_model = get_model(self.model_provider, ModelType.encoder_or_decoder) + mamba_model = unwrap_model(mamba_model) + assert isinstance(mamba_model[0], MambaModel) + assert mamba_model[0].mtp is not None + except AssertionError as e: + if "Multi-Token Prediction (MTP) is not yet supported" in str(e): + pytest.fail(f"Attention mask validation failed for Mamba hybrid model: {e}") + else: + raise From dceb1fb7ce1d18c19c38815f3573180b6d527701 Mon Sep 17 00:00:00 2001 From: Santosh Bhavani Date: Mon, 2 Feb 2026 00:28:05 -0600 Subject: [PATCH 021/231] docs: improve Megatron-LM and Megatron Core descriptions (#3115) Signed-off-by: Santosh Bhavani Co-authored-by: Xin Yao --- README.md | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 6fa300a6d4d..3551e74762c 100644 --- a/README.md +++ b/README.md @@ -13,14 +13,13 @@ Megatron-LM and Megatron Core ## About -**Megatron-Core (MCore)**: Composable library with GPU-optimized building blocks for custom training frameworks. 
-You can install this library using pip or use it within the Megatron-LM GitHub repository. +This repository contains two components: **Megatron-LM** and **Megatron Core**. -**Megatron-LM**: Reference implementation that includes end-to-end examples utilizing Megatron Core. +**Megatron-LM** is a reference example that includes Megatron Core plus pre-configured training scripts. Best for research teams, learning distributed training, and quick experimentation. -**Megatron-Bridge**: Training library with bidirectional Hugging Face ↔ Megatron checkpoint conversion, flexible training loops, and example model training recipes. +**Megatron Core** is a composable library with GPU-optimized building blocks for custom training frameworks. It provides transformer building blocks, advanced parallelism strategies (TP, PP, DP, EP, CP), mixed precision support (FP16, BF16, FP8, FP4), and model architectures. Best for framework developers and ML engineers building custom training pipelines. -For more information, refer to [Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge). +**[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** provides bidirectional Hugging Face ↔ Megatron checkpoint conversion with production-ready recipes. ## Quick Start @@ -44,7 +43,8 @@ Install Megatron Core with pip: # Latest News -- **[2025/12]** 🎉 **Megatron Core development has moved to GitHub!** All development and CI now happens in the open. We welcome community contributions. +- **[2026/01]** **[Dynamic Context Parallelism](https://developer.nvidia.com/blog/speeding-up-variable-length-training-with-dynamic-context-parallelism-and-nvidia-megatron-core/)** - Up to 1.48x speedup for variable-length sequence training with adaptive CP sizing. +- **[2025/12]** **Megatron Core development has moved to GitHub!** All development and CI now happens in the open. We welcome community contributions. 
- **[2025/10]** **[Megatron Dev Branch](https://github.com/NVIDIA/Megatron-LM/tree/dev)** - early access branch with experimental features. - **[2025/10]** **[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** - Bidirectional converter for interoperability between Hugging Face and Megatron checkpoints, featuring production-ready recipes for popular models. - **[2025/08]** **[MoE Q3-Q4 2025 Roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729)** - Comprehensive roadmap for MoE features including DeepSeek-V3, Qwen3, advanced parallelism strategies, FP8 optimizations, and Blackwell performance enhancements. @@ -57,7 +57,7 @@ Install Megatron Core with pip: - **[2024/07]** Megatron Core v0.7 improves scalability and training resiliency and adds support for multimodal training ([blog](https://developer.nvidia.com/blog/train-generative-ai-models-more-efficiently-with-new-nvidia-Megatron-Core-functionalities/)). - **[2024/06]** Megatron Core added supports for Mamba-based models. Check out our paper [An Empirical Study of Mamba-based Language Models](https://arxiv.org/pdf/2406.07887) and [code example](https://github.com/NVIDIA/Megatron-LM/tree/ssm/examples/mamba). -- **[2024/01 Announcement]** NVIDIA has released the core capabilities in **Megatron-LM** into [**Megatron Core**](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core) in this repository. Megatron Core expands upon Megatron-LM's GPU-optimized techniques with more cutting-edge innovations on system-level optimizations, featuring composable and modular APIs. Explore the [Megatron Core intro](#Megatron Core) for more details. +- **[2024/01 Announcement]** NVIDIA has released the core capabilities in **Megatron-LM** into [**Megatron Core**](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core) in this repository. 
Megatron Core expands upon Megatron-LM's GPU-optimized techniques with more cutting-edge innovations on system-level optimizations, featuring composable and modular APIs. @@ -93,7 +93,7 @@ Megatron-LM/ # Performance Benchmarking -For our latest performance benchmarking results, please refer to [NVIDIA NeMo Framework Performance Summary](https://docs.nvidia.com/nemo/megatron-bridge/latest/performance-summary.html). +For our latest performance benchmarking results, please refer to [NVIDIA Megatron Bridge Performance Summary](https://docs.nvidia.com/nemo/megatron-bridge/latest/performance-summary.html). Our codebase efficiently trains models from 2B to 462B parameters across thousands of GPUs, achieving up to **47% Model FLOP Utilization (MFU)** on H100 clusters. @@ -131,6 +131,10 @@ We also strong scaled the standard GPT-3 model (our version has slightly more th +# Roadmaps + +- **[MoE Roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729)** - DeepSeek-V3, Qwen3, advanced parallelism, FP8 optimizations, and Blackwell enhancements + # Resources ## Getting Help From f4502eb1c92f77e0ed190cac00293e8ac192543b Mon Sep 17 00:00:00 2001 From: Ahmad Kiswani Date: Mon, 2 Feb 2026 08:34:23 +0200 Subject: [PATCH 022/231] Handle `step` key correctly in checkpoint save with `--optimizer-cpu-offload` (#2874) --- megatron/core/optimizer/distrib_optimizer.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/megatron/core/optimizer/distrib_optimizer.py b/megatron/core/optimizer/distrib_optimizer.py index e2b1b0dbd73..4192b0bb73c 100644 --- a/megatron/core/optimizer/distrib_optimizer.py +++ b/megatron/core/optimizer/distrib_optimizer.py @@ -1661,6 +1661,11 @@ def sharded_param_state_dp_reshardable( if key == 'padding': tensors[key] = LocalNonpersistentObject(tensors[key]) continue + if key == 'step': + # The optimizer state of STEP is a 0-dim tensor and is handled + # separately via param_groups, not as part of the gradient buffer. 
+ tensors[key] = LocalNonpersistentObject(tensors[key]) + continue assert tensors[key].shape == (gbuf_local_end - gbuf_local_start,), ( tensors[key].shape, gbuf_local_start, From e836e6225fbb9e9950058e1dffbb1242e2842b83 Mon Sep 17 00:00:00 2001 From: Deepak Narayanan Date: Mon, 2 Feb 2026 00:58:34 -0800 Subject: [PATCH 023/231] Fix two minor bugs in MTP implementation for hybrid models (#3194) Signed-off-by: Deepak Narayanan --- megatron/core/transformer/cuda_graphs.py | 2 +- megatron/training/training.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/megatron/core/transformer/cuda_graphs.py b/megatron/core/transformer/cuda_graphs.py index 3643c42c3ce..df6f88b3b93 100644 --- a/megatron/core/transformer/cuda_graphs.py +++ b/megatron/core/transformer/cuda_graphs.py @@ -1738,7 +1738,7 @@ def __init__(self, model, config, seq_length, micro_batch_size, optimizers=[]): callables.append(layer) callables_is_mtp.append(False) for layer_number in range(num_mtp_layers): - layer = chunk_with_decoder.mtp.layers[layer_number].transformer_layer + layer = chunk_with_decoder.mtp.layers[layer_number].mtp_model_layer if _layer_is_graphable(layer, config): num_graphable_layers += 1 callables.append(layer) diff --git a/megatron/training/training.py b/megatron/training/training.py index 5206b526e18..7b5b6728dd2 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -588,6 +588,9 @@ def transformer_flops(): # Calculate the number of each type of layer. num_attn_layers, num_mamba_layers, num_mlp_layers, num_moe_layers = calculate_layer_counts() + mtp_num_layers = args.mtp_num_layers + if mtp_num_layers is None: + mtp_num_layers = 0 # Compute hybrid model FLOPs. 
return hybrid_flops( batch_size=batch_size, @@ -614,7 +617,7 @@ def transformer_flops(): else args.moe_shared_expert_intermediate_size), num_experts_routed_to=args.moe_router_topk, vocab_size=args.padded_vocab_size, - mtp_num_layers=args.mtp_num_layers, + mtp_num_layers=mtp_num_layers, ) else: # Compute standard Transformer model FLOPs. From 1362e4a51d0bbe9f939d2cbfd0fd51664fdd3aee Mon Sep 17 00:00:00 2001 From: mvirts Date: Mon, 2 Feb 2026 00:03:52 -0700 Subject: [PATCH 024/231] Update README.md (#2111) Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> --- megatron/core/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/core/README.md b/megatron/core/README.md index 6f2bb49a1dd..a9134be41cd 100644 --- a/megatron/core/README.md +++ b/megatron/core/README.md @@ -48,4 +48,4 @@ torchrun --nproc_per_node=2 examples/run_simple_mcore_train_loop.py --- -*For complete installation instructions, performance benchmarks, and ecosystem information, see the [main README](../README.md).* +*For complete installation instructions, performance benchmarks, and ecosystem information, see the [main README](../../README.md).* From 70719cdd970ce8bafba4c564d5ee550f9e647ff4 Mon Sep 17 00:00:00 2001 From: Li Tao Date: Mon, 2 Feb 2026 15:04:20 +0800 Subject: [PATCH 025/231] mRoPE for MTP (#3114) Co-authored-by: Maanu Grover <109391026+maanug-nv@users.noreply.github.com> --- megatron/training/arguments.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 51a123e78c8..4e1c3bc6f17 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -1301,9 +1301,11 @@ def validate_args(args, defaults={}): if args.mtp_num_layers: assert not args.use_legacy_models, "The legacy Megatron models does not support Multi-Token Prediction (MTP)." 
- assert args.position_embedding_type == "rope" or args.position_embedding_type == "none", ( - f"Multi-Token Prediction (MTP) is not supported with {args.position_embedding_type} position embedding type." - + f"The supported position embedding types are rope and none." + # MTP is compatible with position embedding types that use position_ids. + supported_position_types = ["learned_absolute", "rope", "mrope", "none"] + assert args.position_embedding_type in supported_position_types, ( + f"Multi-Token Prediction (MTP) is not supported with '{args.position_embedding_type}' position embedding type. " + f"The supported position embedding types are: {', '.join(supported_position_types)}." ) if args.cpu_offloading_num_layers > 0: From 31d0c87127eb683e49d228daf1862c2db2a6f000 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 2 Feb 2026 09:48:12 +0000 Subject: [PATCH 026/231] Revert "Fix two minor bugs in MTP implementation for hybrid models (#3194)" This reverts commit e836e6225fbb9e9950058e1dffbb1242e2842b83. 
--- megatron/core/transformer/cuda_graphs.py | 2 +- megatron/training/training.py | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/megatron/core/transformer/cuda_graphs.py b/megatron/core/transformer/cuda_graphs.py index df6f88b3b93..3643c42c3ce 100644 --- a/megatron/core/transformer/cuda_graphs.py +++ b/megatron/core/transformer/cuda_graphs.py @@ -1738,7 +1738,7 @@ def __init__(self, model, config, seq_length, micro_batch_size, optimizers=[]): callables.append(layer) callables_is_mtp.append(False) for layer_number in range(num_mtp_layers): - layer = chunk_with_decoder.mtp.layers[layer_number].mtp_model_layer + layer = chunk_with_decoder.mtp.layers[layer_number].transformer_layer if _layer_is_graphable(layer, config): num_graphable_layers += 1 callables.append(layer) diff --git a/megatron/training/training.py b/megatron/training/training.py index 7b5b6728dd2..5206b526e18 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -588,9 +588,6 @@ def transformer_flops(): # Calculate the number of each type of layer. num_attn_layers, num_mamba_layers, num_mlp_layers, num_moe_layers = calculate_layer_counts() - mtp_num_layers = args.mtp_num_layers - if mtp_num_layers is None: - mtp_num_layers = 0 # Compute hybrid model FLOPs. return hybrid_flops( batch_size=batch_size, @@ -617,7 +614,7 @@ def transformer_flops(): else args.moe_shared_expert_intermediate_size), num_experts_routed_to=args.moe_router_topk, vocab_size=args.padded_vocab_size, - mtp_num_layers=mtp_num_layers, + mtp_num_layers=args.mtp_num_layers, ) else: # Compute standard Transformer model FLOPs. From a0cc8caec6e448afbdaa7d6ac3e44de8ef3213fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 2 Feb 2026 09:48:21 +0000 Subject: [PATCH 027/231] Revert "Add MTP support for hybrid models (#2363)" This reverts commit 300d1b6550b46b2ce572f78e1c45f5ac2acb7d7f. 
--- mamba_builders.py | 8 +- .../common/language_module/language_module.py | 30 +- .../common/model_chunk_schedule_plan.py | 2 +- .../core/models/gpt/fine_grained_callables.py | 4 +- megatron/core/models/gpt/gpt_layer_specs.py | 2 +- megatron/core/models/gpt/gpt_model.py | 112 +++++-- .../core/models/mamba/mamba_layer_specs.py | 33 -- megatron/core/models/mamba/mamba_model.py | 91 +---- megatron/core/pipeline_parallel/schedules.py | 5 +- megatron/core/ssm/mamba_block.py | 28 +- .../core/ssm/mamba_hybrid_layer_allocation.py | 149 +-------- megatron/core/transformer/moe/moe_layer.py | 12 +- megatron/core/transformer/moe/router.py | 51 +-- .../transformer/multi_token_prediction.py | 314 +++--------------- .../core/transformer/transformer_config.py | 9 - .../core/transformer/transformer_layer.py | 18 +- megatron/training/arguments.py | 73 ---- megatron/training/checkpointing.py | 6 - megatron/training/training.py | 23 +- pretrain_mamba.py | 1 - .../unit_tests/models/test_mamba_moe_model.py | 2 - .../ssm/test_mamba_hybrid_layer_allocation.py | 139 +------- .../test_multi_token_prediction.py | 263 +-------------- 23 files changed, 205 insertions(+), 1170 deletions(-) diff --git a/mamba_builders.py b/mamba_builders.py index 5d31af60475..6a792ba6ea5 100644 --- a/mamba_builders.py +++ b/mamba_builders.py @@ -8,7 +8,6 @@ from megatron.training.arguments import core_transformer_config_from_args from megatron.core.models.mamba.mamba_layer_specs import mamba_inference_stack_spec - def mamba_builder(args, pre_process, post_process, vp_stage=None, config=None, pg_collection=None): print_rank_0('building MAMBA model ...') if config is None: @@ -16,10 +15,8 @@ def mamba_builder(args, pre_process, post_process, vp_stage=None, config=None, p assert args.use_legacy_models is False, "Mamba only supported in Mcore!" 
if config.transformer_impl == "inference_optimized": - mamba_stack_spec = mamba_inference_stack_spec - assert ( - not config.inference_fuse_tp_communication - ), "inference_fuse_tp_communication is not supported for Mamba" + mamba_stack_spec = mamba_inference_stack_spec + assert not config.inference_fuse_tp_communication, "inference_fuse_tp_communication is not supported for Mamba" elif args.spec is not None: mamba_stack_spec = import_module(args.spec) else: @@ -42,7 +39,6 @@ def mamba_builder(args, pre_process, post_process, vp_stage=None, config=None, p rotary_percent=args.rotary_percent, rotary_base=args.rotary_base, pg_collection=pg_collection, - vp_stage=vp_stage, ) for l in range(model.decoder.num_layers_per_pipeline_rank): diff --git a/megatron/core/models/common/language_module/language_module.py b/megatron/core/models/common/language_module/language_module.py index 57975b2958b..b0fa6126b63 100644 --- a/megatron/core/models/common/language_module/language_module.py +++ b/megatron/core/models/common/language_module/language_module.py @@ -23,7 +23,6 @@ from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.transformer.enums import AttnBackend, CudaGraphScope from megatron.core.transformer.module import MegatronModule -from megatron.core.transformer.multi_token_prediction import tie_word_embeddings_state_dict from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.utils import ensure_metadata_has_dp_cp_group from megatron.core.utils import ( @@ -256,20 +255,12 @@ def setup_embeddings_and_output_layer(self) -> None: LanguageModule.embedding_warning_printed = True def shared_embedding_or_output_weight(self) -> Tensor: - """Gets the embedding weight or output logit weights when share embedding and output weights set to True - or when use Multi-Token Prediction (MTP). + """Gets the emedding weight or output logit weights when share embedding and output weights set to True. 
Returns: - Tensor: During pre processing or MTP process it returns the input embeddings weight while during post processing it returns the final output layers weight + Tensor: During pre processing it returns the input embeddings weight while during post processing it returns the final output layers weight """ - if self.pre_process or getattr(self, 'mtp_process', False): - # Multi-Token Prediction (MTP) need both embedding layer and output layer. - # So there will be both embedding layer and output layer in the mtp process stage. - # When share_embeddings_and_output_weights is True, the embedding weight is the - # canonical shared weight and is passed to the output layer during forward. - assert hasattr( - self, 'embedding' - ), f"embedding is needed in this pipeline stage, but it is not initialized." + if self.pre_process: return self.embedding.word_embeddings.weight elif self.post_process: return self.output_layer.weight @@ -302,21 +293,6 @@ def sharded_state_dict( output_layer_weight_key = f'{prefix}output_layer.weight' output_layer_bias_key = f'{prefix}output_layer.bias' - # Multi-Token Prediction (MTP) needs embedding layer in mtp process stage. - # If MTP is not placed in the pre processing stage, we need to maintain a copy of - # embedding layer in the mtp process stage and tie it to the embedding in the pre - # processing stage. - # Note: MTP loss is computed at post_process stage, so the output_layer on mtp_process - # rank doesn't need special tying - it's not used for loss computation. 
- if getattr(self, 'mtp_process', False) and not self.pre_process: - emb_weight = self.embedding.word_embeddings.weight - tie_word_embeddings_state_dict( - sharded_state_dict, - emb_weight, - first_stage_word_emb_key, - tp_group=self.tp_group, - dp_cp_group=metadata['dp_cp_group'], - ) if self.share_embeddings_and_output_weights: self.tie_embeddings_and_output_weights_state_dict( sharded_state_dict, output_layer_weight_key, first_stage_word_emb_key, metadata diff --git a/megatron/core/models/common/model_chunk_schedule_plan.py b/megatron/core/models/common/model_chunk_schedule_plan.py index 3b0e3a13b76..033e8e808f9 100644 --- a/megatron/core/models/common/model_chunk_schedule_plan.py +++ b/megatron/core/models/common/model_chunk_schedule_plan.py @@ -123,7 +123,7 @@ def _build_callable_nodes(self, event, comp_stream, comm_stream, extra_args): # get flags for latter use is_mtp = isinstance(self.layer, MultiTokenPredictionLayer) is_moe = ( - isinstance(self.layer.mtp_model_layer.mlp, MoELayer) + isinstance(self.layer.transformer_layer.mlp, MoELayer) if is_mtp else isinstance(self.layer.mlp, MoELayer) ) diff --git a/megatron/core/models/gpt/fine_grained_callables.py b/megatron/core/models/gpt/fine_grained_callables.py index e17ed0a5d40..7cee9d2973c 100644 --- a/megatron/core/models/gpt/fine_grained_callables.py +++ b/megatron/core/models/gpt/fine_grained_callables.py @@ -613,9 +613,9 @@ def build_mtp_layer_callables(layer): multi-token prediction layer nodes (attention, MLP, etc.) """ - forward_funcs, backward_dw = build_transformer_layer_callables(layer.mtp_model_layer) + forward_funcs, backward_dw = build_transformer_layer_callables(layer.transformer_layer) attn_forward, dispatch_forward, mlp_forward, combine_forward, _ = forward_funcs - is_moe = isinstance(layer.mtp_model_layer.mlp, MoELayer) + is_moe = isinstance(layer.transformer_layer.mlp, MoELayer) assert is_moe, "MTP layer in a2a overlap only supports MoE layer for now." 
def submodule_mtp_attn_forward(node, hidden_states): diff --git a/megatron/core/models/gpt/gpt_layer_specs.py b/megatron/core/models/gpt/gpt_layer_specs.py index bebb4350d27..49501ee54eb 100755 --- a/megatron/core/models/gpt/gpt_layer_specs.py +++ b/megatron/core/models/gpt/gpt_layer_specs.py @@ -704,7 +704,7 @@ def get_gpt_mtp_block_spec_for_backend( raise ValueError(f"Invalid spec: {spec}") mtp_layer_spec = get_mtp_layer_spec_for_backend( - mtp_model_layer_spec=transformer_layer_spec, backend=backend + transformer_layer_spec=transformer_layer_spec, backend=backend ) mtp_num_layers = config.mtp_num_layers if config.mtp_num_layers else 0 mtp_layer_specs = [mtp_layer_spec] * mtp_num_layers diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index 4b96465a31e..e287344c13d 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -6,7 +6,7 @@ import torch from torch import Tensor -from megatron.core import tensor_parallel +from megatron.core import parallel_state, tensor_parallel from megatron.core.config_logger import has_config_logger_enabled, log_config_to_disk from megatron.core.dist_checkpointing.mapping import ShardedStateDict from megatron.core.inference.contexts import BaseInferenceContext @@ -26,9 +26,11 @@ from megatron.core.tensor_parallel import gather_from_sequence_parallel_region from megatron.core.transformer.enums import CudaGraphScope, ModelType from megatron.core.transformer.multi_token_prediction import ( + MTPLossAutoScaler, + MTPLossLoggingHelper, MultiTokenPredictionBlock, - mtp_on_this_rank, - process_mtp_loss, + roll_tensor, + tie_word_embeddings_state_dict, ) from megatron.core.transformer.spec_utils import ModuleSpec from megatron.core.transformer.transformer_block import TransformerBlock @@ -142,9 +144,7 @@ def __init__( self.rotary_base = rotary_base self.rotary_scaling = rope_scaling self.mtp_block_spec = mtp_block_spec - self.mtp_process = mtp_block_spec is not 
None and mtp_on_this_rank( - self.config, ignore_virtual=False, vp_stage=vp_stage - ) + self.mtp_process = mtp_block_spec is not None if self.pre_process or self.mtp_process: self.embedding = LanguageModelEmbedding( @@ -609,19 +609,56 @@ def _postprocess( return hidden_states if self.config.mtp_num_layers is not None: - hidden_states = process_mtp_loss( - hidden_states=hidden_states, - labels=labels, - loss_mask=loss_mask, - output_layer=self.output_layer, - output_weight=output_weight, - runtime_gather_output=runtime_gather_output, - is_training=self.training, - compute_language_model_loss=self.compute_language_model_loss, - config=self.config, - cp_group=self.pg_collection.cp, - packed_seq_params=packed_seq_params, - ) + mtp_labels = labels.clone() + hidden_states_list = torch.chunk(hidden_states, 1 + self.config.mtp_num_layers, dim=0) + hidden_states = hidden_states_list[0] + if loss_mask is None: + # if loss_mask is not provided, use all ones as loss_mask + loss_mask = torch.ones_like(mtp_labels) + for mtp_layer_number in range(self.config.mtp_num_layers): + # output + mtp_logits, _ = self.output_layer( + hidden_states_list[mtp_layer_number + 1], + weight=output_weight, + runtime_gather_output=runtime_gather_output, + ) + # Calc loss for the current Multi-Token Prediction (MTP) layers. 
+ mtp_labels, _ = roll_tensor( + mtp_labels, + shifts=-1, + dims=-1, + cp_group=self.cp_group, + packed_seq_params=packed_seq_params, + ) + loss_mask, num_tokens = roll_tensor( + loss_mask, + shifts=-1, + dims=-1, + cp_group=self.cp_group, + packed_seq_params=packed_seq_params, + ) + mtp_loss = self.compute_language_model_loss(mtp_labels, mtp_logits) + mtp_loss = loss_mask * mtp_loss + if self.training: + # TODO(shifangx): remove the use of parallel_state here + # after moving loss logging to loss_func in pretrain_gpt.py + MTPLossLoggingHelper.save_loss_to_tracker( + torch.sum(mtp_loss) / num_tokens, + mtp_layer_number, + self.config.mtp_num_layers, + avg_group=parallel_state.get_data_parallel_group( + with_context_parallel=True + ), + ) + mtp_loss_scale = self.config.mtp_loss_scaling_factor / self.config.mtp_num_layers + if self.config.calculate_per_token_loss: + hidden_states = MTPLossAutoScaler.apply( + hidden_states, mtp_loss_scale * mtp_loss + ) + else: + hidden_states = MTPLossAutoScaler.apply( + hidden_states, mtp_loss_scale * mtp_loss / num_tokens + ) sequence_parallel_override = False if in_inference_mode and inference_context.materialize_only_last_token_logits: @@ -678,6 +715,27 @@ def _postprocess( return loss + def shared_embedding_or_output_weight(self) -> Tensor: + """Gets the embedding weight or output logit weights when share input embedding and + output weights set to True or when use Multi-Token Prediction (MTP) feature. + + Returns: + Tensor: During pre processing or MTP process it returns the input embeddings weight. + Otherwise, during post processing it returns the final output layers weight. + """ + if self.pre_process or self.mtp_process: + # Multi-Token Prediction (MTP) need both embedding layer and output layer. + # So there will be both embedding layer and output layer in the mtp process stage. 
+ # In this case, if share_embeddings_and_output_weights is True, the shared weights + # will be stored in embedding layer, and output layer will not have any weight. + assert hasattr( + self, 'embedding' + ), f"embedding is needed in this pipeline stage, but it is not initialized." + return self.embedding.word_embeddings.weight + elif self.post_process: + return self.output_layer.weight + return None + def build_schedule_plan( self, input_ids: Tensor, @@ -768,4 +826,20 @@ def sharded_state_dict( output_extra_state and output_extra_state.data ), f'Expected output layer extra state to be empty, got: {output_extra_state}' + # Multi-Token Prediction (MTP) need embedding layer in mtp process stage. + # If MTP is not placed in the pre processing stage, we need to maintain a copy of + # embedding layer in the mtp process stage and tie it to the embedding in the pre + # processing stage. + # Now MTP loss is computed in post processing stage, so the output_layer is not needed. + if self.mtp_process and not self.pre_process: + emb_weight_key = f'{prefix}embedding.word_embeddings.weight' + emb_weight = self.embedding.word_embeddings.weight + tie_word_embeddings_state_dict( + sharded_state_dict, + emb_weight, + emb_weight_key, + tp_group=self.tp_group, + dp_cp_group=metadata['dp_cp_group'], + ) + return sharded_state_dict diff --git a/megatron/core/models/mamba/mamba_layer_specs.py b/megatron/core/models/mamba/mamba_layer_specs.py index 6ca628475be..b87124bab1d 100755 --- a/megatron/core/models/mamba/mamba_layer_specs.py +++ b/megatron/core/models/mamba/mamba_layer_specs.py @@ -1,7 +1,6 @@ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
from megatron.core.extensions.transformer_engine import ( - TEColumnParallelLinear, TEDotProductAttention, TELayerNormColumnParallelLinear, TENorm, @@ -20,12 +19,6 @@ from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.mlp import MLP, MLPSubmodules -from megatron.core.transformer.multi_token_prediction import ( - MultiTokenPredictionBlock, - MultiTokenPredictionBlockSubmodules, - MultiTokenPredictionLayer, - MultiTokenPredictionLayerSubmodules, -) from megatron.core.transformer.spec_utils import ModuleSpec from megatron.core.transformer.transformer_layer import ( MoETransformerLayer, @@ -33,7 +26,6 @@ TransformerLayerSubmodules, ) -# This should be private and should not be used outside of this file. moe = get_moe_module_spec( use_te=True, num_experts=8, # Can be any positive integer (must not be None). @@ -41,28 +33,6 @@ moe_use_legacy_grouped_gemm=False, ) - -# MTP block spec for Mamba - provides norms and projection only. 
-# Inner layers are built by MultiTokenPredictionLayer using nested MambaStack -_mamba_mtp_block_spec = ModuleSpec( - module=MultiTokenPredictionBlock, - submodules=MultiTokenPredictionBlockSubmodules( - layer_specs=[ - ModuleSpec( - module=MultiTokenPredictionLayer, - submodules=MultiTokenPredictionLayerSubmodules( - enorm=TENorm, - hnorm=TENorm, - eh_proj=TEColumnParallelLinear, - mtp_model_layer=None, # Built via pattern + mamba_submodules - layer_norm=TENorm, - ), - ) - ] - ), -) - - mamba_stack_spec = ModuleSpec( module=MambaStack, submodules=MambaStackSubmodules( @@ -117,11 +87,9 @@ pre_mlp_layernorm=TENorm, mlp=moe, mlp_bda=get_bias_dropout_add ), ), - mtp_block_spec=_mamba_mtp_block_spec, ), ) - mamba_inference_stack_spec = ModuleSpec( module=MambaStack, submodules=MambaStackSubmodules( @@ -179,6 +147,5 @@ pre_mlp_layernorm=TENorm, mlp=moe, mlp_bda=get_bias_dropout_add ), ), - mtp_block_spec=_mamba_mtp_block_spec, ), ) diff --git a/megatron/core/models/mamba/mamba_model.py b/megatron/core/models/mamba/mamba_model.py index 0a783391437..8d45e1d0147 100644 --- a/megatron/core/models/mamba/mamba_model.py +++ b/megatron/core/models/mamba/mamba_model.py @@ -16,11 +16,6 @@ from megatron.core.tensor_parallel import gather_from_sequence_parallel_region from megatron.core.transformer import TransformerConfig from megatron.core.transformer.enums import ModelType -from megatron.core.transformer.multi_token_prediction import ( - MultiTokenPredictionBlock, - mtp_on_this_rank, - process_mtp_loss, -) from megatron.core.transformer.spec_utils import ModuleSpec, build_module from megatron.core.utils import ( WrappedTensor, @@ -43,11 +38,7 @@ class MambaModel(LanguageModule): hybrid_attention_ratio (float, optional): The target ratio of attention layers to total layers hybrid_mlp_ratio (float, optional): The target ratio of mlp layers to total layers - hybrid_override_pattern (str, optional): Unified hybrid layer pattern with optional MTP. - Format: "<main_pattern>/<mtp_pattern>/<mtp_pattern>/..."
- Examples: - - "M*M*" -> main decoder only, no MTP - - "M*M*/MM/MM" -> main="M*M*", mtp="MM", 2 depths + hybrid_override_pattern (str, optional): The hybrid layer pattern to override with post_process (bool, optional): Include an output layer (used with pipeline parallelism). Defaults to True. fp16_lm_cross_entropy (bool, optional): Defaults to False. @@ -88,7 +79,6 @@ def __init__( scatter_embedding_sequence_parallel: bool = True, seq_len_interpolation_factor: Optional[float] = None, pg_collection: Optional[ProcessGroupCollection] = None, - vp_stage: Optional[int] = None, ) -> None: super().__init__(config=config, pg_collection=pg_collection) @@ -107,27 +97,12 @@ def __init__( self.parallel_output = parallel_output self.share_embeddings_and_output_weights = share_embeddings_and_output_weights self.position_embedding_type = position_embedding_type - self.vp_stage = vp_stage - - # Parse unified pattern to extract main and MTP components - from megatron.core.ssm.mamba_hybrid_layer_allocation import parse_hybrid_pattern - - parsed = parse_hybrid_pattern(hybrid_override_pattern) - self.mtp_pattern = parsed.mtp_pattern - self.mtp_num_depths = parsed.mtp_num_depths - - # Determine if MTP is needed (based on pattern parsing) - self.mtp_process = ( - self.mtp_pattern is not None - and self.mtp_num_depths > 0 - and mtp_on_this_rank(self.config, vp_stage=self.vp_stage) - ) # megatron core pipelining currently depends on model type # TODO: remove this dependency ? 
self.model_type = ModelType.encoder_or_decoder - if self.pre_process or self.mtp_process: + if self.pre_process: self.embedding = LanguageModelEmbedding( config=self.config, vocab_size=self.vocab_size, @@ -153,33 +128,14 @@ def __init__( pre_process=self.pre_process, hybrid_attention_ratio=self.hybrid_attention_ratio, hybrid_mlp_ratio=self.hybrid_mlp_ratio, - hybrid_override_pattern=parsed.main_pattern, + hybrid_override_pattern=self.hybrid_override_pattern, post_process=self.post_process, dtype=config.params_dtype, pg_collection=self.pg_collection, ) - # MTP block - uses mtp_block_spec from mamba_stack_spec.submodules - if self.mtp_process: - mamba_submodules = mamba_stack_spec.submodules - mtp_block_spec = mamba_submodules.mtp_block_spec - assert mtp_block_spec is not None, ( - "MTP pattern specified but mtp_block_spec is None in mamba_stack_spec.submodules. " - "Ensure mamba_stack_spec includes mtp_block_spec for MTP support." - ) - - self.mtp = MultiTokenPredictionBlock( - config=self.config, - spec=mtp_block_spec, - pg_collection=self.pg_collection, - vp_stage=self.vp_stage, - mtp_layer_pattern=self.mtp_pattern, - mtp_num_depths=self.mtp_num_depths, - mamba_submodules=mamba_submodules, - ) - # Output - if post_process or self.mtp_process: + if post_process: self.output_layer = tensor_parallel.ColumnParallelLinear( config.hidden_size, self.vocab_size, @@ -193,7 +149,7 @@ def __init__( tp_group=self.pg_collection.tp, ) - if self.pre_process or self.post_process or self.mtp_process: + if self.pre_process or self.post_process: self.setup_embeddings_and_output_layer() for name, module in self.named_modules(): @@ -228,7 +184,6 @@ def forward( runtime_gather_output: Optional[bool] = None, *, inference_params: Optional[BaseInferenceContext] = None, - loss_mask: Optional[Tensor] = None, packed_seq_params: Optional[PackedSeqParams] = None, padding_mask: Optional[Tensor] = None, ) -> Tensor: @@ -303,39 +258,13 @@ def forward( padding_mask=padding_mask, ) - output_weight 
= None - if self.share_embeddings_and_output_weights: - output_weight = self.shared_embedding_or_output_weight() - - if self.mtp_process: - hidden_states = self.mtp( - input_ids=input_ids, - position_ids=position_ids, - hidden_states=hidden_states, - attention_mask=attention_mask, - inference_params=inference_params, - rotary_pos_emb=rotary_pos_emb, - packed_seq_params=packed_seq_params, - embedding=self.embedding, - ) - if not self.post_process: return hidden_states - if self.config.mtp_num_layers is not None: - hidden_states = process_mtp_loss( - hidden_states=hidden_states, - labels=labels, - loss_mask=loss_mask, - output_layer=self.output_layer, - output_weight=output_weight, - runtime_gather_output=runtime_gather_output, - is_training=self.training, - compute_language_model_loss=self.compute_language_model_loss, - config=self.config, - cp_group=self.pg_collection.cp, - packed_seq_params=packed_seq_params, - ) + # logits and loss + output_weight = None + if self.share_embeddings_and_output_weights: + output_weight = self.shared_embedding_or_output_weight() sequence_parallel_override = False if in_inference_mode and inference_context.materialize_only_last_token_logits: @@ -352,7 +281,7 @@ def forward( self.output_layer.sequence_parallel = False sequence_parallel_override = True - # Reshape [B, 1, H] to [1, B, H] → extract each sample's true last‐token hidden + # Reshape [B, 1, H] to [1, B, H] → extract each sample’s true last‐token hidden # state ([B, H]) → unsqueeze back to [B, 1, H] # (so that the output layer, which expects S×B×H, receives only the final token) hidden_states = inference_context.last_token_logits( diff --git a/megatron/core/pipeline_parallel/schedules.py b/megatron/core/pipeline_parallel/schedules.py index 15c5adfc7a2..edca62be375 100644 --- a/megatron/core/pipeline_parallel/schedules.py +++ b/megatron/core/pipeline_parallel/schedules.py @@ -212,10 +212,7 @@ def set_current_microbatch(model, microbatch_id): layer.current_microbatch = 
microbatch_id if hasattr(model_with_decoder, 'mtp'): for layer in model_with_decoder.mtp.layers: - assert hasattr( - layer, 'mtp_model_layer' - ), f"MTP layer {layer} must have 'mtp_model_layer' attribute" - layer.mtp_model_layer.current_microbatch = microbatch_id + layer.transformer_layer.current_microbatch = microbatch_id def forward_step_calc_loss( diff --git a/megatron/core/ssm/mamba_block.py b/megatron/core/ssm/mamba_block.py index ffb7b8f6fdb..ef41faae143 100644 --- a/megatron/core/ssm/mamba_block.py +++ b/megatron/core/ssm/mamba_block.py @@ -42,7 +42,6 @@ class MambaStackSubmodules: attention_layer: Union[ModuleSpec, type] = IdentityOp mlp_layer: Union[ModuleSpec, type] = IdentityOp moe_layer: Union[ModuleSpec, type] = IdentityOp - mtp_block_spec: Optional[ModuleSpec] = None class MambaStack(MegatronModule): @@ -86,14 +85,12 @@ def __init__( device=None, dtype=None, pg_collection: ProcessGroupCollection = None, - is_mtp_layer: bool = False, ) -> None: super().__init__(config=config) self.residual_in_fp32 = residual_in_fp32 self.pre_process = pre_process self.post_layer_norm = post_layer_norm self.post_process = post_process - self.is_mtp_layer = is_mtp_layer assert pg_collection is not None, "pg_collection must be provided for MambaStack" @@ -106,32 +103,20 @@ def __init__( self.hybrid_attention_ratio = hybrid_attention_ratio self.hybrid_mlp_ratio = hybrid_mlp_ratio self.hybrid_override_pattern = hybrid_override_pattern - self.pg_collection = pg_collection - - # For MTP layers, always use pattern length (config.num_layers is for main decoder) - if self.is_mtp_layer: - num_layers_for_allocation = len(self.hybrid_override_pattern) - else: - num_layers_for_allocation = ( - self.config.num_layers - if self.config.num_layers is not None - else len(self.hybrid_override_pattern) - ) self.layer_type_list = allocate_layers( - num_layers_for_allocation, + self.config.num_layers, self.hybrid_attention_ratio, self.hybrid_mlp_ratio, self.hybrid_override_pattern, - 
silent=self.is_mtp_layer, ) pp_layer_offset = 0 - if self.pp_group.size() > 1 and not self.is_mtp_layer: + if self.pp_group.size() > 1: pp_layer_offset, self.layer_type_list = self._select_layers_for_pipeline_parallel( self.layer_type_list ) - # Build main decoder layers using shared layer builder + self.layers = nn.ModuleList() for i, layer_type in enumerate(self.layer_type_list): fp8_init_context = get_fp8_context(self.config, i + pp_layer_offset, is_init=True) @@ -152,10 +137,9 @@ def __init__( config=self.config, layer_number=i + 1, pg_collection=pg_collection, - is_mtp_layer=is_mtp_layer, ) elif layer_type == LayerSymbols.MLP: - # MLP layers apply their own pp_layer_offset + # Transformer layers apply their own pp_layer_offset layer = build_module( submodules.mlp_layer, config=self.config, @@ -163,7 +147,7 @@ def __init__( pg_collection=pg_collection, ) elif layer_type == LayerSymbols.MOE: - # MoE layers apply their own pp_layer_offset + # Transformer layers apply their own pp_layer_offset layer = build_module( submodules.moe_layer, config=self.config, @@ -332,7 +316,7 @@ def forward( # Ensure that the tensor passed between pipeline parallel stages is # viewless. See related notes in TransformerBlock and TransformerLayer - hidden_states = make_viewless_tensor( + output = make_viewless_tensor( inp=hidden_states, requires_grad=hidden_states.requires_grad, keep_graph=True ) diff --git a/megatron/core/ssm/mamba_hybrid_layer_allocation.py b/megatron/core/ssm/mamba_hybrid_layer_allocation.py index d7002b2915d..fe997e2249a 100644 --- a/megatron/core/ssm/mamba_hybrid_layer_allocation.py +++ b/megatron/core/ssm/mamba_hybrid_layer_allocation.py @@ -1,8 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
import logging -from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Tuple if __name__ != "__main__": from megatron.core.utils import log_single_rank @@ -30,129 +29,9 @@ class Symbols: ATTENTION = "*" MLP = "-" MOE = 'E' - MTP_SEPARATOR = "/" VALID = {MAMBA, ATTENTION, MLP, MOE} -@dataclass -class ParsedHybridPattern: - """Result of parsing a unified hybrid pattern string. - - A unified pattern encodes both the main decoder pattern and the MTP pattern - in a single string using "/" as a separator. - - Format: "<main_pattern>/<mtp_pattern>/<mtp_pattern>/..." - - Examples: - - "M*M*" -> main="M*M*", mtp=None, depths=0 (no MTP) - - "M*M*/MM/MM" -> main="M*M*", mtp="MM", depths=2 - - "MMMM/*M/*M/*M" -> main="MMMM", mtp="*M", depths=3 - - The "/" symbol introduces MTP patterns. Each repeated pattern after the main - decoder represents one MTP prediction depth. - - Attributes: - main_pattern: The main decoder layer pattern (e.g., "M*M*") - mtp_pattern: The MTP layer pattern per depth (e.g., "MM"), or None if no MTP - mtp_num_depths: Number of MTP prediction depths (0 if no MTP) - """ - - main_pattern: Optional[str] - mtp_pattern: Optional[str] - mtp_num_depths: int - - -def parse_hybrid_pattern(pattern: Optional[str]) -> ParsedHybridPattern: - """Parse a unified hybrid pattern string into main and MTP components. - - The pattern uses "/" as a separator between the main decoder pattern and - MTP patterns. Each MTP pattern after the separator represents one prediction - depth. - - Format: "<main_pattern>/<mtp_pattern>/<mtp_pattern>/..."
- - Args: - pattern: Unified pattern string, e.g., "M*M*/MM/MM" or just "M*M*" - - Returns: - ParsedHybridPattern with main_pattern, mtp_pattern, and mtp_num_depths - - Raises: - ValueError: If MTP patterns are inconsistent (all must be identical) - ValueError: If pattern contains invalid layer symbols - - Examples: - >>> parse_hybrid_pattern("M*M*") - ParsedHybridPattern(main_pattern="M*M*", mtp_pattern=None, mtp_num_depths=0) - - >>> parse_hybrid_pattern("M*M*/MM/MM") - ParsedHybridPattern(main_pattern="M*M*", mtp_pattern="MM", mtp_num_depths=2) - - >>> parse_hybrid_pattern("MMMM/*M/*M/*M") - ParsedHybridPattern(main_pattern="MMMM", mtp_pattern="*M", mtp_num_depths=3) - """ - if pattern is None: - return ParsedHybridPattern(main_pattern=None, mtp_pattern=None, mtp_num_depths=0) - - parts = pattern.split(Symbols.MTP_SEPARATOR) - - if len(parts) == 1: - # No MTP separator found - pattern is main decoder only - main_pattern = parts[0] - _validate_pattern(main_pattern, "main") - return ParsedHybridPattern(main_pattern=main_pattern, mtp_pattern=None, mtp_num_depths=0) - - # First part is main decoder pattern - main_pattern = parts[0] - if main_pattern: - _validate_pattern(main_pattern, "main") - - # Remaining parts are MTP patterns (one per depth) - mtp_parts = parts[1:] - - if not mtp_parts or all(p == "" for p in mtp_parts): - # No MTP patterns after separator - return ParsedHybridPattern( - main_pattern=main_pattern if main_pattern else None, mtp_pattern=None, mtp_num_depths=0 - ) - - # Validate all MTP patterns are identical - mtp_pattern = mtp_parts[0] - for i, part in enumerate(mtp_parts[1:], start=2): - if part != mtp_pattern: - raise ValueError( - f"All MTP patterns must be identical. " - f"Pattern 1 is '{mtp_pattern}', but pattern {i} is '{part}'. 
" - f"Full pattern: '{pattern}'" - ) - - _validate_pattern(mtp_pattern, "MTP") - - return ParsedHybridPattern( - main_pattern=main_pattern if main_pattern else None, - mtp_pattern=mtp_pattern, - mtp_num_depths=len(mtp_parts), - ) - - -def _validate_pattern(pattern: str, pattern_name: str) -> None: - """Validate that a pattern contains only valid layer symbols. - - Args: - pattern: Layer pattern string to validate - pattern_name: Name of pattern for error messages (e.g., "main" or "MTP") - - Raises: - ValueError: If pattern contains invalid symbols - """ - for char in pattern: - if char not in Symbols.VALID: - raise ValueError( - f"In {pattern_name} pattern, '{char}' is not a valid layer symbol. " - f"Valid symbols are: {Symbols.VALID}" - ) - - def _allocate_auto( total_layers_count: int, target_attention_ratio: float, target_mlp_ratio: float ) -> list: @@ -218,21 +97,19 @@ def allocate_layers( target_attention_ratio: float, target_mlp_ratio: float, override_pattern: str = None, - silent: bool = False, ) -> list: """Allocates layers according to the requested distribution of layer types.""" assert total_layers_count > 0 assert target_attention_ratio >= 0.0 and target_attention_ratio <= 1.0 assert target_mlp_ratio >= 0.0 and target_mlp_ratio <= 1.0 assert target_attention_ratio + target_mlp_ratio <= 1.0 - maybe_log_single_rank = (lambda *args, **kwargs: None) if silent else log_single_rank # Note: target_mamba_ratio = 1.0 - target_attention_ratio - target_mlp_ratio layer_type_list = _allocate_auto(total_layers_count, target_attention_ratio, target_mlp_ratio) if override_pattern is not None: layer_type_list_override = _allocate_override(total_layers_count, override_pattern) - maybe_log_single_rank(logger, logging.INFO, "Using hybrid override pattern") + log_single_rank(logger, logging.INFO, "Using hybrid override pattern") if (target_attention_ratio > 0.0 or target_mlp_ratio > 0.0) and not _layer_counts_match( layer_type_list_override, layer_type_list ): @@ -242,15 
+119,13 @@ def allocate_layers( "pattern." ) if layer_type_list_override == layer_type_list: - maybe_log_single_rank( + log_single_rank( logger, logging.INFO, "The override pattern matches the overridden pattern" ) else: - maybe_log_single_rank( - logger, logging.INFO, "Warning: overriding pattern A with pattern B" - ) - maybe_log_single_rank(logger, logging.INFO, f"A: {''.join(layer_type_list)}") - maybe_log_single_rank(logger, logging.INFO, f"B: {''.join(layer_type_list_override)}") + log_single_rank(logger, logging.INFO, "Warning: overriding pattern A with pattern B") + log_single_rank(logger, logging.INFO, f"A: {''.join(layer_type_list)}") + log_single_rank(logger, logging.INFO, f"B: {''.join(layer_type_list_override)}") layer_type_list = layer_type_list_override if target_attention_ratio > 0.0 or target_mlp_ratio > 0.0 or override_pattern is not None: @@ -259,32 +134,32 @@ def allocate_layers( actual_mlp_layers_count = layer_type_list.count(Symbols.MLP) actual_mlp_ratio = actual_mlp_layers_count / total_layers_count allocation_string = "".join(layer_type_list) - maybe_log_single_rank( + log_single_rank( logger, logging.INFO, f"Hybrid allocation ({Symbols.MAMBA} is mamba, " f"{Symbols.ATTENTION} is attention, " f"{Symbols.MLP} is mlp):", ) - maybe_log_single_rank(logger, logging.INFO, allocation_string) - maybe_log_single_rank( + log_single_rank(logger, logging.INFO, allocation_string) + log_single_rank( logger, logging.INFO, f"{actual_attention_layers_count} attention layers in " f"{total_layers_count} total layers.", ) - maybe_log_single_rank( + log_single_rank( logger, logging.INFO, f"Target attention ratio: {target_attention_ratio:.2f}. " f"Actual attention ratio: {actual_attention_ratio:.2f}.", ) - maybe_log_single_rank( + log_single_rank( logger, logging.INFO, f"{actual_mlp_layers_count} mlp layers in " f"{total_layers_count} total layers.", ) - maybe_log_single_rank( + log_single_rank( logger, logging.INFO, f"Target mlp ratio: {target_mlp_ratio:.2f}. 
" diff --git a/megatron/core/transformer/moe/moe_layer.py b/megatron/core/transformer/moe/moe_layer.py index e5decdfb29b..5cfea1e8ae4 100644 --- a/megatron/core/transformer/moe/moe_layer.py +++ b/megatron/core/transformer/moe/moe_layer.py @@ -87,12 +87,10 @@ def __init__( config: TransformerConfig, layer_number: Optional[int] = None, pg_collection: Optional[ProcessGroupCollection] = None, - is_mtp_layer: bool = False, ): super(BaseMoELayer, self).__init__(config) self.config = config self.layer_number = layer_number - self.is_mtp_layer = is_mtp_layer self.ep_group = pg_collection.ep # use pg_collection.expt_tp_group as tensor parallel group in this module. self.attn_tp_group = pg_collection.tp @@ -142,7 +140,6 @@ def __init__( submodules: Optional[MoESubmodules] = None, layer_number: Optional[int] = None, pg_collection: Optional[ProcessGroupCollection] = None, - is_mtp_layer: bool = False, ): self.submodules = submodules # TODO(Hepteract): delete the usage of the global parallel_state. @@ -150,10 +147,7 @@ def __init__( if pg_collection is None: pg_collection = get_default_pg_collection() super(MoELayer, self).__init__( - config=config, - layer_number=layer_number, - pg_collection=pg_collection, - is_mtp_layer=is_mtp_layer, + config=config, layer_number=layer_number, pg_collection=pg_collection ) # If using mcore cudagraphs, recompute is handled by transformer_layer.MoETransformerLayer self.moe_layer_recompute = ( @@ -169,9 +163,7 @@ def __init__( self.tp_group = pg_collection.tp # Initialize router. - self.router = submodules.router( - config=self.config, pg_collection=pg_collection, is_mtp_layer=is_mtp_layer - ) + self.router = submodules.router(config=self.config, pg_collection=pg_collection) self.tp_group = pg_collection.tp # Initialize latent projections. 
diff --git a/megatron/core/transformer/moe/router.py b/megatron/core/transformer/moe/router.py index e42fd1ca8aa..4be97401748 100644 --- a/megatron/core/transformer/moe/router.py +++ b/megatron/core/transformer/moe/router.py @@ -29,10 +29,7 @@ class Router(ABC, MegatronModule): """Base Router class""" def __init__( - self, - config: TransformerConfig, - pg_collection: Optional[ProcessGroupCollection] = None, - is_mtp_layer: bool = False, + self, config: TransformerConfig, pg_collection: Optional[ProcessGroupCollection] = None ) -> None: """ Initialize the Router module. @@ -40,14 +37,12 @@ def __init__( Args: config (TransformerConfig): Configuration object for the Transformer model. pg_collection (ProcessGroupCollection, optional): Process groups for MoE operations. - is_mtp_layer (bool): Flag indicating if this router is part of an MTP layer. """ super().__init__(config) self.config = config self.num_experts = self.config.num_moe_experts self.moe_aux_loss_func = None self.layer_number = None - self.is_mtp_layer = is_mtp_layer self.tp_group = pg_collection.tp self.cp_group = pg_collection.cp self.tp_cp_group = pg_collection.tp_cp @@ -150,19 +145,15 @@ class TopKRouter(Router): """ def __init__( - self, - config: TransformerConfig, - pg_collection: Optional[ProcessGroupCollection] = None, - is_mtp_layer: bool = False, + self, config: TransformerConfig, pg_collection: Optional[ProcessGroupCollection] = None ) -> None: """Initialize the zero token dropping router. Args: config (TransformerConfig): The configuration for the transformer model. pg_collection (ProcessGroupCollection, optional): Process groups for MoE operations. - is_mtp_layer (bool): Flag indicating if this router is part of an MTP layer. 
""" - super().__init__(config=config, pg_collection=pg_collection, is_mtp_layer=is_mtp_layer) + super().__init__(config=config, pg_collection=pg_collection) self.topk = self.config.moe_router_topk self.routing_type = self.config.moe_router_load_balancing_type self.score_function = self.config.moe_router_score_function @@ -447,16 +438,6 @@ def attach_and_log_load_balancing_loss( padding tokens. Can be a Python int or a torch.Tensor (typically 0-d tensor). If None, uses activation.shape[0]. Defaults to None. """ - # When using repeated MTP layers, the loss is counted "mtp_num_layers" times. - # To avoid accumulating the load balancing loss multiple times, we scale it by - # 1/mtp_num_layers so the total loss is correct. - if ( - self.is_mtp_layer - and self.config.mtp_use_repeated_layer - and self.config.mtp_num_layers is not None - ): - aux_loss = aux_loss / self.config.mtp_num_layers - # TODO (zijiey): fix the per_layer_logging for MTP, currently it will incorrectly # add the aux loss logging value to other layer's since it is difficult to get the # correct layer_number for MTP. It does not affect the correctness of the calculation @@ -464,16 +445,10 @@ def attach_and_log_load_balancing_loss( num_layers = self.config.num_layers if self.config.mtp_num_layers is not None: num_layers += self.config.mtp_num_layers - - if self.is_mtp_layer: - layer_number = self.layer_number + self.config.num_layers - else: - layer_number = self.layer_number - save_to_aux_losses_tracker( aux_loss_name, aux_loss / aux_loss_coeff, - layer_number, + self.layer_number, num_layers, reduce_group=reduce_group, reduce_group_has_dp=reduce_group_has_dp, @@ -524,27 +499,11 @@ def apply_z_loss(self, logits, padding_mask: Optional[torch.Tensor] = None): else: logits = MoEAuxLossAutoScaler.apply(logits, z_loss) - # When using repeated MTP layers, the same MTP layer is called mtp_num_layers times. 
- # To avoid accumulating the z_loss multiple times, we scale it by 1/mtp_num_layers - # so the total loss is correct. - if ( - self.is_mtp_layer - and self.config.mtp_use_repeated_layer - and self.config.mtp_num_layers is not None - ): - z_loss = z_loss / self.config.mtp_num_layers - num_layers = self.config.num_layers if self.config.mtp_num_layers is not None: num_layers += self.config.mtp_num_layers - - if self.is_mtp_layer: - layer_number = self.layer_number + self.config.num_layers - else: - layer_number = self.layer_number - save_to_aux_losses_tracker( - "z_loss", z_loss / moe_z_loss_coeff, layer_number, num_layers + "z_loss", z_loss / moe_z_loss_coeff, self.layer_number, num_layers ) return logits diff --git a/megatron/core/transformer/multi_token_prediction.py b/megatron/core/transformer/multi_token_prediction.py index 6432af36cde..2edb652bfc6 100755 --- a/megatron/core/transformer/multi_token_prediction.py +++ b/megatron/core/transformer/multi_token_prediction.py @@ -14,7 +14,6 @@ from megatron.core.fp8_utils import get_fp8_context from megatron.core.models.backends import BackendSpecProvider, LocalSpecProvider from megatron.core.packed_seq_params import PackedSeqParams -from megatron.core.pipeline_parallel.utils import is_vp_last_stage from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.tensor_parallel import ( gather_from_tensor_model_parallel_region, @@ -25,6 +24,7 @@ from megatron.core.transformer.spec_utils import ModuleSpec, build_module from megatron.core.transformer.transformer_block import TransformerBlockSubmodules from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.transformer_layer import get_transformer_layer_offset from megatron.core.utils import ( get_pg_rank, is_torch_min_version, @@ -369,7 +369,7 @@ def track_mtp_metrics(loss_scale, iteration, writer, wandb_writer=None, total_lo mtp_losses = tracker["values"] * loss_scale mtp_num_layers = 
mtp_losses.shape[0] for i in range(mtp_num_layers): - name = f"mtp_{i + 1} loss" + name = f"mtp_{i+1} loss" loss = mtp_losses[i] if total_loss_dict is not None: if name in total_loss_dict: @@ -396,19 +396,19 @@ class MultiTokenPredictionLayerSubmodules: embedding normalization to be applied. eh_proj (Union[ModuleSpec, type]): Specification or instance of the linear projection to be applied. - mtp_model_layer (Union[ModuleSpec, type]): Specification - or instance of the transformer or mamba block to be applied. + transformer_layer (Union[ModuleSpec, type]): Specification + or instance of the transformer block to be applied. """ enorm: Union[ModuleSpec, type] = None hnorm: Union[ModuleSpec, type] = None eh_proj: Union[ModuleSpec, type] = None - mtp_model_layer: Union[ModuleSpec, type] = None + transformer_layer: Union[ModuleSpec, type] = None layer_norm: Union[ModuleSpec, type] = None def get_mtp_layer_spec( - mtp_model_layer_spec: ModuleSpec, use_transformer_engine: bool + transformer_layer_spec: ModuleSpec, use_transformer_engine: bool ) -> ModuleSpec: """Get the MTP layer spec. @@ -416,13 +416,13 @@ def get_mtp_layer_spec( ModuleSpec: Module specification with TE modules """ return get_mtp_layer_spec_for_backend( - mtp_model_layer_spec, + transformer_layer_spec, backend=TESpecProvider() if use_transformer_engine else LocalSpecProvider(), ) def get_mtp_layer_spec_for_backend( - mtp_model_layer_spec: ModuleSpec, backend: BackendSpecProvider + transformer_layer_spec: ModuleSpec, backend: BackendSpecProvider ) -> ModuleSpec: """Get the MTP layer spec. 
@@ -437,7 +437,7 @@ def get_mtp_layer_spec_for_backend( enorm=layer_norm_impl, hnorm=layer_norm_impl, eh_proj=column_parallel_linear_impl, - mtp_model_layer=mtp_model_layer_spec, + transformer_layer=transformer_layer_spec, layer_norm=layer_norm_impl, ), ) @@ -586,79 +586,6 @@ def set_loss_scale(scale: torch.Tensor): MTPLossAutoScaler.main_loss_backward_scale = scale -def process_mtp_loss( - hidden_states: Tensor, - labels: Tensor, - loss_mask: Optional[Tensor], - output_layer: Callable, - output_weight: Optional[Tensor], - runtime_gather_output: Optional[bool], - is_training: bool, - compute_language_model_loss: Callable, - config: TransformerConfig, - cp_group: Optional[torch.distributed.ProcessGroup] = None, - packed_seq_params: Optional[PackedSeqParams] = None, -) -> Tensor: - """Process Multi-Token Prediction (MTP) loss computation. - - This is a standalone function that handles MTP loss computation. It's used on the - post_process rank to split concatenated hidden states and compute MTP losses. - - Args: - hidden_states (Tensor): Hidden states tensor (concatenated with MTP outputs). - labels (Tensor): Ground truth labels. - loss_mask (Optional[Tensor]): Mask for loss computation. If None, uses all ones. - output_layer (Callable): Output layer method to compute logits. - output_weight (Optional[Tensor]): Optional output weight for shared embeddings. - runtime_gather_output (Optional[bool]): Whether to gather output at runtime. - is_training (bool): Whether the model is in training mode. - compute_language_model_loss (Callable): Method to compute language model loss. - config (TransformerConfig): Model configuration containing mtp_num_layers etc. - cp_group (Optional[ProcessGroup]): Context parallelism process group. - packed_seq_params (Optional[PackedSeqParams]): Packed sequence parameters. - - Returns: - Tensor: Updated hidden states after MTP loss processing (first chunk only). 
- """ - mtp_labels = labels.clone() - hidden_states_list = torch.chunk(hidden_states, 1 + config.mtp_num_layers, dim=0) - hidden_states = hidden_states_list[0] - - if loss_mask is None: - loss_mask = torch.ones_like(mtp_labels) - - for mtp_layer_number in range(config.mtp_num_layers): - mtp_logits, _ = output_layer( - hidden_states_list[mtp_layer_number + 1], - weight=output_weight, - runtime_gather_output=runtime_gather_output, - ) - mtp_labels, _ = roll_tensor( - mtp_labels, shifts=-1, dims=-1, cp_group=cp_group, packed_seq_params=packed_seq_params - ) - loss_mask, num_tokens = roll_tensor( - loss_mask, shifts=-1, dims=-1, cp_group=cp_group, packed_seq_params=packed_seq_params - ) - mtp_loss = compute_language_model_loss(mtp_labels, mtp_logits) - mtp_loss = loss_mask * mtp_loss - if is_training: - MTPLossLoggingHelper.save_loss_to_tracker( - torch.sum(mtp_loss) / num_tokens, - mtp_layer_number, - config.mtp_num_layers, - avg_group=parallel_state.get_data_parallel_group(with_context_parallel=True), - ) - mtp_loss_scale = config.mtp_loss_scaling_factor / config.mtp_num_layers - if config.calculate_per_token_loss: - hidden_states = MTPLossAutoScaler.apply(hidden_states, mtp_loss_scale * mtp_loss) - else: - hidden_states = MTPLossAutoScaler.apply( - hidden_states, mtp_loss_scale * mtp_loss / num_tokens - ) - - return hidden_states - - class MultiTokenPredictionLayer(MegatronModule): """The implementation for Multi-Token Prediction (MTP) which extends the prediction scope to multiple future tokens at each position. 
@@ -686,9 +613,6 @@ def __init__( layer_number: int = 1, vp_stage: Optional[int] = None, pg_collection: Optional[ProcessGroupCollection] = None, - # For Mamba path - pattern and submodules to build inner layers directly - mtp_layer_pattern: Optional[str] = None, - mamba_submodules: Optional["MambaStackSubmodules"] = None, ): super().__init__(config=config) self.sequence_parallel = config.sequence_parallel @@ -696,31 +620,14 @@ def __init__( self.layer_number = layer_number + get_mtp_layer_offset(self.config, vp_stage) self.vp_stage = vp_stage self.cp_group = pg_collection.cp - self.mtp_layer_pattern = mtp_layer_pattern - # Validate attention mask type if using transformer-based inner layers - if self.submodules.mtp_model_layer is not None and hasattr( - self.submodules.mtp_model_layer, 'submodules' - ): - if hasattr(self.submodules.mtp_model_layer.submodules, 'attention_layer'): - self_attention_spec = self.submodules.mtp_model_layer.submodules.attention_layer - if self_attention_spec.submodules.self_attention is not None: - self_attention_spec = self_attention_spec.submodules.self_attention - attn_mask_type = self_attention_spec.params.get('attn_mask_type', '') - assert attn_mask_type in SUPPORTED_ATTN_MASK, ( - f"Multi-Token Prediction (MTP) is not yet supported with " - f"{attn_mask_type} attention mask type. " - f"The supported attention mask types are {SUPPORTED_ATTN_MASK}." - ) - elif hasattr(self.submodules.mtp_model_layer.submodules, 'self_attention'): - self_attention_spec = self.submodules.mtp_model_layer.submodules.self_attention - if self_attention_spec is not None: - attn_mask_type = self_attention_spec.params.get('attn_mask_type', '') - assert attn_mask_type in SUPPORTED_ATTN_MASK, ( - f"Multi-Token Prediction (MTP) is not yet supported with " - f"{attn_mask_type} attention mask type. " - f"The supported attention mask types are {SUPPORTED_ATTN_MASK}." 
- ) + self_attention_spec = self.submodules.transformer_layer.submodules.self_attention + attn_mask_type = self_attention_spec.params.get('attn_mask_type', '') + assert attn_mask_type in SUPPORTED_ATTN_MASK, ( + f"Multi-Token Prediction (MTP) is not jet supported with " + + f"{attn_mask_type} attention mask type." + + f"The supported attention mask types are {SUPPORTED_ATTN_MASK}." + ) self.enorm = build_module( self.submodules.enorm, @@ -751,37 +658,17 @@ def __init__( bias=False, skip_bias_add=False, is_expert=False, - tp_comm_buffer_name="mtp_eh_proj", ) - # Build inner layers: two possible paths - # 1. Mamba path: use MambaStack for hybrid pattern support - # 2. GPT path: single TransformerLayer - if mtp_layer_pattern is not None and mamba_submodules is not None: - from megatron.core.ssm.mamba_block import MambaStack - - self.mtp_model_layer = MambaStack( - config=self.config, - submodules=mamba_submodules, - hybrid_override_pattern=mtp_layer_pattern, - pre_process=True, # Always receives input from eh_proj - post_layer_norm=False, # MTP has its own final_layernorm - post_process=True, # MTP layer is self-contained - pg_collection=pg_collection, - is_mtp_layer=True, - ) - elif self.config.mtp_num_layers is not None: - # GPT path: Uses the transformer block spec for MTP layer - # MTP inner layers use their own layer numbering (self.layer_number = 1, 2, etc.) - # rather than continuing from decoder layer numbers. This is consistent with the - # Mamba path and ensures proper aux loss tracking in router.py. 
- self.mtp_model_layer = build_module( - self.submodules.mtp_model_layer, - config=self.config, - vp_stage=self.vp_stage, - layer_number=self.layer_number, - is_mtp_layer=True, - ) + diff_transformer_layer_offset = self.config.num_layers - get_transformer_layer_offset( + self.config, vp_stage + ) + self.transformer_layer = build_module( + self.submodules.transformer_layer, + config=self.config, + vp_stage=vp_stage, + layer_number=self.layer_number + diff_transformer_layer_offset, + ) self.final_layernorm = build_module( self.submodules.layer_norm, @@ -892,6 +779,7 @@ def _proj_and_transformer_layer( transformer_layer_fp8_context = nullcontext() # TODO: currently ignoring FP4 in MTP layers because we need more numerical validation + with rng_context: with fp8_context: hidden_states = self._concat_embeddings(hidden_states, decoder_input) @@ -900,29 +788,19 @@ def _proj_and_transformer_layer( # transformer layer is cudagraphed, the FP8GlobalStateManager.is_first_fp8_module() is # True so that the fp8 weight caching can be triggered correctly. 
with transformer_layer_fp8_context: - if self.mtp_layer_pattern is not None: - hidden_states = self.mtp_model_layer( - hidden_states=hidden_states, - attention_mask=attention_mask, - rotary_pos_emb=rotary_pos_emb, - inference_context=inference_params, - packed_seq_params=packed_seq_params, - ) - else: - # GPT path: single TransformerLayer - hidden_states, _ = self.mtp_model_layer( - hidden_states=hidden_states, - attention_mask=attention_mask, - context=context, - context_mask=context_mask, - rotary_pos_emb=rotary_pos_emb, - rotary_pos_cos=rotary_pos_cos, - rotary_pos_sin=rotary_pos_sin, - attention_bias=attention_bias, - inference_params=inference_params, - packed_seq_params=packed_seq_params, - sequence_len_offset=sequence_len_offset, - ) + hidden_states, _ = self.transformer_layer( + hidden_states=hidden_states, + attention_mask=attention_mask, + context=context, + context_mask=context_mask, + rotary_pos_emb=rotary_pos_emb, + rotary_pos_cos=rotary_pos_cos, + rotary_pos_sin=rotary_pos_sin, + attention_bias=attention_bias, + inference_params=inference_params, + packed_seq_params=packed_seq_params, + sequence_len_offset=sequence_len_offset, + ) hidden_states = self._postprocess(hidden_states) @@ -1019,7 +897,8 @@ def forward( Union[Tensor, Tuple[Tensor, Tensor]]: The output hidden states tensor of shape [s, b, h], and optionally the updated context tensor if cross-attention is used. """ - assert context is None, "multi token prediction + cross attention is not yet supported." + assert context is None, f"multi token prediction + cross attention is not yet supported." 
+ input_ids, position_ids, decoder_input, hidden_states = self._get_embeddings( input_ids=input_ids, position_ids=position_ids, @@ -1028,15 +907,6 @@ def forward( packed_seq_params=packed_seq_params, ) - # Roll RoPE to match rolled positions (position_ids were rolled in _get_embeddings) - # After rolling, index i should use RoPE for position i+1 - if rotary_pos_emb is not None: - rotary_pos_emb = torch.roll(rotary_pos_emb, shifts=-1, dims=0) - if rotary_pos_cos is not None: - rotary_pos_cos = torch.roll(rotary_pos_cos, shifts=-1, dims=0) - if rotary_pos_sin is not None: - rotary_pos_sin = torch.roll(rotary_pos_sin, shifts=-1, dims=0) - if self.config.recompute_granularity == 'full' and self.training: hidden_states = self._checkpointed_forward( self._proj_and_transformer_layer, @@ -1152,9 +1022,6 @@ class MultiTokenPredictionBlock(MegatronModule): the linear projection. The combined serves as the input of the Transformer block at the k-th depth to produce the output representation. - When `mtp_use_repeated_layer=True` in config, instead of creating N separate MTP layers, - only 1 layer is created and applied mtp_num_layers times. 
- for more information, please refer to DeepSeek-V3 Technical Report https://github.com/deepseek-ai/DeepSeek-V3/blob/main/DeepSeek_V3.pdf """ @@ -1165,26 +1032,11 @@ def __init__( spec: Union[TransformerBlockSubmodules, ModuleSpec], vp_stage: Optional[int] = None, pg_collection: Optional[ProcessGroupCollection] = None, - # New: For Mamba path with unified pattern syntax - mtp_layer_pattern: Optional[str] = None, - mtp_num_depths: int = 0, - mamba_submodules: Optional["MambaStackSubmodules"] = None, ): super().__init__(config=config) self.submodules = _get_mtp_block_submodules(config, spec) self.mtp_loss_scaling_factor = config.mtp_loss_scaling_factor self.vp_stage = vp_stage - self.mtp_layer_pattern = mtp_layer_pattern - self.mtp_num_depths = mtp_num_depths - self.mamba_submodules = mamba_submodules - self.mtp_use_repeated_layer = self.config.mtp_use_repeated_layer - - vp_size = config.virtual_pipeline_model_parallel_size - assert is_vp_last_stage(vp_stage=vp_stage, vp_size=vp_size), ( - f"MTP layers must be placed on the last virtual pipeline stage. " - f"Got vp_stage={vp_stage} with vp_size={vp_size}. " - f"Placing MTP layers on different VPP stages is not currently supported." 
- ) # Initialize Context Parallelism (CP) support for MTP # This enables MTP to work with CP > 1 by providing the CP process group @@ -1203,14 +1055,7 @@ def __init__( self.cp_group = pg_collection.cp def _build_layers(self, pg_collection): - # Determine number of depths to build - if self.mtp_num_depths > 0: - num_depths = self.mtp_num_depths - else: - num_depths = self.config.mtp_num_layers or len(self.submodules.layer_specs) - - def build_layer_legacy(layer_spec, layer_number): - """Build layer using legacy spec-based approach.""" + def build_layer(layer_spec, layer_number): fp8_init_context = get_fp8_context(self.config, is_init=True) with fp8_init_context: module = build_module( @@ -1219,71 +1064,15 @@ def build_layer_legacy(layer_spec, layer_number): layer_number=layer_number, vp_stage=self.vp_stage, pg_collection=pg_collection, - mtp_layer_pattern=self.mtp_layer_pattern, ) return module - def build_layer_with_pattern(layer_spec, layer_number, mtp_layer_pattern, mamba_submodules): - """Build layer using pattern-based approach (new Mamba path).""" - fp8_init_context = get_fp8_context(self.config, is_init=True) - with fp8_init_context: - module = build_module( - layer_spec, - config=self.config, - layer_number=layer_number, - vp_stage=self.vp_stage, - pg_collection=pg_collection, - mtp_layer_pattern=mtp_layer_pattern, - mamba_submodules=mamba_submodules, - ) - return module - - # New Mamba path: use mtp_layer_pattern and mamba_submodules - if self.mtp_layer_pattern is not None and self.mamba_submodules is not None: - if self.mtp_use_repeated_layer: - # Shared/repeated layer: build one layer, use it for all depths - layer_spec = self.submodules.layer_specs[0] - shared_layer = build_layer_with_pattern( - layer_spec, - layer_number=1, - mtp_layer_pattern=self.mtp_layer_pattern, - mamba_submodules=self.mamba_submodules, - ) - self.layers = torch.nn.ModuleList([shared_layer]) - else: - # Non-shared: each depth gets its own layers - self.layers = torch.nn.ModuleList( 
- [ - build_layer_with_pattern( - self.submodules.layer_specs[ - min(i, len(self.submodules.layer_specs) - 1) - ], - layer_number=i + 1, - mtp_layer_pattern=self.mtp_layer_pattern, - mamba_submodules=self.mamba_submodules, - ) - for i in range(num_depths) - ] - ) - elif self.mtp_use_repeated_layer: - # Legacy repeated layer mode - if len(self.submodules.layer_specs) != 1: - warnings.warn( - "Repeated MTP mode expects exactly 1 layer spec, got " - f"{len(self.submodules.layer_specs)} instead. " - f"The first layer will be applied {self.config.mtp_num_layers} times." - ) - self.layers = torch.nn.ModuleList( - [build_layer_legacy(self.submodules.layer_specs[0], layer_number=1)] - ) - else: - # Legacy mode: build from layer_specs - self.layers = torch.nn.ModuleList( - [ - build_layer_legacy(layer_spec, i + 1) - for i, layer_spec in enumerate(self.submodules.layer_specs) - ] - ) + self.layers = torch.nn.ModuleList( + [ + build_layer(layer_spec, i + 1) + for i, layer_spec in enumerate(self.submodules.layer_specs) + ] + ) def forward( self, @@ -1319,9 +1108,8 @@ def forward( offset = get_mtp_layer_offset(self.config, self.vp_stage) hidden_states_list = list(torch.chunk(hidden_states, 1 + offset, dim=0)) hidden_states = hidden_states_list[offset] - for iteration in range(self.config.mtp_num_layers): - layer_idx = 0 if self.mtp_use_repeated_layer else iteration - (hidden_states, input_ids, position_ids) = self.layers[layer_idx]( + for layer_number in range(len(self.layers)): + (hidden_states, input_ids, position_ids) = self.layers[layer_number]( input_ids=input_ids, position_ids=position_ids, hidden_states=hidden_states, @@ -1363,7 +1151,7 @@ def sharded_state_dict( layer_prefix = f'{prefix}layers.' for layer in self.layers: offset = get_mtp_layer_offset(self.config, self.vp_stage) - sharded_prefix = f'{layer_prefix}{layer.layer_number - 1}.' + sharded_prefix = f'{layer_prefix}{layer.layer_number - 1 }.' 
state_dict_prefix = f'{layer_prefix}{layer.layer_number - 1 - offset}.' sharded_pp_offset = [] diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index 96f7e9b8b95..eaae585905e 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -59,15 +59,6 @@ class TransformerConfig(ModelParallelConfig): which serves as an additional training objective. """ - mtp_use_repeated_layer: bool = False - """Use a single MTP layer repeatedly instead of multiple separate layers.""" - - mtp_hybrid_override_pattern: Optional[str] = None - """DEPRECATED: Use unified hybrid_override_pattern instead. - Legacy argument for loading old checkpoints. - Force a specific hybrid layer pattern for MTP layers. - """ - num_layers_in_first_pipeline_stage: Optional[int] = None """Number of transformer layers on first pipeline stage. None implies equal layer division across PP ranks.""" diff --git a/megatron/core/transformer/transformer_layer.py b/megatron/core/transformer/transformer_layer.py index ae505f04fc6..a5eaec92866 100644 --- a/megatron/core/transformer/transformer_layer.py +++ b/megatron/core/transformer/transformer_layer.py @@ -268,7 +268,6 @@ def __init__( hidden_dropout: Optional[float] = None, pg_collection: Optional[ProcessGroupCollection] = None, vp_stage: Optional[int] = None, - is_mtp_layer: bool = False, ): self.submodules_config = submodules super().__init__(config=config, vp_stage=vp_stage) @@ -278,18 +277,10 @@ def __init__( self.pg_collection = pg_collection self.tp_group = pg_collection.tp - # MTP inner layers use their own layer numbering (starting from 1 within each MTP depth), - # so they should NOT add the decoder layer offset. The router.py handles MTP layer - # numbering separately by adding config.num_layers to distinguish MTP layers from decoder - # layers in the aux loss tracker. 
- if is_mtp_layer: - self.layer_number = layer_number - else: - self.layer_number = layer_number + get_transformer_layer_offset( - self.config, vp_stage, get_pg_rank(pg_collection.pp) - ) + self.layer_number = layer_number + get_transformer_layer_offset( + self.config, vp_stage, get_pg_rank(pg_collection.pp) + ) self.hidden_dropout = config.hidden_dropout if hidden_dropout is None else hidden_dropout - self.is_mtp_layer = is_mtp_layer # [Module 1: Input Layernorm] Optional Layernorm on the input data # TODO: add pytorch only layernorm @@ -360,9 +351,6 @@ def __init__( if isinstance(submodules.mlp, ModuleSpec): if submodules.mlp.module in (MoELayer, GroupedMLP, TEGroupedMLP, SequentialMLP): additional_mlp_kwargs["pg_collection"] = pg_collection - # Pass is_mtp_layer flag to MoELayer to distinguish MTP MoE layers. - if submodules.mlp.module == MoELayer: - additional_mlp_kwargs["is_mtp_layer"] = self.is_mtp_layer elif submodules.mlp.module == MLP: assert hasattr( pg_collection, 'tp' diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 4e1c3bc6f17..5749d20a4ca 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -499,79 +499,6 @@ def validate_args(args, defaults={}): print_rank_0('setting global batch size to {}'.format(args.global_batch_size)) assert args.global_batch_size > 0 - # === MTP validation === - # Deprecation warnings for legacy MTP arguments - if args.mtp_hybrid_override_pattern is not None: - warn_rank_0( - "--mtp-hybrid-override-pattern is deprecated. " - "For new hybrid models with MTP models, use unified --hybrid-override-pattern instead. " - "Example: 'M*M*/MM/MM' means main='M*M*', MTP pattern='MM' with 2 depths. 
" - "This argument is kept only for loading old checkpoints.", - args.rank, - ) - - # Backward compatibility: convert legacy mtp_hybrid_override_pattern to unified format - from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols, parse_hybrid_pattern - sep = Symbols.MTP_SEPARATOR - if ( - getattr(args, 'mtp_hybrid_override_pattern', None) is not None - and args.mtp_num_layers is not None - and args.mtp_num_layers > 0 - and (args.hybrid_override_pattern is None or sep not in args.hybrid_override_pattern) - ): - main_pattern = args.hybrid_override_pattern or '' - mtp_pattern = args.mtp_hybrid_override_pattern - args.hybrid_override_pattern = main_pattern + sep + sep.join([mtp_pattern] * args.mtp_num_layers) - args.mtp_hybrid_override_pattern = None - print_rank_0(f"Converted legacy MTP pattern to unified: {args.hybrid_override_pattern}") - - # Infer mtp_num_layers from unified pattern - if args.hybrid_override_pattern and sep in args.hybrid_override_pattern: - parsed = parse_hybrid_pattern(args.hybrid_override_pattern) - if parsed.mtp_pattern and parsed.mtp_num_depths > 0: - inferred_mtp_num_layers = parsed.mtp_num_depths - if args.mtp_num_layers is None: - args.mtp_num_layers = inferred_mtp_num_layers - elif args.mtp_num_layers != inferred_mtp_num_layers: - warn_rank_0( - f"--mtp-num-layers ({args.mtp_num_layers}) conflicts with " - f"MTP depth count ({inferred_mtp_num_layers}) in pattern '{args.hybrid_override_pattern}'. " - f"Using the inferred value ({inferred_mtp_num_layers}).", - args.rank - ) - args.mtp_num_layers = inferred_mtp_num_layers - - # MTP validation - if args.mtp_num_layers: - assert not args.use_legacy_models, "The legacy Megatron models does not support Multi-Token Prediction (MTP)." - assert args.position_embedding_type == "rope" or args.position_embedding_type == "none", ( - f"Multi-Token Prediction (MTP) is not supported with {args.position_embedding_type} position embedding type." 
- + f"The supported position embedding types are rope and none." - ) - - # Validate MTP args for hybrid vs non-hybrid models - if args.is_hybrid_model: - # Mamba/hybrid model MTP validation - if args.mtp_num_layers and not (args.hybrid_override_pattern and sep in args.hybrid_override_pattern): - # Hybrid model wants MTP but no unified pattern - check for legacy args - if args.mtp_hybrid_override_pattern is None: - warn_rank_0( - "Hybrid model with --mtp-num-layers but no MTP pattern. " - "Use unified --hybrid-override-pattern with '/' separator (e.g., 'M*M*/MM/MM') " - "or legacy --mtp-hybrid-override-pattern for old checkpoints.", - args.rank - ) - else: - # Non-hybrid (GPT) model MTP validation - if args.mtp_hybrid_override_pattern is not None: - warn_rank_0( - "--mtp-hybrid-override-pattern is for Mamba/hybrid models only. " - "For GPT models, MTP replicates the main transformer layer structure. " - "This argument will be ignored.", - args.rank - ) - # === End of MTP validation === - # Uneven virtual pipeline parallelism assert ( int(args.num_layers_per_virtual_pipeline_stage is not None) diff --git a/megatron/training/checkpointing.py b/megatron/training/checkpointing.py index f964b8dd32e..a3d307f1e30 100644 --- a/megatron/training/checkpointing.py +++ b/megatron/training/checkpointing.py @@ -1418,12 +1418,6 @@ def _set_arg(arg_name, old_arg_name=None, force=False): _set_arg('hidden_dropout', force=True) _set_arg('hybrid_override_pattern', force=True) - - # Legacy MTP pattern for old checkpoints - _set_arg('mtp_hybrid_override_pattern', force=True) - _set_arg('mtp_num_layers', force=True) - _set_arg('mtp_use_repeated_layer', force=True) - _set_arg('spec', force=True) _set_arg('hybrid_attention_ratio', force=True) _set_arg('hybrid_mlp_ratio', force=True) diff --git a/megatron/training/training.py b/megatron/training/training.py index 5206b526e18..87d9fe8b841 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -220,20 +220,10 @@ def 
num_floating_point_operations(args, batch_size): def calculate_layer_counts(): """Calculate the number of attention, Mamba, and MLP layers.""" if args.hybrid_override_pattern: - from megatron.core.ssm.mamba_hybrid_layer_allocation import parse_hybrid_pattern - # Parse unified pattern to separate main and MTP components - parsed = parse_hybrid_pattern(args.hybrid_override_pattern) - counts = {'M': 0, '*': 0, '-': 0, 'E': 0} - # Count main decoder layers - if parsed.main_pattern: - for layer_type in parsed.main_pattern: - if layer_type in counts: - counts[layer_type] += 1 - # Count MTP layers (pattern repeated mtp_num_depths times) - if parsed.mtp_pattern and parsed.mtp_num_depths > 0: - for layer_type in parsed.mtp_pattern: - if layer_type in counts: - counts[layer_type] += parsed.mtp_num_depths + counts = {'M': 0, '*': 0, '-': 0, 'E':0} + for layer_type in args.hybrid_override_pattern: + if layer_type in counts: + counts[layer_type] += 1 return counts['*'], counts['M'], counts['-'], counts['E'] else: num_attn_layers = round(args.num_layers * args.hybrid_attention_ratio) @@ -310,7 +300,7 @@ def hybrid_flops(batch_size, seq_len, hidden_size, mlp_expansion=4.0, swiglu=False, moe_latent_size=None, moe_ffn_hidden_size=2048, shared_expert_ffn_hidden_size=2048, num_experts_routed_to=1, - vocab_size=256000, mtp_num_layers=0): + vocab_size=256000): """Calculate total FLOPs for the hybrid model.""" flops_fwd = ( num_attn_layers * attn_layer_flops(batch_size, seq_len, hidden_size, @@ -323,7 +313,7 @@ def hybrid_flops(batch_size, seq_len, hidden_size, num_moe_layers * moe_layer_flops(batch_size, seq_len, hidden_size, moe_ffn_hidden_size, shared_expert_ffn_hidden_size, num_experts_routed_to, moe_latent_size, swiglu) + - (2 * batch_size * seq_len * hidden_size * vocab_size * (1 + mtp_num_layers)) # logits computation + (2 * batch_size * seq_len * hidden_size * vocab_size) # logits computation ) return flops_fwd * 3 @@ -614,7 +604,6 @@ def transformer_flops(): else 
args.moe_shared_expert_intermediate_size), num_experts_routed_to=args.moe_router_topk, vocab_size=args.padded_vocab_size, - mtp_num_layers=args.mtp_num_layers, ) else: # Compute standard Transformer model FLOPs. diff --git a/pretrain_mamba.py b/pretrain_mamba.py index c41c485c866..e1379be63e9 100644 --- a/pretrain_mamba.py +++ b/pretrain_mamba.py @@ -257,7 +257,6 @@ def forward_step(data_iterator, model: MambaModel): attention_mask, labels=labels, packed_seq_params=packed_seq_params, - loss_mask=loss_mask ) # [ModelOpt]: model is needed to access ModelOpt distillation losses diff --git a/tests/unit_tests/models/test_mamba_moe_model.py b/tests/unit_tests/models/test_mamba_moe_model.py index 9c581ec6cb4..3c7ae93a17c 100644 --- a/tests/unit_tests/models/test_mamba_moe_model.py +++ b/tests/unit_tests/models/test_mamba_moe_model.py @@ -193,11 +193,9 @@ "moe_z_loss_coeff": None, "moe_enable_routing_replay": False, "mrope_section": None, - "mtp_hybrid_override_pattern": None, "mtp_loss_scaling_factor": 0.1, "mtp_num_layers": None, "mtp_standalone": False, - "mtp_use_repeated_layer": False, "multi_latent_attention": False, "no_rope_freq": None, "no_sync_func": None, diff --git a/tests/unit_tests/ssm/test_mamba_hybrid_layer_allocation.py b/tests/unit_tests/ssm/test_mamba_hybrid_layer_allocation.py index 77c106c3bee..77d02c69607 100644 --- a/tests/unit_tests/ssm/test_mamba_hybrid_layer_allocation.py +++ b/tests/unit_tests/ssm/test_mamba_hybrid_layer_allocation.py @@ -6,12 +6,7 @@ import pytest import torch -from megatron.core.ssm.mamba_hybrid_layer_allocation import ( - ParsedHybridPattern, - Symbols, - allocate_layers, - parse_hybrid_pattern, -) +from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols, allocate_layers @pytest.mark.internal @@ -80,135 +75,3 @@ def test_wrong_length_override_pattern(self): def test_wrong_number_of_layer_types_in_override_pattern(self): # This override_pattern has too many mlps and not enough attention layer_types = 
allocate_layers(8, 0.5, 0.25, "M*--M**-") - - -@pytest.mark.internal -class TestParseHybridPattern: - """Tests for parse_hybrid_pattern with unified pattern syntax.""" - - def test_none_pattern(self): - """Test that None pattern returns all None values.""" - result = parse_hybrid_pattern(None) - assert result.main_pattern is None - assert result.mtp_pattern is None - assert result.mtp_num_depths == 0 - - def test_main_pattern_only(self): - """Test patterns without MTP (no / separator).""" - test_cases = [ - ("M*M*", "M*M*"), - ("MMMM", "MMMM"), - ("*M*M", "*M*M"), - ("MM-*", "MM-*"), - ("E", "E"), - ] - for pattern, expected_main in test_cases: - result = parse_hybrid_pattern(pattern) - assert result.main_pattern == expected_main, f"Failed for pattern: {pattern}" - assert result.mtp_pattern is None - assert result.mtp_num_depths == 0 - - def test_main_with_single_mtp_depth(self): - """Test patterns with 1 MTP depth.""" - test_cases = [ - ("M*M*/MM", "M*M*", "MM", 1), - ("MMMM/*M", "MMMM", "*M", 1), - ("M/M", "M", "M", 1), - ] - for pattern, expected_main, expected_mtp, expected_depths in test_cases: - result = parse_hybrid_pattern(pattern) - assert result.main_pattern == expected_main, f"Failed for pattern: {pattern}" - assert result.mtp_pattern == expected_mtp, f"Failed for pattern: {pattern}" - assert result.mtp_num_depths == expected_depths, f"Failed for pattern: {pattern}" - - def test_main_with_multiple_mtp_depths(self): - """Test patterns with multiple MTP depths.""" - test_cases = [ - ("M*M*/MM/MM", "M*M*", "MM", 2), - ("M*M*/MM/MM/MM", "M*M*", "MM", 3), - ("MMMM/*M/*M/*M", "MMMM", "*M", 3), - ("M*/*/*/*", "M*", "*", 3), - ("M/M/M/M/M", "M", "M", 4), - ] - for pattern, expected_main, expected_mtp, expected_depths in test_cases: - result = parse_hybrid_pattern(pattern) - assert result.main_pattern == expected_main, f"Failed for pattern: {pattern}" - assert result.mtp_pattern == expected_mtp, f"Failed for pattern: {pattern}" - assert result.mtp_num_depths == 
expected_depths, f"Failed for pattern: {pattern}" - - def test_mtp_patterns_must_be_identical(self): - """Test that mismatched MTP patterns raise ValueError.""" - invalid_patterns = [ - "M*M*/MM/M*", # MM != M* - "M*M*/MM/MM/M", # MM != M - "MMMM/*M/M*", # *M != M* - ] - for pattern in invalid_patterns: - with pytest.raises(ValueError, match="All MTP patterns must be identical"): - parse_hybrid_pattern(pattern) - - def test_invalid_symbols_in_main_pattern(self): - """Test that invalid symbols in main pattern raise ValueError.""" - invalid_patterns = [ - "M*X*", # X is not valid - "MaMM", # a is not valid - "M*M*1", # 1 is not valid - ] - for pattern in invalid_patterns: - with pytest.raises(ValueError, match="not a valid layer symbol"): - parse_hybrid_pattern(pattern) - - def test_invalid_symbols_in_mtp_pattern(self): - """Test that invalid symbols in MTP pattern raise ValueError.""" - # Single MTP depth with invalid symbol - should raise "not a valid layer symbol" - with pytest.raises(ValueError, match="not a valid layer symbol"): - parse_hybrid_pattern("M*M*/MX") # X is not valid - - # Multiple MTP depths with invalid symbol and matching patterns - with pytest.raises(ValueError, match="not a valid layer symbol"): - parse_hybrid_pattern("M*M*/Ma/Ma") # a is not valid - - # Multiple MTP depths with invalid symbol but mismatched patterns - # This raises "All MTP patterns must be identical" before checking symbols - with pytest.raises(ValueError, match="All MTP patterns must be identical"): - parse_hybrid_pattern("M*M*/MM/Ma") - - def test_empty_main_pattern_with_mtp(self): - """Test pattern that starts with / (empty main pattern).""" - result = parse_hybrid_pattern("/MM/MM") - assert result.main_pattern is None - assert result.mtp_pattern == "MM" - assert result.mtp_num_depths == 2 - - def test_trailing_separator(self): - """Test patterns with trailing separator.""" - # "M*M*/" means main="M*M*", one empty MTP pattern - result = parse_hybrid_pattern("M*M*/") - 
assert result.main_pattern == "M*M*" - # Empty string after separator means no valid MTP pattern - assert result.mtp_pattern is None - assert result.mtp_num_depths == 0 - - def test_complex_patterns(self): - """Test more complex realistic patterns.""" - test_cases = [ - # Main decoder with attention, MTP with mamba only - ("M*M*M*M*/MMM/MMM", "M*M*M*M*", "MMM", 2), - # Main decoder with MLP, MTP with attention+mamba - ("MM-MM-/*M/*M", "MM-MM-", "*M", 2), - # All attention main, mamba MTP - ("*****/M/M/M/M", "*****", "M", 4), - # MoE in main pattern - ("MEME/MM/MM", "MEME", "MM", 2), - ] - for pattern, expected_main, expected_mtp, expected_depths in test_cases: - result = parse_hybrid_pattern(pattern) - assert result.main_pattern == expected_main, f"Failed for pattern: {pattern}" - assert result.mtp_pattern == expected_mtp, f"Failed for pattern: {pattern}" - assert result.mtp_num_depths == expected_depths, f"Failed for pattern: {pattern}" - - def test_dataclass_equality(self): - """Test that ParsedHybridPattern supports equality comparison.""" - p1 = parse_hybrid_pattern("M*M*/MM/MM") - p2 = ParsedHybridPattern(main_pattern="M*M*", mtp_pattern="MM", mtp_num_depths=2) - assert p1 == p2 diff --git a/tests/unit_tests/transformer/test_multi_token_prediction.py b/tests/unit_tests/transformer/test_multi_token_prediction.py index ec72d713eb1..05fb2c4fe63 100644 --- a/tests/unit_tests/transformer/test_multi_token_prediction.py +++ b/tests/unit_tests/transformer/test_multi_token_prediction.py @@ -13,8 +13,6 @@ get_gpt_mtp_block_spec, ) from megatron.core.models.gpt.gpt_model import GPTModel -from megatron.core.models.mamba.mamba_layer_specs import mamba_stack_spec -from megatron.core.models.mamba.mamba_model import MambaModel from megatron.core.num_microbatches_calculator import destroy_num_microbatches_calculator from megatron.core.packed_seq_params import PackedSeqParams from megatron.core.parallel_state import get_context_parallel_group @@ -96,7 +94,7 @@ def 
test_constructor_local(self, tp): assert mtp.layers[i].hnorm.weight.shape[0] == config.hidden_size assert mtp.layers[i].eh_proj.weight.shape[0] == config.hidden_size / tp assert mtp.layers[i].eh_proj.weight.shape[1] == config.hidden_size * 2 - assert mtp.layers[i].mtp_model_layer is not None + assert mtp.layers[i].transformer_layer is not None num_weights = sum([p.numel() for p in mtp.parameters()]) if tp == 1: assert num_weights == 58560 * config.mtp_num_layers @@ -122,7 +120,7 @@ def test_constructor_ues_te(self, tp, cp): assert mtp.layers[i].hnorm.weight.shape[0] == config.hidden_size assert mtp.layers[i].eh_proj.weight.shape[0] == config.hidden_size / tp assert mtp.layers[i].eh_proj.weight.shape[1] == config.hidden_size * 2 - assert mtp.layers[i].mtp_model_layer is not None + assert mtp.layers[i].transformer_layer is not None num_weights = sum([p.numel() for p in mtp.parameters()]) if tp == 1: assert num_weights == 58560 * config.mtp_num_layers @@ -164,7 +162,7 @@ def model_provider( config=config, transformer_layer_spec=transformer_layer_spec, mtp_block_spec=mtp_block_spec, - vocab_size=args.vocab_size, + vocab_size=args.vocal_size, max_sequence_length=args.max_position_embeddings, pre_process=pre_process, post_process=post_process, @@ -188,7 +186,7 @@ def create_test_args( args.num_layers = 2 args.mtp_num_layers = 2 args.mtp_loss_scaling_factor = 0.1 - args.vocab_size = 128800 + args.vocal_size = 128800 args.hidden_size = 128 args.num_attention_heads = 8 args.max_position_embeddings = 256 @@ -679,259 +677,10 @@ def log(self, metrics, iteration): # Verify total_loss_dict is populated for i in range(num_layers): - assert f"mtp_{i + 1} loss" in total_loss_dict - assert total_loss_dict[f"mtp_{i + 1} loss"] == loss * loss_scale + assert f"mtp_{i+1} loss" in total_loss_dict + assert total_loss_dict[f"mtp_{i+1} loss"] == loss * loss_scale # Verify tracker is cleaned assert torch.all(MTPLossLoggingHelper.tracker["values"] == 0) assert 
MTPLossLoggingHelper.tracker["reduce_group"] is None assert MTPLossLoggingHelper.tracker["avg_group"] is None - - -class TestMultiTokenPredictionMamba: - """Test Multi-Token Prediction with Mamba hybrid models.""" - - def setup_method(self, method): - self.seq_length = 32 - self.micro_batch_size = 2 - os.environ['CUDA_DEVICE_MAX_CONNECTIONS'] = '1' - - def teardown_method(self, method): - Utils.destroy_model_parallel() - destroy_global_vars() - destroy_num_microbatches_calculator() - MTPLossLoggingHelper.tracker = {} - - def model_provider(self, pre_process=True, post_process=True, **config_kwargs): - """Model provider for Mamba hybrid models with MTP. - - Uses the unified pattern syntax where MTP is configured via hybrid_override_pattern: - Format: "///..." - Example: "M*M*/M*/M*" = main decoder "M*M*", MTP pattern "M*" with 2 depths - """ - model_parallel_cuda_manual_seed(_SEED) - args = get_args() - config = core_transformer_config_from_args(args) - - # MTP is configured via unified pattern in hybrid_override_pattern - # MambaModel creates the MTP block internally based on the parsed pattern - model = MambaModel( - config=config, - mamba_stack_spec=mamba_stack_spec, - vocab_size=args.vocab_size, - max_sequence_length=args.max_position_embeddings, - pre_process=pre_process, - post_process=post_process, - hybrid_attention_ratio=args.hybrid_attention_ratio, - hybrid_mlp_ratio=args.hybrid_mlp_ratio, - hybrid_override_pattern=args.hybrid_override_pattern, - fp16_lm_cross_entropy=args.fp16_lm_cross_entropy, - parallel_output=True, - share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights, - position_embedding_type=args.position_embedding_type, - rotary_percent=args.rotary_percent, - ) - return model - - def create_test_args( - self, tp, cp, sequence_length, micro_batch_size, fp8=None, full_recompute=False - ): - destroy_global_vars() - destroy_num_microbatches_calculator() - - sys.argv = ['test_multi_token_prediction_mamba.py'] - args = 
parse_args() - args.num_layers = 4 - args.mtp_num_layers = 2 - args.mtp_loss_scaling_factor = 0.1 - args.vocab_size = 128800 - args.hidden_size = 128 - args.num_attention_heads = 8 - args.num_query_groups = 8 - args.mamba_num_groups = 4 - args.max_position_embeddings = 256 - args.micro_batch_size = micro_batch_size - args.create_attention_mask_in_dataloader = True - args.seq_length = sequence_length - args.tensor_model_parallel_size = tp - args.sequence_parallel = True if tp > 1 else False - args.context_parallel_size = cp - args.position_embedding_type = 'rope' - args.train_iters = 1 - args.ckpt_format = 'torch_dist' - args.lr = 3e-5 - args.attention_dropout = 0.0 - args.hidden_dropout = 0.0 - args.async_tensor_model_parallel_allreduce = False - args.no_save_optim = True - args.no_load_optim = True - args.no_load_rng = True - args.bf16 = True - args.hybrid_attention_ratio = 0.5 - args.hybrid_mlp_ratio = 0.0 - # Unified pattern: "main/mtp/mtp" - main decoder "M*M*", MTP pattern "M*" with 2 depths - args.hybrid_override_pattern = "M*M*/M*/M*" - args.spec = "megatron.core.models.mamba.mamba_layer_specs.mamba_stack_spec" - - if fp8 is not None: - args.fp8 = 'e4m3' - if full_recompute: - args.recompute_granularity = 'full' - args.recompute_method = 'uniform' - args.recompute_num_layers = 1 - else: - args.recompute_granularity = None - args.add_bias_linear = False - args.swiglu = True - - validate_args(args) - set_global_variables(args, False) - return args - - def get_batch(self, seq_length, micro_batch_size): - data = list(range(seq_length)) - input_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() - labels = 1 + torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() - position_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() - attention_mask = torch.ones( - (micro_batch_size, 1, seq_length, seq_length), dtype=bool - ).cuda() - loss_mask = 
torch.ones(seq_length).repeat((micro_batch_size, 1)).cuda() - batch = { - 'tokens': input_ids, - 'labels': labels, - 'loss_mask': loss_mask, - 'attention_mask': attention_mask, - 'position_ids': position_ids, - } - return batch - - @pytest.mark.skipif(not HAVE_TE, reason="transformer_engine not available") - @pytest.mark.parametrize(("tp", "cp"), [(1, 1), (2, 1)]) - def test_sharded_state_dict_mamba(self, tp, cp): - """Test MTP with Mamba hybrid model - sharded state dict.""" - args = self.create_test_args(tp, cp, self.seq_length, self.micro_batch_size) - set_args(args) - torch.manual_seed(_SEED) - Utils.initialize_model_parallel(tensor_model_parallel_size=tp, context_parallel_size=cp) - mamba_model = get_model(self.model_provider, ModelType.encoder_or_decoder) - mamba_model = unwrap_model(mamba_model) - sharded_state_dict = mamba_model[0].sharded_state_dict() - - # Verify MTP layers are in the state dict - for i in range(args.mtp_num_layers): - assert f"mtp.layers.{i}.enorm.weight" in sharded_state_dict.keys() - assert f"mtp.layers.{i}.hnorm.weight" in sharded_state_dict.keys() - assert f"mtp.layers.{i}.eh_proj.weight" in sharded_state_dict.keys() - - @pytest.mark.skipif(not HAVE_TE, reason="transformer_engine not available") - @pytest.mark.parametrize(("tp", "cp"), [(1, 1), (2, 1)]) - def test_forward_backward_mamba(self, tmp_path_dist_ckpt, tp, cp): - """Test MTP forward and backward with Mamba hybrid model.""" - tp_ref = 1 - cp_ref = 1 - args = self.create_test_args(tp_ref, cp_ref, self.seq_length, self.micro_batch_size) - set_args(args) - torch.manual_seed(_SEED) - Utils.initialize_model_parallel( - tensor_model_parallel_size=tp_ref, context_parallel_size=cp_ref - ) - batch = self.get_batch(self.seq_length, self.micro_batch_size) - tokens, labels, loss_mask, attention_mask, position_ids = batch.values() - - mamba_model_ref, optimizer, opt_param_scheduler = setup_model_and_optimizer( - self.model_provider, ModelType.encoder_or_decoder - ) - - output_ref = 
mamba_model_ref[0].forward( - input_ids=tokens, - position_ids=position_ids, - attention_mask=attention_mask, - labels=labels, - loss_mask=loss_mask, - ) - tracker = MTPLossLoggingHelper.tracker - mtp_loss_ref = None - assert "values" in tracker - mtp_loss_ref = tracker['values'].clone() - MTPLossLoggingHelper.clean_loss_in_tracker() - - iteration = 123 - num_floating_point_operations_so_far = 456 - - def set_ckpt_path(ckpt_path): - args.save = ckpt_path - args.load = ckpt_path - - with TempNamedDir(tmp_path_dist_ckpt / 'test_mtp_mamba_model_reconfiguration') as ckpt_dir: - set_ckpt_path(ckpt_dir) - save_checkpoint( - iteration, - mamba_model_ref, - optimizer, - opt_param_scheduler, - num_floating_point_operations_so_far, - ) - - expected_ckpt_path = args.save / "iter_0000123" / ".metadata" - assert os.path.exists(expected_ckpt_path) - - Utils.destroy_model_parallel() - args = self.create_test_args(tp, cp, self.seq_length, self.micro_batch_size) - set_args(args) - set_ckpt_path(ckpt_dir) - torch.manual_seed(_SEED) - Utils.initialize_model_parallel(tensor_model_parallel_size=tp, context_parallel_size=cp) - mamba_model, optimizer, opt_param_scheduler = setup_model_and_optimizer( - self.model_provider, ModelType.encoder_or_decoder - ) - load_checkpoint(mamba_model, optimizer, opt_param_scheduler, strict=False) - - batch["output_ref"] = output_ref - batch = get_batch_on_this_cp_rank(batch) - tokens, labels, loss_mask, attention_mask, position_ids, output_ref = batch.values() - output = mamba_model[0].forward( - input_ids=tokens, - position_ids=position_ids, - attention_mask=attention_mask, - labels=labels, - loss_mask=loss_mask, - ) - tracker = MTPLossLoggingHelper.tracker - assert "values" in tracker - mtp_loss = tracker['values'].clone() - pg_collection = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['cp']) - torch.distributed.all_reduce( - mtp_loss, group=pg_collection.cp, op=torch.distributed.ReduceOp.AVG - ) - 
MTPLossLoggingHelper.clean_loss_in_tracker() - assert torch.allclose(output_ref, output, rtol=1e-03, atol=1e-03) - assert torch.allclose(mtp_loss, mtp_loss_ref, rtol=1e-02, atol=1e-02) - - assert output.shape[0] == self.micro_batch_size - assert output.shape[1] == self.seq_length / cp - - loss = output.mean() - loss.backward() - for name, param in mamba_model[0].named_parameters(): - assert param.main_grad is not None - - @pytest.mark.skipif(not HAVE_TE, reason="transformer_engine not available") - def test_attention_mask_validation_mamba(self): - """Test that attention mask type validation works for Mamba hybrid models.""" - tp = 1 - cp = 1 - args = self.create_test_args(tp, cp, self.seq_length, self.micro_batch_size) - set_args(args) - torch.manual_seed(_SEED) - Utils.initialize_model_parallel(tensor_model_parallel_size=tp, context_parallel_size=cp) - try: - mamba_model = get_model(self.model_provider, ModelType.encoder_or_decoder) - mamba_model = unwrap_model(mamba_model) - assert isinstance(mamba_model[0], MambaModel) - assert mamba_model[0].mtp is not None - except AssertionError as e: - if "Multi-Token Prediction (MTP) is not yet supported" in str(e): - pytest.fail(f"Attention mask validation failed for Mamba hybrid model: {e}") - else: - raise From 50546da7d91249dde03d639e5908eb806416f2b6 Mon Sep 17 00:00:00 2001 From: Duncan Riach <33532941+duncanriach@users.noreply.github.com> Date: Mon, 2 Feb 2026 02:18:42 -0800 Subject: [PATCH 028/231] Fix bug in SFTDataset (#3185) Co-authored-by: Antoni-Joan Solergibert --- megatron/training/datasets/sft_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/training/datasets/sft_dataset.py b/megatron/training/datasets/sft_dataset.py index b313dafb0ec..fd9d1fe7c14 100644 --- a/megatron/training/datasets/sft_dataset.py +++ b/megatron/training/datasets/sft_dataset.py @@ -146,8 +146,8 @@ def extend_with_padding(tokens, targets, positions, pad_len): max_body = pack_length pack_tokens = 
pack_tokens[:max_body] pack_targets = pack_targets[:max_body] - pack_tokens.extend(pad) - pack_targets.extend(pad) + pack_tokens.append(pad) + pack_targets.append(pad) pack_positions = pack_positions[:pack_length+1] # Note len({pack_tokens, pack_targets, pack_positions}) should be pack_length + 1 cu_seqlens[-1] = len(pack_tokens) - 1 From dff41899373d057ec8721ba454ea225d92e5695f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=84=8D=F0=9D=95=A0=F0=9D=95=9D=F0=9D=95=9D=F0=9D=95=A0?= =?UTF-8?q?=F0=9D=95=A8=20=F0=9D=95=84=F0=9D=95=92=F0=9D=95=9F?= Date: Mon, 2 Feb 2026 15:24:56 +0200 Subject: [PATCH 029/231] Fix several syntax error (#3004) Signed-off-by: Hollow Man Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: Xin Yao --- examples/multimodal/evaluation/evaluate_mmmu.py | 3 ++- megatron/legacy/model/vision/esvit_swin_backbone.py | 6 +++--- tests/test_utils/python_scripts/launch_nemo_run_workload.py | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/examples/multimodal/evaluation/evaluate_mmmu.py b/examples/multimodal/evaluation/evaluate_mmmu.py index 90cf141cd54..2bdc2ebbdc5 100644 --- a/examples/multimodal/evaluation/evaluate_mmmu.py +++ b/examples/multimodal/evaluation/evaluate_mmmu.py @@ -1,3 +1,4 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
import argparse import glob import json @@ -104,7 +105,7 @@ def mmmu_eval(input_path, groundtruth_path): print(output.stderr) print(output.stdout) - m = re.search("'Overall': {'num': \d+, 'acc': (\d.\d+)}", output.stdout) + m = re.search(r"'Overall': {'num': \d+, 'acc': (\d\.\d+)}", output.stdout) return float(m.group(1)) * 100.0 diff --git a/megatron/legacy/model/vision/esvit_swin_backbone.py b/megatron/legacy/model/vision/esvit_swin_backbone.py index 87932040cb7..c7c67b40c64 100644 --- a/megatron/legacy/model/vision/esvit_swin_backbone.py +++ b/megatron/legacy/model/vision/esvit_swin_backbone.py @@ -726,7 +726,7 @@ def init_weights(self, pretrained='', pretrained_layers=[], verbose=True): for k, v in pretrained_dict.items(): need_init = ( k.split('.')[0] in pretrained_layers - or pretrained_layers[0] is '*' + or pretrained_layers[0] == '*' or 'relative_position_index' not in k or 'attn_mask' not in k ) @@ -785,7 +785,7 @@ def freeze_pretrained_layers(self, frozen_layers=[]): if ( name.split('.')[0] in frozen_layers or '.'.join(name.split('.')[0:2]) in frozen_layers - or (len(frozen_layers) > 0 and frozen_layers[0] is '*') + or (len(frozen_layers) > 0 and frozen_layers[0] == '*') ): for _name, param in module.named_parameters(): param.requires_grad = False @@ -796,7 +796,7 @@ def freeze_pretrained_layers(self, frozen_layers=[]): for name, param in self.named_parameters(): if ( name.split('.')[0] in frozen_layers - or (len(frozen_layers) > 0 and frozen_layers[0] is '*') + or (len(frozen_layers) > 0 and frozen_layers[0] == '*') and param.requires_grad is True ): param.requires_grad = False diff --git a/tests/test_utils/python_scripts/launch_nemo_run_workload.py b/tests/test_utils/python_scripts/launch_nemo_run_workload.py index 8d006f70d19..bf4bb37aa20 100644 --- a/tests/test_utils/python_scripts/launch_nemo_run_workload.py +++ b/tests/test_utils/python_scripts/launch_nemo_run_workload.py @@ -140,10 +140,10 @@ def main( succeeded = str(job_dict["status"]) == 
"SUCCEEDED" if succeeded: - logger.info(f"Job succeeded with status: {job_dict["status"]}") + logger.info(f"Job succeeded with status: {job_dict['status']}") sys.exit(0) - logger.error(f"Job failed with status: {job_dict["status"]}") + logger.error(f"Job failed with status: {job_dict['status']}") log_file_paths = pathlib.Path(os.getcwd()).glob("assets_dir/logs/*/*/attempt_0/*/std*.log") all_ranks_all_logs = [] for log_file_path in log_file_paths: From c4bea0a96a72f26db7ddc584bd6131688e00b6c3 Mon Sep 17 00:00:00 2001 From: wdykas <73254672+wdykas@users.noreply.github.com> Date: Mon, 2 Feb 2026 19:28:36 -0500 Subject: [PATCH 030/231] Fix for RL Test (#3148) Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> --- .../data_parallel_inference_coordinator.py | 15 +- .../core/inference/engines/dynamic_engine.py | 2 + megatron/rl/rl_utils.py | 4 + .../test_grpo_training_loop.py | 5 +- .../golden_values_dev_dgx_h100.json | 320 +++++++++--------- .../model_config.yaml | 8 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- tests/test_utils/recipes/h100/gpt-grpo.yaml | 2 +- 10 files changed, 200 insertions(+), 165 deletions(-) diff --git a/megatron/core/inference/data_parallel_inference_coordinator.py b/megatron/core/inference/data_parallel_inference_coordinator.py index 9a1a11a8c2b..93e30f6aa25 100644 --- a/megatron/core/inference/data_parallel_inference_coordinator.py +++ b/megatron/core/inference/data_parallel_inference_coordinator.py @@ -74,6 +74,7 @@ def __init__( data_parallel_size: int, tokenizer, inference_coordinator_port: int | None = None, + deterministic_mode: bool = False, ): """ Initializes the inference coordinator. 
@@ -145,6 +146,12 @@ def __init__( assert identity not in self.identities_of_data_parallel_ranks self.identities_of_data_parallel_ranks.append(identity) logging.info("Inference Coordinator: Connected with data parallel ranks...") + + # In deterministic mode, sort identities for consistent scheduling order. + if deterministic_mode: + self.identities_of_data_parallel_ranks = deque( + sorted(self.identities_of_data_parallel_ranks) + ) self.data_parallel_rank_iterator = cycle(self.identities_of_data_parallel_ranks) self.data_parallel_pause_acks = set() self.data_parallel_stop_acks = set() @@ -343,6 +350,7 @@ def entrypoint( data_parallel_size: int, tokenizer, inference_coordinator_port: int | None = None, + deterministic_mode: bool = False, ): """ Class method to instantiate and run the coordinator, for use in a separate process. @@ -356,9 +364,14 @@ def entrypoint( once the coordinator is ready to accept connections. inference_coordinator_port (int): The port to bind to. data_parallel_size (int): The number of expected TP-coordinators. + deterministic_mode (bool): Whether to enable deterministic scheduling. """ coordinator = cls( - pipe_connection, data_parallel_size, tokenizer, inference_coordinator_port + pipe_connection, + data_parallel_size, + tokenizer, + inference_coordinator_port, + deterministic_mode=deterministic_mode, ) ready_event.set() try: diff --git a/megatron/core/inference/engines/dynamic_engine.py b/megatron/core/inference/engines/dynamic_engine.py index 0a95e8f4a53..a09d15ae20b 100644 --- a/megatron/core/inference/engines/dynamic_engine.py +++ b/megatron/core/inference/engines/dynamic_engine.py @@ -424,6 +424,7 @@ async def start_listening_to_data_parallel_coordinator( # Spawn a DP coordinator process and get the connection info. 
if launch_inference_coordinator and self.is_dp_coordinator: spawn_context = multiprocessing.get_context('spawn') + deterministic_mode = torch.are_deterministic_algorithms_enabled() dp_pipe, dp_process_pipe = spawn_context.Pipe() coordinator_ready_event = spawn_context.Event() self.inference_coordinator_process = spawn_context.Process( @@ -434,6 +435,7 @@ async def start_listening_to_data_parallel_coordinator( get_pg_size(self.pg_collection.dp), self.controller.tokenizer, inference_coordinator_port, + deterministic_mode, ), ) self.inference_coordinator_process.start() diff --git a/megatron/rl/rl_utils.py b/megatron/rl/rl_utils.py index 364a80db81e..3058db78f41 100644 --- a/megatron/rl/rl_utils.py +++ b/megatron/rl/rl_utils.py @@ -518,6 +518,10 @@ def get_environment_rollouts( rollouts = [ loop.run_until_complete(anext(rollout_generator)) for _ in range(n_prompts) ] + # In deterministic mode, sort rollouts by problem_id for consistent ordering + # regardless of completion order due to system timing jitter. 
+ if torch.are_deterministic_algorithms_enabled(): + rollouts.sort(key=lambda group: group[0].problem_id if group and group[0].problem_id else "") if not args.rl_partial_rollouts: while True: try: diff --git a/tests/functional_tests/python_test_utils/test_grpo_training_loop.py b/tests/functional_tests/python_test_utils/test_grpo_training_loop.py index 6faca9b11b3..b4447f5f761 100644 --- a/tests/functional_tests/python_test_utils/test_grpo_training_loop.py +++ b/tests/functional_tests/python_test_utils/test_grpo_training_loop.py @@ -84,9 +84,8 @@ def test_grpo_training_loop( with open(model_config_path, 'r') as f: model_config = yaml.safe_load(f) metrics = model_config["METRICS"] - if "THROUGHPUT_TEST_PARAMS" in model_config: - throughput_test_params = model_config["THROUGHPUT_TEST_PARAMS"] - start_step = throughput_test_params["--start_step"] + if "ENV_VARS" in model_config and "THROUGHPUT_START_STEP" in model_config["ENV_VARS"]: + start_step = model_config["ENV_VARS"]["THROUGHPUT_START_STEP"] else: start_step = 1 diff --git a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp2tp1_pp4pp2_dp8_583m_throughputtest/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp2tp1_pp4pp2_dp8_583m_throughputtest/golden_values_dev_dgx_h100.json index d985f671cab..05bc35e362f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp2tp1_pp4pp2_dp8_583m_throughputtest/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp2tp1_pp4pp2_dp8_583m_throughputtest/golden_values_dev_dgx_h100.json @@ -1,173 +1,187 @@ { + "lm loss": { + "start_step": 1, + "end_step": 30, + "step_interval": 1, + "values": { + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.04559, + "5": 0.0, + "6": 0.0523, + "7": 0.0, + "8": 0.0, + "9": 0.04887, + "10": 0.0, + "11": 0.0, + "12": 0.0, + "13": 0.0, + "14": 0.0, + "15": 0.0, + "16": 0.0, + "17": 0.04299, + "18": 0.0, + "19": 0.03797, + "20": 0.0, + "21": 0.0, + "22": 0.0, + "23": 0.0, + "24": 0.0, + 
"25": 0.0, + "26": 0.0, + "27": 0.0, + "28": 0.0, + "29": 0.0, + "30": 0.0 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 30, + "step_interval": 1, + "values": { + "1": 583687296.0, + "2": 583687296.0, + "3": 583687296.0, + "4": 31.0, + "5": 583687296.0, + "6": 12.0, + "7": 583687296.0, + "8": 583687296.0, + "9": 16.0, + "10": 583687296.0, + "11": 583687296.0, + "12": 583687296.0, + "13": 583687296.0, + "14": 583687296.0, + "15": 583687296.0, + "16": 583687296.0, + "17": 47.0, + "18": 583687296.0, + "19": 43.0, + "20": 583687296.0, + "21": 583687296.0, + "22": 583687296.0, + "23": 583687296.0, + "24": 583687296.0, + "25": 583687296.0, + "26": 583687296.0, + "27": 583687296.0, + "28": 583687296.0, + "29": 583687296.0, + "30": 583687296.0 + } + }, "mem-allocated-bytes": { "start_step": 1, - "end_step": 50, + "end_step": 30, "step_interval": 1, "values": { - "1": 48967716864.0, - "2": 48973631488.0, - "3": 48974528512.0, - "4": 48971538432.0, - "5": 48974340096.0, - "6": 48974143488.0, - "7": 48977002496.0, - "8": 48975851520.0, - "9": 48974036992.0, - "10": 48973709312.0, - "11": 48973262848.0, - "12": 48973705216.0, - "13": 48973598720.0, - "14": 48976703488.0, - "15": 48975118336.0, - "16": 48977072128.0, - "17": 48976465920.0, - "18": 48976470016.0, - "19": 48976478208.0, - "20": 48976654336.0, - "21": 48976793600.0, - "22": 48976052224.0, - "23": 48976277504.0, - "24": 48974708736.0, - "25": 48973062144.0, - "26": 48976236544.0, - "27": 48975970304.0, - "28": 48976711680.0, - "29": 48975593472.0, - "30": 48977321984.0, - "31": 48977506304.0, - "32": 48976646144.0, - "33": 48976072704.0, - "34": 48973631488.0, - "35": 48976650240.0, - "36": 48975650816.0, - "37": 48974950400.0, - "38": 48972750848.0, - "39": 48976617472.0, - "40": 48979308544.0, - "41": 48978587648.0, - "42": 48975626240.0, - "43": 48975089664.0, - "44": 48973688832.0, - "45": 48975327232.0, - "46": 48975159296.0, - "47": 48975372288.0, - "48": 48973856768.0, - "49": 48973377536.0, - 
"50": 48975568896.0 + "1": 48985034752.0, + "2": 48991363072.0, + "3": 48993005568.0, + "4": 48991928320.0, + "5": 48992874496.0, + "6": 48991891456.0, + "7": 48991338496.0, + "8": 48993873920.0, + "9": 48993124352.0, + "10": 48994115584.0, + "11": 48994050048.0, + "12": 48993181696.0, + "13": 48993918976.0, + "14": 48992014336.0, + "15": 48992256000.0, + "16": 48989933568.0, + "17": 48992645120.0, + "18": 48992890880.0, + "19": 48992821248.0, + "20": 48992821248.0, + "21": 48991612928.0, + "22": 48993181696.0, + "23": 48992821248.0, + "24": 48992821248.0, + "25": 48993931264.0, + "26": 48992022528.0, + "27": 48993173504.0, + "28": 48992821248.0, + "29": 48993935360.0, + "30": 48994017280.0 } }, "mem-max-allocated-bytes": { "start_step": 1, - "end_step": 50, + "end_step": 30, "step_interval": 1, "values": { - "1": 49090379776.0, - "2": 49937022976.0, - "3": 49938366464.0, - "4": 49938366464.0, - "5": 49938366464.0, - "6": 49938698240.0, - "7": 49939156992.0, - "8": 49939156992.0, - "9": 49939156992.0, - "10": 49939156992.0, - "11": 49939156992.0, - "12": 49939156992.0, - "13": 49939156992.0, - "14": 49940287488.0, - "15": 49940287488.0, - "16": 49940287488.0, - "17": 49941729280.0, - "18": 49941733376.0, - "19": 49941741568.0, - "20": 49941778432.0, - "21": 49941778432.0, - "22": 49941778432.0, - "23": 49941778432.0, - "24": 49941778432.0, - "25": 49941778432.0, - "26": 49941778432.0, - "27": 49941934080.0, - "28": 49941934080.0, - "29": 49941934080.0, - "30": 49941934080.0, - "31": 49942675456.0, - "32": 49942675456.0, - "33": 49942675456.0, - "34": 49942675456.0, - "35": 49942675456.0, - "36": 49942675456.0, - "37": 49942675456.0, - "38": 49942675456.0, - "39": 49942675456.0, - "40": 49944379392.0, - "41": 49944379392.0, - "42": 49944379392.0, - "43": 49944379392.0, - "44": 49944379392.0, - "45": 49944379392.0, - "46": 49944379392.0, - "47": 49944379392.0, - "48": 49944379392.0, - "49": 49944379392.0, - "50": 49944379392.0 + "1": 49104257024.0, + "2": 
49953497088.0, + "3": 49955368960.0, + "4": 49955368960.0, + "5": 49955368960.0, + "6": 49955368960.0, + "7": 49955368960.0, + "8": 49955745792.0, + "9": 49955745792.0, + "10": 49957498880.0, + "11": 49957838848.0, + "12": 49957838848.0, + "13": 49957838848.0, + "14": 49957838848.0, + "15": 49957838848.0, + "16": 49957838848.0, + "17": 49957838848.0, + "18": 49957838848.0, + "19": 49957838848.0, + "20": 49957838848.0, + "21": 49957838848.0, + "22": 49957838848.0, + "23": 49957838848.0, + "24": 49957838848.0, + "25": 49957838848.0, + "26": 49957838848.0, + "27": 49957838848.0, + "28": 49957838848.0, + "29": 49957838848.0, + "30": 49957838848.0 } }, "iteration-time": { "start_step": 1, - "end_step": 50, + "end_step": 30, "step_interval": 1, "values": { - "1": 63.07516, - "2": 4.36236, - "3": 3.83222, - "4": 3.85784, - "5": 3.74494, - "6": 3.82661, - "7": 4.05458, - "8": 3.76622, - "9": 3.90518, - "10": 4.09283, - "11": 3.96358, - "12": 3.85778, - "13": 3.84546, - "14": 3.85497, - "15": 4.35749, - "16": 3.7861, - "17": 3.8896, - "18": 3.6267, - "19": 3.76463, - "20": 3.6953, - "21": 3.63427, - "22": 3.66652, - "23": 3.60379, - "24": 3.57701, - "25": 3.57327, - "26": 3.71371, - "27": 3.69626, - "28": 3.89285, - "29": 3.62405, - "30": 3.58297, - "31": 3.56993, - "32": 3.75257, - "33": 3.72279, - "34": 3.48095, - "35": 3.60831, - "36": 3.74971, - "37": 3.72155, - "38": 3.51054, - "39": 3.64562, - "40": 3.66038, - "41": 3.86018, - "42": 3.58341, - "43": 3.82647, - "44": 3.85728, - "45": 3.62416, - "46": 3.59141, - "47": 3.74512, - "48": 3.61762, - "49": 3.57079, - "50": 3.66209 + "1": "nan", + "2": 54.85374, + "3": 4.04314, + "4": 3.83505, + "5": 4.00853, + "6": 3.71939, + "7": 3.66436, + "8": 4.07479, + "9": 3.90049, + "10": 4.34491, + "11": 3.98659, + "12": 3.90765, + "13": 4.12679, + "14": 3.75558, + "15": 3.72381, + "16": 3.45749, + "17": 3.73387, + "18": 3.71406, + "19": 3.75517, + "20": 3.94287, + "21": 3.88534, + "22": 3.86744, + "23": 3.87809, + "24": 3.86352, + 
"25": 3.87829, + "26": 3.76391, + "27": 3.76762, + "28": 3.96514, + "29": 3.92952, + "30": 3.87378 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp2tp1_pp4pp2_dp8_583m_throughputtest/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp2tp1_pp4pp2_dp8_583m_throughputtest/model_config.yaml index b74417a898b..b5788d64049 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp2tp1_pp4pp2_dp8_583m_throughputtest/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp2tp1_pp4pp2_dp8_583m_throughputtest/model_config.yaml @@ -1,5 +1,6 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 + THROUGHPUT_START_STEP: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Ring CUBLAS_WORKSPACE_CONFIG: :4096:8 @@ -70,7 +71,7 @@ MODEL_ARGS: --dist-ckpt-strictness: log_unexpected --perform-rl-step: true --train-samples: 48828125 - --exit-interval: 50 + --exit-interval: 30 --tensorboard-dir: ${TENSORBOARD_PATH} --save-interval: 1000000 --eval-interval: 1000000 @@ -79,6 +80,11 @@ MODEL_ARGS: --rl-inference-tensor-model-parallel-size: 1 --rl-inference-pipeline-model-parallel-size: 2 --refit-method: gloo + --deterministic-mode: true METRICS: + - "iteration-time" + - "lm loss" + - "num-zeros" - "mem-allocated-bytes" - "mem-max-allocated-bytes" + diff --git a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput/model_config.yaml index 3037e2e0803..ada0350b876 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput/model_config.yaml @@ -1,5 +1,6 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 + THROUGHPUT_START_STEP: 10 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Ring CUBLAS_WORKSPACE_CONFIG: :4096:8 @@ -99,5 +100,3 @@ METRICS: - "mem-allocated-bytes" - "mem-max-allocated-bytes" - 
"iteration-time" -THROUGHPUT_TEST_PARAMS: - --start_step: 10 diff --git a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput_github/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput_github/model_config.yaml index 456280fdb04..4490ced3988 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput_github/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput_github/model_config.yaml @@ -1,5 +1,6 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 + THROUGHPUT_START_STEP: 10 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Ring CUBLAS_WORKSPACE_CONFIG: :4096:8 @@ -99,5 +100,3 @@ METRICS: - "mem-allocated-bytes" - "mem-max-allocated-bytes" - "iteration-time" -THROUGHPUT_TEST_PARAMS: - --start_step: 10 diff --git a/tests/functional_tests/test_cases/moe/gpt_grpo_tp8tp4_pp1_ep8ep2_dp8_throughputtest/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt_grpo_tp8tp4_pp1_ep8ep2_dp8_throughputtest/model_config.yaml index ed5d123892e..c7dcfa594d8 100644 --- a/tests/functional_tests/test_cases/moe/gpt_grpo_tp8tp4_pp1_ep8ep2_dp8_throughputtest/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt_grpo_tp8tp4_pp1_ep8ep2_dp8_throughputtest/model_config.yaml @@ -1,5 +1,6 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 + THROUGHPUT_START_STEP: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Ring CUBLAS_WORKSPACE_CONFIG: :4096:8 @@ -137,5 +138,3 @@ METRICS: - "mem-allocated-bytes" - "mem-max-allocated-bytes" -THROUGHPUT_TEST_PARAMS: - --start_step: 1 diff --git a/tests/test_utils/recipes/h100/gpt-grpo.yaml b/tests/test_utils/recipes/h100/gpt-grpo.yaml index e707c1c2431..faaccee73dd 100644 --- a/tests/test_utils/recipes/h100/gpt-grpo.yaml +++ b/tests/test_utils/recipes/h100/gpt-grpo.yaml @@ -77,5 +77,5 @@ products: - test_case: [gpt_grpo_tp2tp1_pp4pp2_dp8_583m_throughputtest] products: - environment: [dev] - scope: [mr-broken] + scope: [mr] 
platforms: [dgx_h100] From a4008d0f2f807202217204a61d26f36731450f08 Mon Sep 17 00:00:00 2001 From: Robin Zhang Date: Tue, 3 Feb 2026 09:56:29 +0800 Subject: [PATCH 031/231] Fix latent moe flops and backward_dw (#2977) Signed-off-by: Robin Zhang --- megatron/core/transformer/moe/moe_layer.py | 19 +++++++++++++++++-- megatron/training/training.py | 16 +++++++++++++++- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/megatron/core/transformer/moe/moe_layer.py b/megatron/core/transformer/moe/moe_layer.py index 5cfea1e8ae4..aa5f9658df4 100644 --- a/megatron/core/transformer/moe/moe_layer.py +++ b/megatron/core/transformer/moe/moe_layer.py @@ -451,10 +451,25 @@ def custom_forward(hidden_states, intermediate_tensors=None, padding_mask=None): def backward_dw(self, routed_experts: bool = True, shared_experts: bool = False): """Compute weight gradients for experts and shared experts.""" + # TODO(Wohox): replace the "routed_experts" and "shared_experts" arguments with better + # naming to better explain that they are actually from different fine-grained callables, + # or use scanning to decide which backward_dw should be called. if routed_experts: self.experts.backward_dw() - if shared_experts and self.use_shared_expert and not self.shared_expert_overlap: - self.shared_experts.backward_dw() + if self.config.moe_latent_size: + # TODO(Wohox): fc2_latent_proj forward and backward are executed in comm stream, + # so we execute its backward_dw in the comm stream too. But this may harm the + # EP overlap performance. Better to check if there is a better way to handle this. 
+ from megatron.core.pipeline_parallel.utils import get_comm_stream + + comm_stream = get_comm_stream() + with torch.cuda.stream(comm_stream): + self.fc2_latent_proj.backward_dw() + if shared_experts: + if self.use_shared_expert and not self.shared_expert_overlap: + self.shared_experts.backward_dw() + if self.config.moe_latent_size: + self.fc1_latent_proj.backward_dw() def set_for_recompute_pre_mlp_layernorm(self): """Set the MoE layer for recompute pre_mlp_layernorm. Only needed for fp8/fp4.""" diff --git a/megatron/training/training.py b/megatron/training/training.py index 87d9fe8b841..7c394c7b266 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -364,6 +364,7 @@ def transformer_flops(): if args.moe_ffn_hidden_size is not None else args.ffn_hidden_size ) + moe_latent_size = args.moe_latent_size shared_expert_ffn_hidden_size = ( 0 if args.moe_shared_expert_intermediate_size is None @@ -545,7 +546,20 @@ def transformer_flops(): (args.ffn_hidden_size * ffn_expansion_factor) * num_dense_layers # routed experts - + (moe_ffn_hidden_size * num_experts_routed_to * ffn_expansion_factor) + + ( + (moe_ffn_hidden_size * num_experts_routed_to * ffn_expansion_factor) + if moe_latent_size is None + else ( + ( + moe_ffn_hidden_size + * num_experts_routed_to + * ffn_expansion_factor + * moe_latent_size + / args.hidden_size + ) # Routed experts run on moe_latent_size. + + 2 * moe_latent_size # Up proj and down proj. + ) + ) * num_moe_layers # Shared Experts. 
+ (shared_expert_ffn_hidden_size * ffn_expansion_factor) From afe443bc4254762e4d91031bbfd074b6ba531d15 Mon Sep 17 00:00:00 2001 From: Sheng Fu Date: Mon, 2 Feb 2026 18:38:10 -0800 Subject: [PATCH 032/231] Use global user buffer when the bucket size does not fit FixedPoolAllocator (#2857) --- .../distributed_data_parallel_config.py | 11 ++- .../distributed_data_parallel_config.py | 9 ++ .../fsdp/src/megatron_fsdp/fully_shard.py | 8 ++ .../fsdp/src/megatron_fsdp/megatron_fsdp.py | 5 + .../megatron_fsdp/param_and_grad_buffer.py | 92 +++++++++++++------ 5 files changed, 98 insertions(+), 27 deletions(-) diff --git a/megatron/core/distributed/distributed_data_parallel_config.py b/megatron/core/distributed/distributed_data_parallel_config.py index eaec971c79c..c4b25b9f85c 100644 --- a/megatron/core/distributed/distributed_data_parallel_config.py +++ b/megatron/core/distributed/distributed_data_parallel_config.py @@ -122,7 +122,16 @@ class DistributedDataParallelConfig: This option will cause additional memory overhead, however, it is necessary for to register user buffer (nccl_ub=True) for the Megatron FSDP. This option will be automatically set to True when nccl_ub=True. - """ + """ + + fsdp_db_use_persist_buf_on_alloc_fail: bool = False + """Whether to fall back to persistent buffer when a bucket does not + fit FSDP double buffer size. If true, FSDP will use the persistently + allocated buffer for the bucket that does not fit, it will enable NCCL + user buffer with the cost of more memory usage. If false, FSDP will use + Dynamic memory allocator, NCCL user buffer won't not enabled, which + usually leads to low performance. 
+ """ outer_dp_sharding_strategy: str = 'no_shard' """ diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py index f0c817e1f80..e8d3c990332 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py @@ -119,6 +119,15 @@ class DistributedDataParallelConfig: This option will be automatically set to True when nccl_ub=True. """ + fsdp_db_use_persist_buf_on_alloc_fail: bool = False + """Whether to fall back to persistent buffer when a bucket does not + fit FSDP double buffer size. If true, FSDP will use the persistently + allocated buffer for the bucket that does not fit, it will enable NCCL + user buffer at the cost of more memory usage. If false, FSDP will use + Dynamic memory allocator, NCCL user buffer won't be enabled, which + usually leads to low performance. + """ + outer_dp_sharding_strategy: str = 'no_shard' """ Sharding strategy for outer data parallel group in Hybrid Sharded Data Parallel (HSDP) mode. diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py index df210f15f05..bcfbefcbd3b 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py @@ -96,6 +96,7 @@ def fully_shard_model( keep_fp8_transpose_cache: bool = False, nccl_ub: bool = False, fsdp_double_buffer: bool = False, + fsdp_db_use_persist_buf_on_alloc_fail: bool = False, disable_symmetric_registration: bool = False, enable_fine_grained_param_gather: bool = False, ) -> torch.nn.Module: @@ -230,6 +231,10 @@ class that schedules the sharding lifecycle of the model parameters and gradient fsdp_double_buffer (bool): Whether to use double buffer for FSDP.
Defaults to False. + fsdp_db_use_persist_buf_on_alloc_fail (bool): + Whether to fall back to persistent buffer allocator when a bucket does not + fit FSDP double buffer size. + disable_symmetric_registration (bool): Whether to disable symmetric (window) registration for NCCL UB registration. This option forces conventional (local) UB registration when nccl_ub is set. @@ -335,6 +340,7 @@ class that schedules the sharding lifecycle of the model parameters and gradient keep_fp8_transpose_cache=keep_fp8_transpose_cache, # pylint: disable=C0301 nccl_ub=nccl_ub, fsdp_double_buffer=fsdp_double_buffer or nccl_ub, + fsdp_db_use_persist_buf_on_alloc_fail=fsdp_db_use_persist_buf_on_alloc_fail, disable_symmetric_registration=disable_symmetric_registration, check_for_nan_in_grad=check_for_nan_in_grad, ) @@ -540,6 +546,7 @@ def fully_shard( keep_fp8_transpose_cache: bool = False, nccl_ub: bool = False, fsdp_double_buffer: bool = False, + fsdp_db_use_persist_buf_on_alloc_fail: bool = False, disable_symmetric_registration: bool = False, enable_fine_grained_param_gather: bool = False, ) -> tuple[MegatronFSDP, torch.optim.Optimizer]: @@ -587,6 +594,7 @@ def fully_shard( keep_fp8_transpose_cache=keep_fp8_transpose_cache, nccl_ub=nccl_ub, fsdp_double_buffer=fsdp_double_buffer, + fsdp_db_use_persist_buf_on_alloc_fail=fsdp_db_use_persist_buf_on_alloc_fail, disable_symmetric_registration=disable_symmetric_registration, enable_fine_grained_param_gather=enable_fine_grained_param_gather, ) diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py index bd13e76379e..03bb4d0dcfe 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py @@ -136,6 +136,8 @@ class MegatronFSDP(torch.nn.Module): fsdp_double_buffer (bool): Whether to use persistently allocated double buffers for the temporary memory needed in the FSDP 
communication. This flag is automatically set to True when nccl_ub is True. + fsdp_db_use_persist_buf_on_alloc_fail (bool): Whether to fall back to persistent buffer + allocator when a bucket does not fit FSDP double buffer size. disable_symmetric_registration (bool): Whether to disable symmetric (window) registration for NCCL userbuffer registration. This option will force to use conventional (local) userbuffer registration when nccl_ub is set. @@ -155,6 +157,7 @@ class MegatronFSDP(torch.nn.Module): ... keep_fp8_transpose_cache=False, ... nccl_ub=False, ... fsdp_double_buffer=False, + ... fsdp_db_use_persist_buf_on_alloc_fail=False, ... disable_symmetric_registration=False, ... ) """ @@ -173,6 +176,7 @@ def __init__( keep_fp8_transpose_cache: bool = False, nccl_ub: bool = False, fsdp_double_buffer: bool = False, + fsdp_db_use_persist_buf_on_alloc_fail: bool = False, disable_symmetric_registration: bool = False, enable_fine_grained_param_gather_hook: bool = False, ): @@ -217,6 +221,7 @@ def __init__( keep_fp8_transpose_cache=keep_fp8_transpose_cache, # pylint: disable=C0301 nccl_ub=nccl_ub, fsdp_double_buffer=fsdp_double_buffer or nccl_ub, + fsdp_db_use_persist_buf_on_alloc_fail=fsdp_db_use_persist_buf_on_alloc_fail, disable_symmetric_registration=disable_symmetric_registration, ) else: diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py index 3ec117ebd9e..fe25026b22e 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py @@ -658,7 +658,13 @@ class FixedPoolAllocator(TemporaryBucketAllocator): deallocation of temporary buffers during FSDP operations. 
""" - def __init__(self, name: str, fsdp_param_groups: List["ParameterGroup"], size: int = 2): + def __init__( + self, + name: str, + fsdp_param_groups: List["ParameterGroup"], + size: int = 2, + fallback_to_persistent_buffer: bool = False, + ): self.name = name self.fsdp_param_groups = fsdp_param_groups self.size = size # Number of buffers in the pool (default is 2 for double buffering) @@ -691,6 +697,29 @@ def __init__(self, name: str, fsdp_param_groups: List["ParameterGroup"], size: i ), "Found no FSDP units to use fixed-size buffering" self.fsdp_double_buffer_units = fsdp_units_to_double_buffer + if torch.distributed.get_rank() == 0: + for bucket_id, param_group in enumerate(fsdp_param_groups): + if ( + param_group.fsdp_unit_id == -1 + or param_group.fsdp_unit_id is None + or param_group.fsdp_unit_id not in self.fsdp_double_buffer_units + ): + logging.info( + f"FSDP unit (id={param_group.fsdp_unit_id}) does not fit " + "in FixedPoolAllcator" + ) + if fallback_to_persistent_buffer is False: + logging.info( + "It will fall back to dynamic memory allocator, NCCL user " + "buffer is not supported" + ) + else: + logging.info( + "It will be allocated a persistent buffer. If the memory " + "budget is tight, set " + "trainer.strategy.ddp.fsdp_db_use_persist_buf_on_alloc_fail to False." + ) + # Initialize buffer group status. # Each buffer group represents a set of buffers associated with an FSDP unit's bucket group. self.idle_buffer = [] # List of available (buf_group_id, offset) tuples. @@ -703,6 +732,7 @@ def __init__(self, name: str, fsdp_param_groups: List["ParameterGroup"], size: i self.idle_buffer.append((buf_group_id, bucket_offset)) # Fallback allocator used if the fixed pool allocator cannot fulfill a request. 
+ self.fallback_to_persistent_buffer = fallback_to_persistent_buffer self.backup_allocator = TemporaryBucketAllocator() def _is_two_bucket_group_equal(self, group_a, group_b): @@ -755,28 +785,31 @@ def allocate( f"current using_buffer: {self.using_buffer} \n" f"current idle_buffer: {self.idle_buffer}" ) - # Synchronization is required before the allocation for the user buffer - if mem_alloc_context is not None and mem_alloc_context != nullcontext: - # Check if a new buffer allocation is required - if ( - self.allocation_tracker.get((buffer_name, dtype), None) is None - or self.allocation_tracker[(buffer_name, dtype)] < size - ): - # Requires synchronization for new buffer allocation - self.allocation_tracker[(buffer_name, dtype)] = size - torch.cuda.synchronize() - return Bucket( - data=get_global_memory_buffer().get_tensor( - [size], dtype=dtype, name=buffer_name, mem_alloc_context=mem_alloc_context - ) + elif self.fallback_to_persistent_buffer is True: + buffer_name = f"{self.name}_not_fit_in_fixed_pool_{bucket_id}_{size}_{dtype}_{device}" + else: + # If the bucket is not eligible for fixed pool buffering, or no buffer is available, + # fall back to dynamic allocation via the backup allocator. This means that we + # will do dynamic memory allocation. + logging.debug(f"[FSDP] Using backup allocator for {bucket_id} {fsdp_unit_id}") + return self.backup_allocator.allocate( + bucket_id=bucket_id, size=size, dtype=dtype, device=device ) - # If the bucket is not eligible for fixed pool buffering, or no buffer is available, - # fall back to dynamic allocation via the backup allocator. This means that we - # will do dynamic memory allocation. - logging.debug(f"[FSDP] Using backup allocator for {bucket_id} {fsdp_unit_id}") - return self.backup_allocator.allocate( - bucket_id=bucket_id, size=size, dtype=dtype, device=device + # Use buffer_name to get memory from global memory. 
+ if mem_alloc_context is not None and mem_alloc_context != nullcontext: + # Check if a new buffer allocation is required + if ( + self.allocation_tracker.get((buffer_name, dtype), None) is None + or self.allocation_tracker[(buffer_name, dtype)] < size + ): + # Requires synchronization for new buffer allocation + self.allocation_tracker[(buffer_name, dtype)] = size + torch.cuda.synchronize() + return Bucket( + data=get_global_memory_buffer().get_tensor( + [size], dtype=dtype, name=buffer_name, mem_alloc_context=mem_alloc_context + ) ) def _get_gbuf_name(self, buf_group_id: int, bucket_index: int): @@ -795,9 +828,10 @@ def free(self, bucket_id: int): self.idle_buffer.append(self.using_buffer[bucket_id]) del self.using_buffer[bucket_id] return - # If not managed by fixed pool allocator, delegate to the backup allocator. - logging.debug(f"[FSDP] Free from the backup allocator for {bucket_id} {fsdp_unit_id}") - self.backup_allocator.free(bucket_id) + if self.fallback_to_persistent_buffer is False: + # If not managed by fixed pool allocator, delegate to the backup allocator. 
+ logging.debug(f"[FSDP] Free from the backup allocator for {bucket_id} {fsdp_unit_id}") + self.backup_allocator.free(bucket_id) class DataParallelBuffer: @@ -1874,7 +1908,10 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): if self.ddp_config.fsdp_double_buffer and len(self.bucketing_policy.fsdp_unit_modules) > 0: UB_BUFFER_NUM = 2 self.weight_alloc = FixedPoolAllocator( - name="fsdp_params", fsdp_param_groups=self.parameter_groups, size=UB_BUFFER_NUM + name="fsdp_params", + fsdp_param_groups=self.parameter_groups, + size=UB_BUFFER_NUM, + fallback_to_persistent_buffer=self.ddp_config.fsdp_db_use_persist_buf_on_alloc_fail, ) self.transpose_weight_alloc = FixedPoolAllocator( name="fsdp_fp8_transpose_params", @@ -1882,7 +1919,10 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): size=UB_BUFFER_NUM, ) self.main_grad_alloc = FixedPoolAllocator( - name="fsdp_grads", fsdp_param_groups=self.parameter_groups, size=UB_BUFFER_NUM + name="fsdp_grads", + fsdp_param_groups=self.parameter_groups, + size=UB_BUFFER_NUM, + fallback_to_persistent_buffer=self.ddp_config.fsdp_db_use_persist_buf_on_alloc_fail, ) self.double_buf_units = self.weight_alloc.fsdp_double_buffer_units else: From 78475fe309e516560f7699c739cef1e61e20e501 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 3 Feb 2026 10:58:28 +0100 Subject: [PATCH 033/231] ci: Checkpoint retention (#3205) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .../test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml | 3 ++- .../model_config.yaml | 2 +- .../model_config.yaml | 4 ++-- .../model_config.yaml | 2 +- .../model_config.yaml | 4 ++-- 5 files changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml index 
87a4fccb347..4c05b0ba87f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml @@ -70,7 +70,8 @@ MODEL_ARGS: # Add checkpointing args --load: ${CHECKPOINT_LOAD_PATH} --save: ${CHECKPOINT_SAVE_PATH} - --save-interval: 1000 + --save-interval: 5000 + --save-retain-interval: 2500 # Add initialization args --init-method-std: 0.0134 # Add logging args diff --git a/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_gb_200_release/model_config.yaml b/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_gb_200_release/model_config.yaml index ced409e5b1e..d169e050402 100644 --- a/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_gb_200_release/model_config.yaml +++ b/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_gb_200_release/model_config.yaml @@ -134,7 +134,7 @@ MODEL_ARGS: # Add checkpointing args --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --save-interval: 500 + --save-interval: 5000 --save-retain-interval: 10000 --dist-ckpt-strictness: log_all diff --git a/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_gb_200_release_sm/model_config.yaml b/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_gb_200_release_sm/model_config.yaml index 1ad8597d932..2ac08d088a0 100644 --- a/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_gb_200_release_sm/model_config.yaml +++ b/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_gb_200_release_sm/model_config.yaml @@ -134,8 +134,8 @@ MODEL_ARGS: # Add checkpointing args --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --save-interval: 500 - --save-retain-interval: 10000 + --save-interval: 5000 + --save-retain-interval: 
2500 --dist-ckpt-strictness: log_all # Add initialization args diff --git a/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_release/model_config.yaml b/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_release/model_config.yaml index 7bc14780fb3..3c7c4201b6e 100644 --- a/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_release/model_config.yaml +++ b/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_release/model_config.yaml @@ -132,7 +132,7 @@ MODEL_ARGS: # Add checkpointing args --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --save-interval: 500 + --save-interval: 5000 --save-retain-interval: 10000 --dist-ckpt-strictness: log_all diff --git a/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_release_sm/model_config.yaml b/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_release_sm/model_config.yaml index cc8f2b814c2..fead6c06ae1 100644 --- a/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_release_sm/model_config.yaml +++ b/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_release_sm/model_config.yaml @@ -132,8 +132,8 @@ MODEL_ARGS: # Add checkpointing args --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --save-interval: 500 - --save-retain-interval: 10000 + --save-interval: 5000 + --save-retain-interval: 2500 --dist-ckpt-strictness: log_all # Add initialization args From 708069774569517cc1802e6c339730a689179d4d Mon Sep 17 00:00:00 2001 From: Venmugil Elango <498703+venmugil@users.noreply.github.com> Date: Tue, 3 Feb 2026 02:18:32 -0800 Subject: [PATCH 034/231] Add unit test for LatentMoE (#2892) Co-authored-by: Xin Yao --- .../transformer/moe/test_latent_moe_layer.py | 104 ++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 
tests/unit_tests/transformer/moe/test_latent_moe_layer.py diff --git a/tests/unit_tests/transformer/moe/test_latent_moe_layer.py b/tests/unit_tests/transformer/moe/test_latent_moe_layer.py new file mode 100644 index 00000000000..f62de67860a --- /dev/null +++ b/tests/unit_tests/transformer/moe/test_latent_moe_layer.py @@ -0,0 +1,104 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import pytest +import torch + +from megatron.core.models.gpt.gpt_layer_specs import ( + get_gpt_decoder_block_spec, + get_gpt_layer_local_spec, + get_gpt_layer_with_transformer_engine_spec, +) +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.moe.moe_layer import MoELayer +from megatron.core.transformer.moe.router import Router +from megatron.core.transformer.transformer_block import TransformerBlock +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import is_te_min_version +from megatron.training.initialize import _set_random_seed +from tests.unit_tests.test_utilities import Utils + + +class TestLatentMoELayer: + def setup_method(self, method): + pass + + @pytest.mark.skipif( + not is_te_min_version("1.7.0.dev0"), + reason="Expert with TE Linear is only supported in TE 1.7.0 and later.", + ) + @pytest.mark.parametrize("moe_token_dispatcher_type", ["allgather", "alltoall"]) + @pytest.mark.parametrize("num_moe_experts", [4]) + @pytest.mark.parametrize("use_te,grouped_gemm", [(True, True), (True, False), (False, False)]) + @pytest.mark.parametrize("moe_latent_size", [8, 16]) + def test_latent_moe_layer( + self, num_moe_experts, moe_token_dispatcher_type, use_te, grouped_gemm, moe_latent_size + ): + Utils.initialize_model_parallel(1, 1) + _set_random_seed(seed_=123, data_parallel_random_init=False) + self.transformer_config = TransformerConfig( + num_layers=1, + hidden_size=32, + num_attention_heads=4, + num_moe_experts=num_moe_experts, + 
use_cpu_initialization=True, + moe_token_dispatcher_type=moe_token_dispatcher_type, + moe_router_topk=2, + moe_aux_loss_coeff=0.01, + moe_grouped_gemm=grouped_gemm, + moe_ffn_hidden_size=128, + moe_shared_expert_intermediate_size=128, + activation_func=torch.nn.functional.silu, + gated_linear_unit=True, + add_bias_linear=False, + moe_latent_size=moe_latent_size, + ) + if use_te: + transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec( + num_experts=num_moe_experts, moe_grouped_gemm=grouped_gemm + ) + else: + transformer_layer_spec = get_gpt_layer_local_spec( + num_experts=num_moe_experts, moe_grouped_gemm=grouped_gemm + ) + moe_layer = MoELayer( + self.transformer_config, transformer_layer_spec.submodules.mlp.submodules + ) + moe_layer.cuda() + config = moe_layer.config + + assert ( + moe_layer.shared_experts.linear_fc1.weight.shape[1] == config.hidden_size + ), "Shared expert computation has to happen in hidden dimension." + assert ( + moe_layer.shared_experts.linear_fc2.weight.shape[0] == config.hidden_size + ), "Shared expert computation has to happen in hidden dimension." 
+ if grouped_gemm: + for i in range(num_moe_experts): + fc1_weight = getattr(moe_layer.experts.linear_fc1, f"weight{i}") + fc2_weight = getattr(moe_layer.experts.linear_fc2, f"weight{i}") + assert ( + fc1_weight.shape[1] == config.moe_latent_size + ), f"Shape mismatch for expert {i} {fc1_weight.shape=}" + assert ( + fc2_weight.shape[0] == config.moe_latent_size + ), f"Shape mismatch for expert {i} {fc2_weight.shape=}" + else: + for i in range(num_moe_experts): + expert = moe_layer.experts.local_experts[i] + assert ( + expert.linear_fc1.weight.shape[1] == config.moe_latent_size + ), f"Shape mismatch for expert {i} {expert.linear_fc1.weight.shape=}" + assert ( + expert.linear_fc2.weight.shape[0] == config.moe_latent_size + ), f"Shape mismatch for expert {i} {expert.linear_fc2.weight.shape=}" + assert ( + moe_layer.router.weight.shape[1] == config.hidden_size + ), "MoE routing has to happen in hidden dimension." + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones((32, 2, config.hidden_size)) + hidden_states = hidden_states.cuda() + output, _ = moe_layer(hidden_states) + assert output.shape[2] == config.hidden_size + + Utils.destroy_model_parallel() From 002827372080437b0a6896d597eaf15cd17d1f79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 3 Feb 2026 15:01:09 +0100 Subject: [PATCH 035/231] ci: Enable unit tests on merge-queue (#3186) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- ...k_api_backwards_compatibility_workflow.yml | 276 ------------------ .github/workflows/cicd-main.yml | 2 - 2 files changed, 278 deletions(-) delete mode 100644 .github/workflows/check_api_backwards_compatibility_workflow.yml diff --git a/.github/workflows/check_api_backwards_compatibility_workflow.yml b/.github/workflows/check_api_backwards_compatibility_workflow.yml deleted file mode 100644 index 44340bdedc5..00000000000 --- a/.github/workflows/check_api_backwards_compatibility_workflow.yml
+++ /dev/null @@ -1,276 +0,0 @@ -# Temporarily disable this check until we can enforce it on PRs -# -# name: API Compatibility Check - -# on: -# push: -# branches: -# - dev -# - main -# - 'pull-request/[0-9]+' -# - 'deploy-release/*' -# merge_group: -# types: [checks_requested] - -# # Allow manual trigger -# workflow_dispatch: -# inputs: -# baseline: -# description: 'Baseline git reference (tag/branch/commit)' -# required: true - -# jobs: -# pre-flight: -# name: Pre-flight check -# runs-on: ubuntu-latest -# outputs: -# should_skip: ${{ steps.check_files.outputs.should_skip }} -# steps: -# - name: Checkout code -# uses: actions/checkout@v4 -# with: -# fetch-depth: 0 - -# - name: Check if relevant files changed -# id: check_files -# run: | -# # For manual triggers, never skip -# if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then -# echo "should_skip=false" >> $GITHUB_OUTPUT -# echo "Manual trigger - will run compatibility check" -# exit 0 -# fi - -# # Determine base SHA based on event type -# if [ "${{ github.event_name }}" == "merge_group" ]; then -# BASE_SHA="${{ github.event.merge_group.base_sha }}" -# echo "Merge group event - comparing against base: $BASE_SHA" -# else -# # For push events, use merge-base to find common ancestor -# # This ensures we only detect changes actually made in this PR branch, -# # not changes that happened in main after the branch was created -# BASE_SHA=$(git merge-base origin/main HEAD 2>/dev/null || echo "") -# if [ -z "$BASE_SHA" ]; then -# # Fallback for pull-request/* branches targeting dev -# BASE_SHA=$(git merge-base origin/dev HEAD 2>/dev/null || echo "") -# fi -# echo "Push event - comparing against merge-base: $BASE_SHA" -# fi - -# if [ -z "$BASE_SHA" ]; then -# echo "Could not determine base SHA - will run compatibility check" -# echo "should_skip=false" >> $GITHUB_OUTPUT -# exit 0 -# fi - -# # Check for changes in megatron/core Python files (excluding tests and legacy) -# # Note: Using both *.py and **/*.py to 
match files at root and in subdirectories -# CHANGED_FILES=$(git diff --name-only "$BASE_SHA" HEAD -- \ -# 'megatron/core/*.py' \ -# 'megatron/core/**/*.py' \ -# ':!megatron/core/tests/**' \ -# ':!megatron/legacy/**' 2>/dev/null || echo "") - -# if [ -z "$CHANGED_FILES" ]; then -# echo "should_skip=true" >> $GITHUB_OUTPUT -# echo "No relevant megatron/core files changed - will skip compatibility check" -# else -# echo "should_skip=false" >> $GITHUB_OUTPUT -# echo "Relevant files changed:" -# echo "$CHANGED_FILES" -# fi - -# check-compatibility: -# needs: [pre-flight] -# if: needs.pre-flight.outputs.should_skip != 'true' -# name: "OPTIONAL: Check API Backward Compatibility" -# runs-on: ubuntu-latest - -# # ============================================================================ -# # Configuration Parameters (modify here) -# # ============================================================================ -# env: -# # Default baseline for automatic PR checks -# # Can be: branch name (e.g., 'main'), commit hash, or tag -# # Will be resolved to commit hash during execution -# DEFAULT_BASELINE: '5ab481cb45efc72add12f8ba0378e849b3d2bc50' -# # Tag pattern for auto-detection (e.g., 'core_r*', 'core_v*') -# TAG_PATTERN: 'core_v*' -# # Tag regex filter (e.g., '^core_v[0-9]+\.[0-9]+\.[0-9]+$' for stable versions only) -# TAG_REGEX_FILTER: '^core_v[0-9]+\.[0-9]+\.[0-9]+$' -# # ============================================================================ - -# steps: -# - name: Checkout code -# uses: actions/checkout@v4 -# with: -# fetch-depth: 0 # Need full history to access baseline ref - -# - name: Set up Python -# uses: actions/setup-python@v5 -# with: -# python-version: '3.12' - -# - name: Install griffe -# run: | -# python -m pip install --upgrade pip -# python -m pip install griffe -# python -c "import griffe; print('Griffe installed successfully')" -# python -c "from griffe import Object; print('Object import successful')" || echo "Object import from griffe failed" -# 
python -c "from griffe.dataclasses import Object; print('Object import from dataclasses successful')" || echo "Object import from dataclasses failed" - -# - name: Determine baseline reference -# id: baseline -# run: | -# if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then -# # Use manually specified baseline (branch, tag, or commit hash) -# BASELINE_REF="${{ github.event.inputs.baseline }}" -# else -# # Use the configured default baseline -# BASELINE_REF="${{ env.DEFAULT_BASELINE }}" - -# # Uncomment below to auto-detect from tags instead: -# # BASELINE_REF=$(git tag -l '${{ env.TAG_PATTERN }}' | grep -E '${{ env.TAG_REGEX_FILTER }}' | sort -V | tail -1) -# # if [ -z "$BASELINE_REF" ]; then -# # echo "Warning: No tags matching pattern found. Using default: ${{ env.DEFAULT_BASELINE }}" >&2 -# # BASELINE_REF="${{ env.DEFAULT_BASELINE }}" -# # fi -# fi - -# # Resolve baseline to commit hash (works for branches, tags, or commit hashes) -# BASELINE_HASH=$(git rev-parse "$BASELINE_REF") - -# echo "baseline=$BASELINE_HASH" >> $GITHUB_OUTPUT -# echo "Using baseline: $BASELINE_REF (resolved to commit: $BASELINE_HASH)" - -# - name: Run compatibility check -# id: compat_check -# run: | -# # Save output to file for later display -# python scripts/check_api_backwards_compatibility.py \ -# --baseline ${{ steps.baseline.outputs.baseline }} \ -# --verbose 2>&1 | tee compat_check_output.txt - -# # Capture exit code -# EXIT_CODE=${PIPESTATUS[0]} -# echo "exit_code=$EXIT_CODE" >> $GITHUB_OUTPUT -# exit $EXIT_CODE -# continue-on-error: true - -# - name: Fail job if breaking changes detected -# if: steps.compat_check.outcome == 'failure' -# run: | -# echo "" -# echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" -# echo "🔍 WHAT IS THIS CHECK?" 
-# echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" -# echo "" -# echo "This check ensures that changes to Megatron Core's public API do not" -# echo "break backward compatibility for users. It compares your PR against" -# echo "the latest stable release to detect breaking changes in:" -# echo "" -# echo " • Function signatures (parameters, order, types)" -# echo " • Class structures and methods" -# echo " • Return types and public interfaces" -# echo "" -# echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" -# echo "🛠️ HOW TO FIX THIS" -# echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" -# echo "" -# echo "Choose ONE of these resolution strategies:" -# echo "" -# echo "1️⃣ REVERT THE BREAKING CHANGE (Recommended)" -# echo " → Modify your code to preserve backward compatibility" -# echo " → Add new parameters as optional (with defaults)" -# echo " → Keep existing parameters in the same order" -# echo "" -# echo "2️⃣ MARK AS INTERNAL API (If this is internal code)" -# echo " → Add @internal_api decorator from megatron.core.utils" -# echo "" -# echo " Example (for classes):" -# echo " from megatron.core.utils import internal_api" -# echo "" -# echo " @internal_api" -# echo " class ExperimentalFeature:" -# echo " pass" -# echo "" -# echo " Example (for functions):" -# echo " from megatron.core.utils import internal_api" -# echo "" -# echo " @internal_api" -# echo " def internal_helper_function():" -# echo " pass" -# echo "" -# echo "3️⃣ MARK AS EXPERIMENTAL API (If this is experimental code)" -# echo " → Add @experimental_api decorator from megatron.core.utils" -# echo "" -# echo " Example:" -# echo " from megatron.core.utils import experimental_api" -# echo "" -# echo " @experimental_api" -# echo " class ExperimentalFeature:" -# echo " pass" -# echo "" -# echo "4️⃣ USE DEPRECATION (For gradual API changes)" -# echo " → Add @deprecated decorator for transition period" -# echo " → 
Example:" -# echo " from megatron.core.utils import deprecated" -# echo "" -# echo " @deprecated(version='1.0', removal_version='2.0'," -# echo " alternative='new_function')" -# echo " def old_function():" -# echo " pass" -# echo "" -# echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" -# echo "📋 BREAKING CHANGES DETECTED" -# echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" -# echo "" -# cat compat_check_output.txt -# echo "" -# echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" -# echo "📚 MORE INFORMATION" -# echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" -# echo "" -# echo "📖 Full documentation: docs/api-backwards-compatibility-check.md" -# echo "🔧 Checker script: scripts/check_api_backwards_compatibility.py" -# echo "❓ Questions? Check the docs or ask in #megatron-core" -# echo "" - -# echo "::error::Breaking API changes detected. Please review the output above and choose a resolution strategy." -# exit 1 - -# - name: Success message -# if: steps.compat_check.outcome == 'success' -# run: | -# echo "::notice::✅ No breaking API changes detected!" 
- -# api-backward-compatibility-summary: -# needs: [pre-flight, check-compatibility] -# runs-on: ubuntu-latest -# name: "OPTIONAL: API Backward Compatibility Check Summary" -# if: always() && !cancelled() -# steps: -# - name: Checkout -# uses: actions/checkout@v4 - -# - name: Validate workflow result -# shell: bash -x -e -u -o pipefail {0} -# env: -# GH_TOKEN: ${{ github.token }} -# SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.should_skip == 'true' }} -# run: | -# FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success" and .name != "OPTIONAL: API Backward Compatibility Check Summary")] | length') || echo 0 - -# if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then -# if [ "$SKIPPING_IS_ALLOWED" == "true" ]; then -# echo "✅ Compatibility check was skipped (no relevant files changed)" -# else -# echo "✅ All checks passed successfully" -# fi -# exit 0 -# else -# echo "❌ Found $FAILED_JOBS failed job(s)" -# gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success" and .name != "OPTIONAL: API Backward Compatibility Check Summary") | .name' -# exit 1 -# fi - diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 16e2051e4e2..a528c921e54 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -199,7 +199,6 @@ jobs: || needs.pre-flight.outputs.is_ci_workload == 'true' || needs.pre-flight.outputs.force_run_all == 'true' ) - && needs.pre-flight.outputs.is_merge_group == 'false' && !cancelled() steps: - name: Get PR info @@ -368,7 +367,6 @@ jobs: || needs.pre-flight.outputs.is_ci_workload == 'true' || needs.pre-flight.outputs.force_run_all == 'true' ) - && needs.pre-flight.outputs.is_merge_group == 'false' && !cancelled() env: PIP_DISABLE_PIP_VERSION_CHECK: 1 From 94c9eae7d03f2d682cbcddf045912432ca8a8449 Mon Sep 17 00:00:00 2001 From: mathemakitten Date: Tue, 3 
Feb 2026 10:48:46 -0500 Subject: [PATCH 036/231] Fix seq pack flag in `get_logprobs` (#3206) --- megatron/rl/rl_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/rl/rl_utils.py b/megatron/rl/rl_utils.py index 3058db78f41..7194303bd2c 100644 --- a/megatron/rl/rl_utils.py +++ b/megatron/rl/rl_utils.py @@ -1003,7 +1003,7 @@ def logprobs_forward_step(data_iterator, model, is_correction, packing_context=N b_trajs.cuda(), b_posids.cuda(), no_grad=True, - sequence_packing=b_packed_seq_params is not None, + sequence_packing=packing_context is not None, packed_seq_params=b_packed_seq_params, ), None, From b477d12f1e24b06d2a0884e310837e87f698f212 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 3 Feb 2026 16:49:10 +0100 Subject: [PATCH 037/231] ci(fix): Parse unit tests in merge-queue (#3224) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/cicd-main.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index a528c921e54..acc8abba9f1 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -336,7 +336,6 @@ jobs: || needs.pre-flight.outputs.is_ci_workload == 'true' || needs.pre-flight.outputs.force_run_all == 'true' ) - && needs.pre-flight.outputs.is_merge_group == 'false' && !cancelled() steps: - name: Checkout From 1a61b77de408c9349e3a8c2fe00a20b4315c59e7 Mon Sep 17 00:00:00 2001 From: Li Tao Date: Wed, 4 Feb 2026 02:48:36 +0800 Subject: [PATCH 038/231] Fix TE 2.12 AllGather CI failure (#3101) Co-authored-by: Jared Casper <155158+jaredcasper@users.noreply.github.com> --- megatron/core/transformer/moe/token_dispatcher.py | 2 +- tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py | 1 - tests/unit_tests/transformer/moe/test_token_dispatcher.py | 2 -- 3 files changed, 1 insertion(+), 4 deletions(-) diff --git 
a/megatron/core/transformer/moe/token_dispatcher.py b/megatron/core/transformer/moe/token_dispatcher.py index f2e26c63cf5..1770f9acd41 100644 --- a/megatron/core/transformer/moe/token_dispatcher.py +++ b/megatron/core/transformer/moe/token_dispatcher.py @@ -297,7 +297,7 @@ def dispatch_postprocess(self, hidden_states, probs): (permuted_local_hidden_states, _, self.reversed_local_input_permutation_mapping) = permute( hidden_states, self.local_map, - num_out_tokens=tokens_per_expert.sum(), + num_out_tokens=tokens_per_expert.sum().item(), fused=self.config.moe_permute_fusion, ) diff --git a/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py b/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py index aad1fcaca5f..e20e35b84e1 100644 --- a/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py +++ b/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py @@ -95,7 +95,6 @@ def test_capacity_padding_forward_backward(self, tp_size, ep_size, permute_fusio ) container.dispatcher_drop_and_pad_test() - @pytest.mark.flaky_in_dev @pytest.mark.skipif( not is_te_min_version("1.7.0"), reason="TE 1.7.0 is required for MoE with FP8." 
) diff --git a/tests/unit_tests/transformer/moe/test_token_dispatcher.py b/tests/unit_tests/transformer/moe/test_token_dispatcher.py index 24617952b94..fd6fb7f6d09 100644 --- a/tests/unit_tests/transformer/moe/test_token_dispatcher.py +++ b/tests/unit_tests/transformer/moe/test_token_dispatcher.py @@ -364,7 +364,6 @@ def setup_method(self, method): def teardown_method(self, method): Utils.destroy_model_parallel() - @pytest.mark.flaky_in_dev @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") @pytest.mark.internal @pytest.mark.parametrize("tp_size,ep_size", [(8, 1), (1, 8), (2, 4), (1, 1)]) @@ -383,7 +382,6 @@ def test_forward_backward(self, tp_size, ep_size, permute_fusion): container.dispatcher_dropless_test() - @pytest.mark.flaky_in_dev @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") @pytest.mark.internal @pytest.mark.parametrize("permute_fusion", permute_fusion_params) From 79e7bfe158b9dc7865327d8ac6d7e62b7385a540 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 3 Feb 2026 22:20:52 +0100 Subject: [PATCH 039/231] ci(hotfix): Pin uv (#3233) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/actions/action.yml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/actions/action.yml b/.github/actions/action.yml index f3e42e5843d..895b6863bef 100644 --- a/.github/actions/action.yml +++ b/.github/actions/action.yml @@ -117,8 +117,10 @@ runs: export PYTHONPATH=$(pwd) export NEMORUN_HOME=$(pwd) export NCCL_DEBUG=INFO - pip install --no-cache-dir uv - uv sync --only-group test + pip install --no-cache-dir "uv!=0.9.29" + uv venv .venv + uv cache clean + uv sync --no-cache --only-group test uv run python tests/test_utils/python_scripts/launch_nemo_run_workload.py \ --scope unit-tests \ --model unit-tests \ @@ -197,8 +199,10 @@ runs: export PYTHONPATH=$(pwd) export 
NEMORUN_HOME=$(pwd) - pip install --no-cache-dir uv - uv sync --only-group test + pip install --no-cache-dir "uv!=0.9.29" + uv venv .venv + uv cache clean + uv sync --no-cache --only-group test uv run python tests/test_utils/python_scripts/launch_nemo_run_workload.py \ ${ARGS[@]} \ --model ${{ inputs.model }} \ From 18d69f130bb47c78a589e333d2510cae3d0919ee Mon Sep 17 00:00:00 2001 From: mathemakitten Date: Tue, 3 Feb 2026 16:46:44 -0500 Subject: [PATCH 040/231] Add a unit test to check that RL `get_logprobs` will reuse training cudagraphed forward pass (#3209) --- tests/unit_tests/rl/test_rl_utils.py | 143 ++++++++++++++++++++++++++- 1 file changed, 142 insertions(+), 1 deletion(-) diff --git a/tests/unit_tests/rl/test_rl_utils.py b/tests/unit_tests/rl/test_rl_utils.py index cff62d40f0e..0570a0cf4bc 100644 --- a/tests/unit_tests/rl/test_rl_utils.py +++ b/tests/unit_tests/rl/test_rl_utils.py @@ -16,10 +16,21 @@ from megatron.core.optimizer import OptimizerConfig, get_megatron_optimizer from megatron.core.pipeline_parallel.utils import is_pp_last_stage from megatron.core.process_groups_config import ProcessGroupCollection -from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.tensor_parallel.random import ( + initialize_rng_tracker, + model_parallel_cuda_manual_seed, +) from megatron.core.transformer import TransformerConfig +from megatron.core.transformer.cuda_graphs import ( + CudaGraphManager, + _CudagraphGlobalRecord, + create_cudagraphs, + delete_cuda_graphs, +) +from megatron.core.transformer.module import Float16Module from megatron.rl import rl_utils from megatron.rl.agent.api import TokenRollout +from megatron.rl.sequence_packing_utils import get_default_packed_seq_params from megatron.training.arguments import parse_args, validate_args from megatron.training.global_vars import destroy_global_vars, set_global_variables from tests.unit_tests.test_utilities import Utils @@ -82,9 +93,12 @@ def 
initialize_model_parallel(request, monkeypatch): monkeypatch.setenv("WANDB_MODE", "disabled") monkeypatch.setenv("LOG_TO_WANDB", "false") + initialize_rng_tracker(use_te_rng_tracker=True, force_reset=True) + tp, pp = request.param world_size = Utils.world_size Utils.initialize_model_parallel(tensor_model_parallel_size=tp, pipeline_model_parallel_size=pp) + model_parallel_cuda_manual_seed(123) dp = world_size // (tp * pp) yield world_size, dp, tp, pp Utils.destroy_model_parallel() @@ -654,3 +668,130 @@ def get_optimizer_state_devices(): f"Expected GPU memory to increase after restore. " f"After offload: {memory_after_offload}, After restore: {memory_after_restore}" ) + + @pytest.mark.parametrize( + "initialize_model_parallel", + [pytest.param((1, 1), id="tp1-pp1")], + indirect=["initialize_model_parallel"], + ) + def test_get_logprobs_cuda_graphs(self, initialize_model_parallel): + """Test that get_logprobs reuses CUDA graphs created during training forward pass. + + This test verifies that rl_utils.get_logprobs can reuse CUDA graphs by: + 1. Running a training-style forward pass on some model to record CUDA graph runners. + 2. Creating the CUDA graphs. + 3. Running `get_logprobs` to verify it reuses the same forward graph from training. 
+ """ + + num_layers = 2 + + world_size, dp, tp, pp = initialize_model_parallel + self.create_test_args( + tensor_model_parallel_size=tp, + pipeline_model_parallel_size=pp, + rl_training_cuda_graphs=True, + cuda_graph_impl="local", + bf16=True, + rl_sequence_packing_max_sequences_per_bin=4, + ) + + # Create a model with training CUDA graphs enabled + transformer_config = TransformerConfig( + num_layers=num_layers, + hidden_size=64, + num_attention_heads=4, + use_cpu_initialization=True, + cuda_graph_impl="local", + bf16=True, + ) + model = GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_with_transformer_engine_spec(), + vocab_size=256, + max_sequence_length=32, + ).cuda() + + # Wrap in Float16Module so it accepts fp32_output argument from get_logprobs + wrapped_model = Float16Module(transformer_config, model) + + # Create test inputs (batch_size=1 required for thd format with sequence packing) + batch_size = 1 + seq_length = 16 + tokens = torch.randint(0, 256, (batch_size, seq_length), dtype=torch.long).cuda() + position_ids = torch.arange(seq_length).unsqueeze(0).expand(batch_size, -1).cuda() + + # Create packed_seq_params for dummy data + packed_seq_params = get_default_packed_seq_params( + seq_length=seq_length, max_sequences_per_bin=4, device=tokens.device + ) + + # Run a single training forward pass to record cudagraphs + output = wrapped_model( + tokens, + position_ids, + attention_mask=None, + packed_seq_params=packed_seq_params, + runtime_gather_output=True, + fp32_output=False, + ) + + # Run backward to reset runner status from BWD_READY back to FWD_READY + # This is needed because get_logprobs runs in no_grad mode and expects FWD_READY + loss = output.sum() + loss.backward() + + # Collect all CudaGraphManager instances and their runners + cudagraph_managers = [] + for module in wrapped_model.modules(): + if hasattr(module, 'cudagraph_manager') and module.cudagraph_manager is not None: + 
cudagraph_managers.append(module.cudagraph_manager) + + # Record runner count before creating graphs + runners_before = {id(mgr): len(mgr.cudagraph_runners) for mgr in cudagraph_managers} + + create_cudagraphs() + + # Verify that each runner has a fwd_graph created + for mgr in cudagraph_managers: + for runner in mgr.cudagraph_runners: + assert runner.fwd_graph is not None, ( + f"Expected runner to have fwd_graph created after create_cudagraphs(), " + f"but fwd_graph is None" + ) + + # Now test `get_logprobs`; this should reuse the existing CUDA graphs + # We do not pass packed_seq_params; it should be created within `get_logprobs` + logprobs = rl_utils.get_logprobs( + wrapped_model, tokens, position_ids=position_ids, sequence_packing=True + ) + + # Verify that no new runners were created and graph was reused + runners_after = {id(mgr): len(mgr.cudagraph_runners) for mgr in cudagraph_managers} + for mgr_id, count_before in runners_before.items(): + count_after = runners_after[mgr_id] + assert count_after == count_before, ( + f"Expected runner count to remain {count_before} after `get_logprobs`, " + f"but got {count_after}. `get_logprobs` should not create new runners." 
+ ) + + # Verify outputs are valid + assert output is not None, "Training forward pass should return valid output" + assert logprobs is not None, "get_logprobs should return valid output" + + # Destroy all captured graphs deterministically + for l in model.decoder.layers: + for runner in getattr(l.cudagraph_manager, "cudagraph_runners", []): + # Safely delete both graphs if present + if hasattr(runner, "fwd_graph"): + del runner.fwd_graph + if hasattr(runner, "bwd_graph"): + del runner.bwd_graph + + # Ensure all pending work is complete and graph destruction runs now + torch.cuda.synchronize() + + _CudagraphGlobalRecord.cudagraph_created = False + _CudagraphGlobalRecord.cudagraph_record = [] + CudaGraphManager.global_mempool = None + CudaGraphManager.fwd_mempools = None + CudaGraphManager.bwd_mempools = None From 27a5f83eb175903e1437c497ad4e363a7bb6ed4c Mon Sep 17 00:00:00 2001 From: mathemakitten Date: Tue, 3 Feb 2026 17:48:42 -0500 Subject: [PATCH 041/231] Do not offload grad buffers when training graphs are enabled (#3231) --- megatron/rl/rl_utils.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/megatron/rl/rl_utils.py b/megatron/rl/rl_utils.py index 7194303bd2c..5a7f9809f1a 100644 --- a/megatron/rl/rl_utils.py +++ b/megatron/rl/rl_utils.py @@ -462,7 +462,11 @@ def get_environment_rollouts( if args.rl_offload_optimizer_during_inference: with nvtx_range("offload-optimizer-state-and-grad-buffers-during-inference"): - model[0].offload_grad_buffers() + if not args.rl_training_cuda_graphs: + model[0].offload_grad_buffers() + else: + logger.warning( + "Gradient buffers will not be offloaded when training cudagraphs are enabled!") optimizer.offload_to_cpu() # If we have seperate training and inference models we to refit weights from the training model to the inference model. 
@@ -1663,7 +1667,11 @@ def megatron_rl_inference_mode( if offload_optimizer_during_inference: with nvtx_range("offload-optimizer-state-and-grad-buffers-before-inference"): - model[0].offload_grad_buffers() + if not args.rl_training_cuda_graphs: + model[0].offload_grad_buffers() + else: + logger.warning( + "Gradient buffers will not be offloaded when training cudagraphs are enabled!") optimizer.offload_to_cpu() # TODO: Remove this if statement once a change to `toggle_cuda_graphs` makes it safe to. From bc2eb9a0eadbb5eab63a0aa97b04d6762c1b0728 Mon Sep 17 00:00:00 2001 From: Parth Mannan <38387286+parthmannan@users.noreply.github.com> Date: Tue, 3 Feb 2026 14:53:53 -0800 Subject: [PATCH 042/231] Fix missing PackedSeqParams import (#3214) --- megatron/core/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/core/utils.py b/megatron/core/utils.py index d7b702f25ec..dac1beb7075 100644 --- a/megatron/core/utils.py +++ b/megatron/core/utils.py @@ -46,6 +46,7 @@ from megatron.core import config from megatron.core._rank_utils import log_single_rank from megatron.core.package_info import __version__ as mcore_version +from megatron.core.packed_seq_params import PackedSeqParams try: from torch.distributed._tensor import DTensor From 1fdb29f763c9ce76533a74bcb33c503247b1c3b2 Mon Sep 17 00:00:00 2001 From: Keshav Santhanam Date: Tue, 3 Feb 2026 15:10:48 -0800 Subject: [PATCH 043/231] Synchronize the request counts for EP inference with strict matching (#3033) Signed-off-by: Keshav Santhanam --- .../core/inference/batch_dimensions_utils.py | 17 ++++++++-- .../core/inference/engines/dynamic_engine.py | 4 +++ megatron/core/ssm/mamba_block.py | 31 +++++++++++++++++++ megatron/core/ssm/mamba_layer.py | 1 + .../core/transformer/transformer_block.py | 11 +------ .../core/transformer/transformer_layer.py | 17 ---------- 6 files changed, 52 insertions(+), 29 deletions(-) diff --git a/megatron/core/inference/batch_dimensions_utils.py 
b/megatron/core/inference/batch_dimensions_utils.py index e6ea32a6df8..a2f10c6d11b 100644 --- a/megatron/core/inference/batch_dimensions_utils.py +++ b/megatron/core/inference/batch_dimensions_utils.py @@ -183,6 +183,8 @@ def adjust_batch_dims_for_expert_parallelism( local_batch_dims.token_count, int(is_non_decode), int(has_explicit_chunked_prefill_req), + local_batch_dims.prefill_req_count, + local_batch_dims.decode_req_count, ], dtype=torch.int32, device=torch.cuda.current_device(), @@ -208,10 +210,21 @@ def adjust_batch_dims_for_expert_parallelism( return None # indicate no match, run in eager mode assert not has_explicit_chunked_prefill_req + + # If strict matching is enabled, we sync the request counts across EP ranks + # to ensure the graph captures the maximum needed capacity. + # TODO(ksanthanam): Add functional test for this scenario + adjusted_prefill_req_count = ( + int(sync_tensor[3].item()) if strict else local_batch_dims.prefill_req_count + ) + adjusted_decode_req_count = ( + int(sync_tensor[4].item()) if strict else local_batch_dims.decode_req_count + ) + adjusted_batch_dim = InferenceBatchDimensions( token_count=int(sync_tensor[0].item()), - prefill_req_count=local_batch_dims.prefill_req_count, - decode_req_count=local_batch_dims.decode_req_count, + prefill_req_count=adjusted_prefill_req_count, + decode_req_count=adjusted_decode_req_count, has_explicit_chunked_prefill_req=False, ) return adjusted_batch_dim diff --git a/megatron/core/inference/engines/dynamic_engine.py b/megatron/core/inference/engines/dynamic_engine.py index a09d15ae20b..ce1802c9988 100644 --- a/megatron/core/inference/engines/dynamic_engine.py +++ b/megatron/core/inference/engines/dynamic_engine.py @@ -1706,7 +1706,11 @@ async def run_engine_with_coordinator( if ep_group_has_work and local_pending_requests == 0: # run dummy forward pass if EP group as a whole has work, # but this rank does not have any work. 
+ self.step_start_event.record() self.controller.dummy_forward() + self.step_end_event.record() + self.step_end_event.synchronize() + self.step_count += 1 continue # 3. No work in EP group diff --git a/megatron/core/ssm/mamba_block.py b/megatron/core/ssm/mamba_block.py index ef41faae143..3d684b82dce 100644 --- a/megatron/core/ssm/mamba_block.py +++ b/megatron/core/ssm/mamba_block.py @@ -202,6 +202,37 @@ def mamba_state_shapes_per_request(self) -> Optional[Tuple[Tuple[int], Tuple[int return layer.mamba_state_shapes_per_request() return None + def _should_call_local_cudagraph(self, *args, **kwargs): + """ + Check if we should call the local cudagraph path. + """ + if not self.training and ( + hasattr(self, 'cudagraph_manager') + and kwargs['attention_mask'] is None + and ( + kwargs.get('inference_context') is not None + or kwargs.get('inference_params') is not None + ) + and CudaGraphScope.full_iteration in self.config.cuda_graph_scope + ): + if kwargs['inference_context'].is_static_batching(): + using_cuda_graph = kwargs['inference_context'].is_decode_only() + else: + using_cuda_graph = kwargs['inference_context'].using_cuda_graph_this_step() + + if using_cuda_graph: + return True + return False + + def __call__(self, *args, **kwargs): + if self._should_call_local_cudagraph(*args, **kwargs): + kwargs['hidden_states'] = ( + kwargs['hidden_states'].unwrap() + if isinstance(kwargs['hidden_states'], WrappedTensor) + else kwargs['hidden_states'] + ) + return super().__call__(*args, **kwargs) + def forward( self, hidden_states: Union[Tensor, WrappedTensor], diff --git a/megatron/core/ssm/mamba_layer.py b/megatron/core/ssm/mamba_layer.py index ac6e8b5bf40..2494126d3ab 100644 --- a/megatron/core/ssm/mamba_layer.py +++ b/megatron/core/ssm/mamba_layer.py @@ -192,6 +192,7 @@ def _should_call_local_cudagraph(self, *args, **kwargs): hasattr(self, 'cudagraph_manager') and kwargs.get('attention_mask') is None and kwargs.get('inference_context') is not None + and 
CudaGraphScope.full_iteration not in self.config.cuda_graph_scope ): using_cuda_graph = kwargs['inference_context'].using_cuda_graph_this_step() return using_cuda_graph diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py index 831b5546d53..73a724c3e91 100755 --- a/megatron/core/transformer/transformer_block.py +++ b/megatron/core/transformer/transformer_block.py @@ -606,16 +606,7 @@ def __call__(self, *args, **kwargs): if isinstance(kwargs['hidden_states'], WrappedTensor) else kwargs['hidden_states'] ) - # dynamic_inference_decode_only is not a real argument to forward, it is only used - # to differentiate the cuda graph used for decode from the one used for non-decode - # inference. - dynamic_inference_decode_only = kwargs['inference_context'].is_decode_only() - # cudagraphmanager returns a singleton tuple, whereas the - # normal forward returns a tensor, therefore we need - # to extract the tensor from the tuple - return super().__call__( - *args, dynamic_inference_decode_only=dynamic_inference_decode_only, **kwargs - )[0] + return super().__call__(*args, **kwargs)[0] return super().__call__(*args, **kwargs) def forward( diff --git a/megatron/core/transformer/transformer_layer.py b/megatron/core/transformer/transformer_layer.py index a5eaec92866..9a3b69e8a77 100644 --- a/megatron/core/transformer/transformer_layer.py +++ b/megatron/core/transformer/transformer_layer.py @@ -506,10 +506,6 @@ def forward(self, *args, **kwargs): This method calls the core computation of a transformer layer, including self-attention, cross-attention (if applicable), and feed-forward operations. 
""" - # Remove 'dynamic_inference_decode_only' from kwargs if present - # this is only used to uniquely identify decode and non-decode cuda graph - # runners in the cuda graph manager - kwargs.pop("dynamic_inference_decode_only", None) hidden_states, context = self._forward_attention(*args, **kwargs) output = self._forward_mlp( hidden_states, @@ -1203,19 +1199,6 @@ def _should_call_local_cudagraph(self, *args, **kwargs): return True return False - def __call__(self, *args, **kwargs): - if self._should_call_local_cudagraph(*args, **kwargs): - # Inference mode. - if kwargs.get('inference_context') is not None: - # dynamic_inference_decode_only is not a real argument to forward, it is only used - # to differentiate the cuda graph used for decode from the one used for non-decode - # inference. - kwargs["dynamic_inference_decode_only"] = kwargs[ - 'inference_context' - ].is_decode_only() - - return super().__call__(*args, **kwargs) - def get_layer_norm_weights(self): """ Get the weights of all layernorms (attention and MLP) in the transformer layer. 
From 4c48248c91fee4549d1aa256a4efecb791156a92 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Wed, 4 Feb 2026 00:12:42 +0000 Subject: [PATCH 044/231] Update copy-pr-bot.yaml [skip ci] --- .github/copy-pr-bot.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml index 72a5b915ecc..8998eabe3a2 100644 --- a/.github/copy-pr-bot.yaml +++ b/.github/copy-pr-bot.yaml @@ -1,4 +1,4 @@ enabled: true auto_sync_draft: false auto_sync_ready: true -trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "Phlip79", "QiZhangNV", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "ahmadki", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cspades", "cuichenx", "deepakn94", "dimapihtar", "dingqingy-nv", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "frsun-nvda", "gautham-kollu", "gdengk", "guyueh1", "hxbai", "jalbericiola", "janEbert", "jaredcasper", "jenchen13", "jiemingz", "jingqiny-99", "jkamalu", "jon-barker", "jstjohn", "kanz-nv", "kevalmorabia97", "ko3n1g", "kunlunl", "kvareddy", "kwyss-nvidia", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mehraakash", "mkhona-nvidia", "parthmannan", "prajwal1210", "pthombre", "rogerwaleffe", "sanandaraj5597", "sancha", "santhnm2", "sbak5", "shanmugamr1992", "shengf-nv", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "sudhakarsingh27", "tdene", "theothermike", "thomasdhc", "trintamaki", "tylerpoon", "wdykas", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yuzhongw-nvidia", "zhongbozhu"] +trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "Phlip79", "QiZhangNV", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "ahmadki", "aklife97", 
"ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cspades", "cuichenx", "deepakn94", "dimapihtar", "dingqingy-nv", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "frsun-nvda", "gautham-kollu", "gdengk", "guyueh1", "hxbai", "ilml", "jalbericiola", "janEbert", "jaredcasper", "jenchen13", "jiemingz", "jingqiny-99", "jkamalu", "jon-barker", "jstjohn", "kanz-nv", "kevalmorabia97", "ko3n1g", "kunlunl", "kvareddy", "kwyss-nvidia", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mehraakash", "mkhona-nvidia", "parthmannan", "prajwal1210", "pthombre", "rogerwaleffe", "sanandaraj5597", "sancha", "santhnm2", "sbak5", "shanmugamr1992", "shengf-nv", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "sudhakarsingh27", "tdene", "theothermike", "thomasdhc", "trintamaki", "tylerpoon", "wdykas", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yuzhongw-nvidia", "zhongbozhu"] From 9050d5b4ac3a460a359afb785cad0f9b40c28a3d Mon Sep 17 00:00:00 2001 From: Teodor-Dumitru Ene <34819528+tdene@users.noreply.github.com> Date: Tue, 3 Feb 2026 15:39:23 -0800 Subject: [PATCH 045/231] Fix coordinator address collision check in flask (#3208) --- .../dynamic_text_gen_server/flask_server.py | 4 ++-- tools/run_dynamic_text_generation_server.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/megatron/core/inference/text_generation_server/dynamic_text_gen_server/flask_server.py b/megatron/core/inference/text_generation_server/dynamic_text_gen_server/flask_server.py index 2b0469b340a..1701ff63c36 100644 --- a/megatron/core/inference/text_generation_server/dynamic_text_gen_server/flask_server.py +++ b/megatron/core/inference/text_generation_server/dynamic_text_gen_server/flask_server.py @@ -33,7 +33,7 @@ def temp_log_level(level, logger=None): @trace_async_exceptions -async def run_flask_server(coordinator_port: int, tokenizer, rank: int, flask_port: int): 
+async def run_flask_server(coordinator_addr: str, tokenizer, rank: int, flask_port: int): """Initializes and runs the async Flask server.""" if not HAS_FLASK: raise RuntimeError(f"Flask not available") @@ -44,7 +44,7 @@ async def run_flask_server(coordinator_port: int, tokenizer, rank: int, flask_po logger.warning(f"Could not get hostname: {e}") hostname = "0.0.0.0" - inference_client = InferenceClient(coordinator_port) + inference_client = InferenceClient(coordinator_addr) await inference_client.start() logger.info(f"Rank {rank}: InferenceClient connected.") diff --git a/tools/run_dynamic_text_generation_server.py b/tools/run_dynamic_text_generation_server.py index 615073b8fd0..a6b1b5c8398 100644 --- a/tools/run_dynamic_text_generation_server.py +++ b/tools/run_dynamic_text_generation_server.py @@ -42,7 +42,7 @@ async def run_text_generation_server( rank = torch.distributed.get_rank() - await engine.start_listening_to_data_parallel_coordinator( + coordinator_addr = await engine.start_listening_to_data_parallel_coordinator( inference_coordinator_port=coordinator_port, launch_inference_coordinator=True ) @@ -50,7 +50,7 @@ async def run_text_generation_server( if rank == 0: server_task = asyncio.create_task( run_flask_server( - coordinator_port=coordinator_port, + coordinator_addr=coordinator_addr, tokenizer=engine.controller.tokenizer, rank=rank, flask_port=flask_port, From e02344ec4c28744a4cd17c7ab7b3b5a9583d33a3 Mon Sep 17 00:00:00 2001 From: Teodor-Dumitru Ene <34819528+tdene@users.noreply.github.com> Date: Tue, 3 Feb 2026 15:57:55 -0800 Subject: [PATCH 046/231] Do not let requests fail silently inside inference engine (#3228) --- megatron/core/inference/engines/dynamic_engine.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/megatron/core/inference/engines/dynamic_engine.py b/megatron/core/inference/engines/dynamic_engine.py index ce1802c9988..29257b6f982 100644 --- a/megatron/core/inference/engines/dynamic_engine.py +++ 
b/megatron/core/inference/engines/dynamic_engine.py @@ -784,6 +784,10 @@ def _add_request( self.waiting_request_ids.append(request_id) else: self.failed_request_ids.append(request_id) + if self.rank == 0: + warnings.warn( + f"Request {request_id} failed to be added to the engine due to errors." + ) return self.requests[request_id].future From cd5ed741036da8c6818cd932a34e5445fef5cdf0 Mon Sep 17 00:00:00 2001 From: wdykas <73254672+wdykas@users.noreply.github.com> Date: Tue, 3 Feb 2026 19:25:11 -0500 Subject: [PATCH 047/231] torch saver inference model offload (#3170) --- megatron/rl/rl_utils.py | 55 ++++++++++++++++++++++++++++++---- megatron/training/arguments.py | 26 +++++++++------- megatron/training/training.py | 35 +++++++++++++++++++--- 3 files changed, 97 insertions(+), 19 deletions(-) diff --git a/megatron/rl/rl_utils.py b/megatron/rl/rl_utils.py index 5a7f9809f1a..3ea43103215 100644 --- a/megatron/rl/rl_utils.py +++ b/megatron/rl/rl_utils.py @@ -94,16 +94,61 @@ _GLOBAL_PACKING_CONTEXT = None +# Track whether the inference model is currently paused (offloaded to CPU). +# Model starts on GPU after creation and is used immediately, so starts as False. +_INFERENCE_MODEL_IS_PAUSED = False + + +def _torch_saver_swap_inference_model(*, to_cpu: bool) -> None: + """Swap RL inference model weights between CPU and GPU using torch_memory_saver. + + Uses torch_memory_saver.pause()/resume() to transfer inference model weights + that were allocated within a torch_memory_saver.region() context. + + Args: + to_cpu: If True, move weights to CPU (pause). If False, restore weights to GPU (resume). + """ + global _INFERENCE_MODEL_IS_PAUSED + + if not HAVE_TORCH_MEMORY_SAVER: + raise RuntimeError( + "torch_memory_saver is required for inference model offloading when not using UVM. 
" + "Please install it: pip install torch_memory_saver " + "(see https://github.com/fzyzcjy/torch_memory_saver)" + ) + + if to_cpu: + if not _INFERENCE_MODEL_IS_PAUSED: + torch_memory_saver.pause("rl_inference_model") + _INFERENCE_MODEL_IS_PAUSED = True + print_rank_0("[Rank 0] offloaded RL inference model weights to CPU using torch_memory_saver") + else: + if _INFERENCE_MODEL_IS_PAUSED: + torch_memory_saver.resume("rl_inference_model") + _INFERENCE_MODEL_IS_PAUSED = False + print_rank_0("[Rank 0] restored RL inference model weights to GPU using torch_memory_saver") + + def _maybe_prefetch_separate_inference_model_weights(model_core, *, to_cpu: bool) -> None: - """Prefetch RL *separate inference model* weights to CPU/GPU (UVM-only path). + """Prefetch RL *separate inference model* weights to CPU/GPU. + + Supports two modes: + 1. UVM-based offloading (when --rl-inference-model-unified-memory-level=1) + 2. torch_memory_saver-based offloading (when offloading is enabled but UVM is not) - Gated only by user args; this assumes the separate inference model was allocated with UVM when enabled. + Gated by user args; this assumes the separate inference model was allocated + with UVM or torch_memory_saver when enabled. """ args = get_args() if not args.rl_offload_inference_model_weights_when_idle: return + + # Check for torch_memory_saver path (when offloading is enabled but UVM is not) if args.rl_inference_model_unified_memory_level != 1: + _torch_saver_swap_inference_model(to_cpu=to_cpu) return + + # UVM-based path (when UVM level is 1) device = -1 if to_cpu else int(torch.cuda.current_device()) # Note: include_buffers=False because buffers created with explicit device= in register_buffer() # are not allocated via the UVM mempool and will fail UVM operations. Only parameters are UVM-allocated. 
@@ -1650,10 +1695,10 @@ def megatron_rl_inference_mode( lang_module = model[0].module.module if hasattr(model[0].module, "module") else model[0].module lang_module.eval() - # If this is a separate RL inference model allocated with UVM, ensure weights are resident on GPU - # before any CUDA-graph capture/replay or inference. + # If this is a separate RL inference model with offloading enabled, ensure weights are on GPU + # before any CUDA-graph capture/replay or inference. This is a no-op if already on GPU. + model_core = unwrap_model(model[0]) with nvtx_range("prefetch-inference-model-weights-to-gpu"): - model_core = unwrap_model(model[0]) _maybe_prefetch_separate_inference_model_weights(model_core, to_cpu=False) rotary_module = getattr(lang_module, "rotary_pos_emb", None) diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 5749d20a4ca..28603b02ed5 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -343,13 +343,18 @@ def validate_args(args, defaults={}): assert not (args.rl_partial_rollouts and args.rl_remove_kv_cache_during_training), \ "Cannot use both partial-rollouts and remove-kv-cache-during-training" - assert not ( - args.rl_offload_inference_model_weights_when_idle - and args.rl_inference_model_unified_memory_level != 1 - ), ( - "--rl-offload-inference-model-weights-when-idle requires " - "--rl-inference-model-unified-memory-level=1." - ) + # Validate inference model offloading - requires either UVM or torch_memory_saver + if args.rl_offload_inference_model_weights_when_idle: + if args.rl_inference_model_unified_memory_level != 1: + # Not using UVM, so we need torch_memory_saver + try: + from torch_memory_saver import torch_memory_saver + except ImportError: + raise AssertionError( + "To use --rl-offload-inference-model-weights-when-idle without UVM " + "(--rl-inference-model-unified-memory-level=1), `torch_memory_saver` must be " + "installed. 
See https://github.com/fzyzcjy/torch_memory_saver." + ) # When using different EP sizes for inference and training (EP refit), the legacy # GroupedMLP is not supported. Only SequentialMLP or TEGroupedMLP can be used. @@ -1974,9 +1979,10 @@ def _add_rl_args(parser): required=False, default=False, help=( - 'When using a separate RL inference model with UVM-enabled parameters, prefetch its weights ' - 'to CPU when not doing rollout inference, and prefetch back to GPU right before inference. ' - 'Requires --rl-inference-model-unified-memory-level=1.' + 'When using a separate RL inference model, offload its weights to CPU when not doing rollout ' + 'inference, and restore to GPU right before inference. Works with two backends: ' + '1) UVM (when --rl-inference-model-unified-memory-level=1), or ' + '2) torch_memory_saver (when UVM is not enabled; requires torch_memory_saver to be installed).' ), ) group.add_argument('--refit-method', type=str, default='gloo', diff --git a/megatron/training/training.py b/megatron/training/training.py index 7c394c7b266..563c228367f 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -152,6 +152,13 @@ def set_startup_timestamps(program_start=None, main_entry=None): from megatron.core.inference.unified_memory import create_unified_mempool from megatron.core.resharding.refit import swap_model_weights +try: + from torch_memory_saver import torch_memory_saver + torch_memory_saver.hook_mode = "torch" + HAVE_TORCH_MEMORY_SAVER = True +except ImportError: + HAVE_TORCH_MEMORY_SAVER = False + from megatron.core.pipeline_parallel import get_forward_backward_func from megatron.core.num_microbatches_calculator import ( destroy_num_microbatches_calculator, @@ -965,15 +972,30 @@ def pretrain( # Optionally allocate the RL inference model weights from a unified virtual memory (UVM) # mempool so we can prefetch weights to CPU when idle while keeping CUDA-graph-safe pointers. 
+ # Alternatively, use torch_memory_saver to offload the weights to CPU when idle. uvm_mempool = None uvm_level = args.rl_inference_model_unified_memory_level if uvm_level and uvm_level > 0: uvm_mempool = create_unified_mempool() - mempool_ctx = ( - torch.cuda.use_mem_pool(uvm_mempool) if uvm_mempool is not None else nullcontext() + # Determine which context manager to use for model allocation + # Use torch_memory_saver if offloading is requested but UVM is not enabled + use_torch_saver_for_inference_model = ( + args.rl_offload_inference_model_weights_when_idle + and uvm_level == 0 + and HAVE_TORCH_MEMORY_SAVER ) - with mempool_ctx: + if use_torch_saver_for_inference_model: + # Use torch_memory_saver for offloading - allocate within a tagged region + model_alloc_ctx = torch_memory_saver.region( + tag="rl_inference_model", enable_cpu_backup=True + ) + elif uvm_mempool is not None: + model_alloc_ctx = torch.cuda.use_mem_pool(uvm_mempool) + else: + model_alloc_ctx = nullcontext() + + with model_alloc_ctx: inference_model = get_model( model_provider, model_type, @@ -983,7 +1005,12 @@ def pretrain( ) inference_model[0].eval() - + # Validate: offloading flag requires a separate inference model + if args.rl_offload_inference_model_weights_when_idle and inference_model is None: + raise ValueError( + "--rl-offload-inference-model-weights-when-idle requires a separate inference model. " + "This flag is only useful when doing refit since the weights are shared with the training model." + ) # Data stuff. 
app_metrics['app_build_dataiters_start_time'] = one_logger_utils.get_timestamp_in_ms() From 982ca5d8a1902d55cbc2fd06ae0f21d74ac96cf0 Mon Sep 17 00:00:00 2001 From: Tong Liu Date: Wed, 4 Feb 2026 11:43:56 +0800 Subject: [PATCH 048/231] enable cuda graph ut (#3197) --- tests/unit_tests/transformer/test_cuda_graphs.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/unit_tests/transformer/test_cuda_graphs.py b/tests/unit_tests/transformer/test_cuda_graphs.py index 4696a3ed439..325994cbf89 100644 --- a/tests/unit_tests/transformer/test_cuda_graphs.py +++ b/tests/unit_tests/transformer/test_cuda_graphs.py @@ -1039,10 +1039,6 @@ def test_moe_partial_cudagraph(self, ep_size, moe_dropless_dispatcher, moe_dispa extra_kwargs["moe_token_dispatcher_type"] = "flex" extra_kwargs["moe_flex_dispatcher_backend"] = "deepep" elif moe_dispatcher_type == "hybridep": - pytest.skip( - "Currently, the Hybrid EP is broken. " - "Temporarily skip the test and wait for the fix." - ) if not is_hybrid_ep_available(): pytest.skip("Hybrid EP is not available") extra_kwargs["moe_token_dispatcher_type"] = "flex" @@ -1052,8 +1048,6 @@ def test_moe_partial_cudagraph(self, ep_size, moe_dropless_dispatcher, moe_dispa if not moe_dropless_dispatcher: if moe_dispatcher_type == "deepep": pytest.skip("Deep EP doesn't support drop&pad MoE") - if moe_dispatcher_type == "hybridep" and ep_size == 1: - pytest.skip("Hybrid EP doesn't support drop&pad MoE with ep_size == 1") extra_kwargs["moe_expert_capacity_factor"] = 1.0 extra_kwargs["moe_pad_expert_input_to_capacity"] = True From 473e283ccacbaa252565ace5beea9e2dccd15821 Mon Sep 17 00:00:00 2001 From: Li Jinliang Date: Wed, 4 Feb 2026 12:22:58 +0800 Subject: [PATCH 049/231] Support EP with HSDP (#2840) Signed-off-by: jinliangl Co-authored-by: Jinliang Li Co-authored-by: Jinliang Li --- .../distributed/fsdp/mcore_fsdp_adapter.py | 54 ++++++++++++++++--- megatron/core/distributed/fsdp/src/README.md | 14 +++-- .../fsdp/src/megatron_fsdp/fully_shard.py | 
5 ++ .../megatron_fsdp/param_and_grad_buffer.py | 4 +- .../fsdp/src/megatron_fsdp/utils.py | 42 ++++++++++++--- 5 files changed, 99 insertions(+), 20 deletions(-) diff --git a/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py b/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py index d6384e70488..5bf543fdc5c 100644 --- a/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +++ b/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py @@ -212,6 +212,13 @@ def _init_dist_index(self, pg_collection): hybrid_fsdp_group = parallel_state.get_data_parallel_group( with_context_parallel=True, partial_data_parallel=False ) + expt_dp_group = parallel_state.get_expert_data_parallel_group( + partial_expert_data_parallel=True + ) + hybrid_fsdp_expt_group = parallel_state.get_expert_data_parallel_group( + partial_expert_data_parallel=False + ) + ep_group = parallel_state.get_expert_model_parallel_group() else: dp_cp_group = parallel_state.get_data_parallel_group( with_context_parallel=True, partial_data_parallel=False @@ -227,6 +234,10 @@ def _init_dist_index(self, pg_collection): dp_cp_group = pg_collection.intra_dp_cp outer_fsdp_group = pg_collection.inter_dist_opt hybrid_fsdp_group = pg_collection.dp_cp + # This has not been tested yet. 
+ expt_dp_group = getattr(pg_collection, 'intra_expt_dp', None) + hybrid_fsdp_expt_group = getattr(pg_collection, 'expt_dp', None) + ep_group = getattr(pg_collection, 'ep', None) else: dp_cp_group = pg_collection.dp_cp outer_fsdp_group = None @@ -243,6 +254,18 @@ def _init_dist_index(self, pg_collection): expt_tp_group = single_rank_group if enable_hsdp: + if expt_dp_group is not None: + expt_mesh = _get_hsdp_tp_mesh( + outer_fsdp_group, expt_dp_group, expt_tp_group, ep_size=ep_group.size() + ) + expt_device_mesh = DeviceMesh.from_group( + [outer_fsdp_group, expt_dp_group, expt_tp_group], + device_type="cuda", + mesh=expt_mesh.tolist(), + mesh_dim_names=["outer_fsdp_dp", "dp_cp", "tp"], + ) + else: + expt_device_mesh = None mesh = _get_hsdp_tp_mesh(outer_fsdp_group, dp_cp_group, tp_group) dist_index = FSDPDistributedIndex( hsdp_outer_dp_shard=self.ddp_config.outer_dp_sharding_strategy != "no_shard", @@ -256,6 +279,8 @@ def _init_dist_index(self, pg_collection): dp_shard_dim="dp_cp", tp_dim="tp", hybrid_fsdp_group=hybrid_fsdp_group, + hybrid_fsdp_expt_group=hybrid_fsdp_expt_group, + expt_device_mesh=expt_device_mesh, ) else: if ep_group is not None: @@ -308,22 +333,24 @@ def sync_rng_states_across_tp_group(self): _load_rng_state_dict(broadcast_list[0]) -def _get_hsdp_tp_mesh(outer_fsdp_dp_group, dp_cp_group, tp_group): +def _get_hsdp_tp_mesh(outer_fsdp_dp_group, dp_cp_group, tp_group, ep_size=1): assert HAVE_EINOPS, "einops is not installed. Please install it with `pip install einops`." 
world_size = dist.get_world_size() mesh = einops.rearrange( torch.arange(world_size), - "(outer_fsdp_dp fsdp tp) -> outer_fsdp_dp fsdp tp", + "(outer_fsdp_dp fsdp ep tp) -> ep outer_fsdp_dp fsdp tp", outer_fsdp_dp=outer_fsdp_dp_group.size(), tp=tp_group.size(), + ep=ep_size, ) mesh_fsdp_ranks = einops.rearrange( mesh, - 'outer_fsdp_dp fsdp tp -> (outer_fsdp_dp tp) fsdp', + 'ep outer_fsdp_dp fsdp tp -> (outer_fsdp_dp ep tp) fsdp', tp=tp_group.size(), fsdp=dp_cp_group.size(), + ep=ep_size, ) fsdp_group_ranks = dist.get_process_group_ranks(dp_cp_group) assert _check_mesh_ranks_and_group_ranks_are_consistent(mesh_fsdp_ranks, fsdp_group_ranks), ( @@ -333,7 +360,7 @@ def _get_hsdp_tp_mesh(outer_fsdp_dp_group, dp_cp_group, tp_group): mesh_tp_ranks = einops.rearrange( mesh, - 'outer_fsdp_dp fsdp tp -> (outer_fsdp_dp fsdp) tp', + 'ep outer_fsdp_dp fsdp tp -> (outer_fsdp_dp fsdp ep) tp', tp=tp_group.size(), fsdp=dp_cp_group.size(), ) @@ -345,9 +372,10 @@ def _get_hsdp_tp_mesh(outer_fsdp_dp_group, dp_cp_group, tp_group): mesh_outer_fsdp_dp_ranks = einops.rearrange( mesh, - 'outer_fsdp_dp fsdp tp -> (fsdp tp) outer_fsdp_dp', + 'ep outer_fsdp_dp fsdp tp -> (fsdp ep tp) outer_fsdp_dp', tp=tp_group.size(), fsdp=dp_cp_group.size(), + ep=ep_size, ) outer_fsdp_dp_group_ranks = dist.get_process_group_ranks(outer_fsdp_dp_group) assert _check_mesh_ranks_and_group_ranks_are_consistent( @@ -357,7 +385,21 @@ def _get_hsdp_tp_mesh(outer_fsdp_dp_group, dp_cp_group, tp_group): f"do not match the ranks in the Outer FSDP DP group {outer_fsdp_dp_group_ranks}." ) - return mesh + # Exclude the expert parallel dimension + rank = dist.get_rank() + dp_tp_meshes = [per_ep_mesh for per_ep_mesh in mesh if rank in per_ep_mesh.reshape(-1).tolist()] + assert ( + len(dp_tp_meshes) == 1 + ), f"[Megatron-FSDP] Current rank {rank} is not unique in the mesh ranks {mesh.tolist()}." 
+ assert ( + len(dp_tp_meshes[0].reshape(-1).tolist()) + == outer_fsdp_dp_group.size() * dp_cp_group.size() * tp_group.size() + ), ( + f"[Megatron-FSDP] DP-TP mesh size {len(dp_tp_meshes[0].reshape(-1).tolist())} " + f"does not match the expected size" + f"{outer_fsdp_dp_group.size() * dp_cp_group.size() * tp_group.size()}." + ) + return dp_tp_meshes[0] def _get_dp_tp_mesh(dp_cp_group, tp_group, ep_size=1): diff --git a/megatron/core/distributed/fsdp/src/README.md b/megatron/core/distributed/fsdp/src/README.md index bc4cdaa078e..75cb7c45613 100644 --- a/megatron/core/distributed/fsdp/src/README.md +++ b/megatron/core/distributed/fsdp/src/README.md @@ -156,12 +156,13 @@ device_mesh[("dp_outer", "dp_shard", "cp")]._flatten("hsdp") hsdp_group = device_mesh["hsdp"].get_group() # Initialize DeviceMesh for expert parallel (EP) modules when using FSDP + EP. -expt_device_mesh = DeviceMesh.from_group( - [expt_dp_group, expt_tp_group], - device_type="cuda", - mesh=expt_mesh.tolist(), - mesh_dim_names=["dp_shard_cp", "tp"], +expert_device_mesh = torch.distributed.device_mesh.init_device_mesh( + "cuda", + mesh_shape=(dp_outer_size, expt_dp_shard_size, expt_tp_size), + mesh_dim_names=("dp_outer", "dp_shard_cp", "tp"), ) +expert_device_mesh[("dp_outer", "dp_shard_cp")].flatten("hsdp") +hsdp_expt_group = expert_device_mesh["hsdp"].get_group() ``` ### Convert models into fully-sharded `MegatronFSDP` models with `fully_shard_model`. @@ -186,6 +187,8 @@ model = fully_shard_model( tp_dim="tp", # Only required when using HSDP. Otherwise, set this to None. hybrid_fsdp_group=hsdp_group, + # Only required when using HSDP + EP. Otherwise, set this to None. + hybrid_fsdp_expt_group=hsdp_expt_group, # Only required for FSDP + EP. Otherwise, set this to None. 
expt_device_mesh=expt_device_mesh, # FSDP Sharding Strategy: no_shard (0) / optim (1) / optim_grads (2) / optim_grads_params (3) @@ -295,6 +298,7 @@ Megatron-FSDP's `fully_shard_*` API has a comprehensive set of arguments for fin - `tp_dim` is the name of the sub-mesh used for tensor parallelism (TP), which is required for `(FSDP, TP)`-strided sharding when using Megatron-LM or Torch-native `DTensor` TP. - For more information about tensor parallelism, refer to: [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053). - `hybrid_fsdp_group` is the `ProcessGroup` which contains all ranks in the flattened `dp_shard_dim` and `dp_outer_dim` sub-meshes utilized to specify the `(DP-Outer, DP-Shard)` sharded mesh coordinates for the weight and gradient buffers. Required for HSDP. + - `hybrid_fsdp_expt_group` defines the data-parallel communication group for expert parameters. It is required for HSDP. - `expt_device_mesh` is another [`torch.distributed.DeviceMesh`](https://docs.pytorch.org/docs/stable/distributed.html#devicemesh) tailored for the expert parallel (EP) modules in `MegatronFSDP`. - `dp_shard_dim` is the name of the sub-mesh required for FSDP sharding of the EP modules, enabling expert data parallelism (EDP). - `tp_dim` is the name of the sub-mesh used for expert tensor parallelism (ETP), which is required for `(FSDP, ETP)`-strided sharding when using Megatron-LM or Torch-native `DTensor` ETP. 
diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py index bcfbefcbd3b..9b89055d450 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py @@ -77,6 +77,7 @@ def fully_shard_model( dp_outer_dim: Optional[str] = None, tp_dim: Optional[str] = None, hybrid_fsdp_group: Optional[torch.distributed.ProcessGroup] = None, + hybrid_fsdp_expt_group: Optional[torch.distributed.ProcessGroup] = None, expt_device_mesh: Optional[DeviceMesh] = None, fsdp_unit_modules: Optional[Sequence[Type[torch.nn.Module]] | Sequence[str]] = None, zero_dp_strategy: str | int = 3, @@ -358,6 +359,8 @@ class that schedules the sharding lifecycle of the model parameters and gradient tp_dim=tp_dim, # Only required for HSDP. hybrid_fsdp_group=hybrid_fsdp_group, + # Only required for HSDP + EP. + hybrid_fsdp_expt_group=hybrid_fsdp_expt_group, # Access to flattened DP rank assignments for HSDP. hsdp_outer_dp_shard=_outer_fsdp_sharding, # Only required for Megatron-FSDP + EP. 
@@ -527,6 +530,7 @@ def fully_shard( dp_outer_dim: Optional[str] = None, tp_dim: Optional[str] = None, hybrid_fsdp_group: Optional[torch.distributed.ProcessGroup] = None, + hybrid_fsdp_expt_group: Optional[torch.distributed.ProcessGroup] = None, expt_device_mesh: Optional[DeviceMesh] = None, fsdp_unit_modules: Optional[Sequence[Type[torch.nn.Module]] | Sequence[str]] = None, zero_dp_strategy: str | int = 3, @@ -575,6 +579,7 @@ def fully_shard( dp_outer_dim=dp_outer_dim, tp_dim=tp_dim, hybrid_fsdp_group=hybrid_fsdp_group, + hybrid_fsdp_expt_group=hybrid_fsdp_expt_group, expt_device_mesh=expt_device_mesh, fsdp_unit_modules=fsdp_unit_modules, zero_dp_strategy=zero_dp_strategy, diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py index fe25026b22e..a0133912069 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py @@ -1954,7 +1954,9 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): hsdp_buf_dp_group = self.dist_index.get_fsdp_group( is_expert_parallel=group.is_expert_param ) - main_buf_extra_kwargs["dp_rank"] = self.dist_index.get_logical_hybrid_fsdp_rank() + main_buf_extra_kwargs["dp_rank"] = self.dist_index.get_logical_hybrid_fsdp_rank( + is_expert_parallel=group.is_expert_param + ) else: main_buf_dp_group = self.dist_index.get_fsdp_group( is_expert_parallel=group.is_expert_param diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py index d5fbc91fcf8..ad29956e1b0 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py @@ -450,6 +450,7 @@ def __init__( dp_outer_dim: Optional[str] = None, tp_dim: Optional[str] = None, hybrid_fsdp_group: 
Optional[torch.distributed.ProcessGroup] = None, + hybrid_fsdp_expt_group: Optional[torch.distributed.ProcessGroup] = None, hsdp_outer_dp_shard: bool = False, expt_device_mesh: Optional[DeviceMesh] = None, ): @@ -464,6 +465,9 @@ def __init__( hybrid_fsdp_group (Optional[torch.distributed.ProcessGroup]): The process group for hybrid FSDP communication, which is the flattened combination of the dp_outer and dp_shard process groups. + hybrid_fsdp_expt_group (Optional[torch.distributed.ProcessGroup]): The + process group for hybrid FSDP expert communication, which is the flattened + combination of the expert dp_outer and expert dp_shard process groups. hsdp_outer_dp_shard (bool): Whether to have outer DP group sharding in hybrid FSDP. Specifying outer sharding will lift the bucket sharding coordinate system to flattened ranks of (dp_shard, dp_outer) instead of @@ -509,6 +513,7 @@ def __init__( # Save a reference to the overall HSDP process group, which is the flattened # combination of the outer-FSDP and FSDP process groups. self.hybrid_fsdp_group = hybrid_fsdp_group + self.hybrid_fsdp_expt_group = hybrid_fsdp_expt_group # Retrieve the expert parallel process groups from the DeviceMesh. self.expt_fsdp_group = ( @@ -518,6 +523,13 @@ def __init__( else None ) + self.expt_outer_fsdp_group = ( + self.expt_device_mesh[self.dp_outer_dim].get_group() + if self.expt_device_mesh is not None + and contains_submesh(self.expt_device_mesh, self.dp_outer_dim) + else None + ) + """ Megatron-FSDP is responsible for storing all required DeviceMesh as per best practices recommended by the DeviceMesh API. 
@@ -558,6 +570,8 @@ def register_submesh(device_mesh, submesh, is_expert_parallel): register_submesh(self.expt_device_mesh, tp_submesh, True) register_submesh(self.expt_device_mesh, fsdp_tp_submesh, True) register_submesh(self.expt_device_mesh, fsdp_submesh, True) + register_submesh(self.expt_device_mesh, hsdp_submesh, True) + register_submesh(self.expt_device_mesh, hsdp_tp_submesh, True) # Validate FSDP arguments. if self.fsdp_group is None: @@ -629,6 +643,8 @@ def get_submesh( def get_dp_group(self, is_expert_parallel: bool = False) -> ProcessGroup: """Get the data parallel process group.""" if is_expert_parallel: + if self.use_hybrid_fsdp: + return self.hybrid_fsdp_expt_group return self.expt_fsdp_group if self.use_hybrid_fsdp: return self.hybrid_fsdp_group @@ -644,10 +660,12 @@ def get_fsdp_group( return self.fsdp_group_ag return self.fsdp_group - def get_outer_fsdp_group(self) -> ProcessGroup: + def get_outer_fsdp_group(self, is_expert_parallel: bool = False) -> ProcessGroup: """Get the outer-FSDP process group.""" if not self.use_hybrid_fsdp: return None + if is_expert_parallel: + return self.expt_outer_fsdp_group return self.outer_fsdp_group def get_root_mesh(self, is_expert_parallel: bool = False) -> DeviceMesh: @@ -659,7 +677,7 @@ def get_root_mesh(self, is_expert_parallel: bool = False) -> DeviceMesh: return self.expt_device_mesh return self.device_mesh - def get_logical_hybrid_fsdp_rank(self): + def get_logical_hybrid_fsdp_rank(self, is_expert_parallel: bool = False): """ Returns the logical rank of the current process within the full-shard hybrid FSDP group. @@ -679,20 +697,28 @@ def get_logical_hybrid_fsdp_rank(self): self.hsdp_outer_dp_shard ), "get_logical_hybrid_fsdp_rank is only valid when full-shard hybrid FSDP is enabled." 
- if not hasattr(self, "_hybrid_fsdp_group_ranks"): - dp_world_size = self.get_dp_group().size() + _hybrid_fsdp_group_name = ( + "_hybrid_fsdp_group_ranks" + if not is_expert_parallel + else "_hybrid_fsdp_expt_group_ranks" + ) + + if not hasattr(self, _hybrid_fsdp_group_name): + dp_world_size = self.get_dp_group(is_expert_parallel).size() # Reorder the flat ranks: (outer_dp, inner_dp) -> (inner_dp, outer_dp) mesh = einops.rearrange( torch.arange(dp_world_size), "(outer_dp inner_dp) -> (inner_dp outer_dp)", - outer_dp=self.outer_fsdp_group.size(), - inner_dp=self.fsdp_group.size(), + outer_dp=self.get_outer_fsdp_group(is_expert_parallel).size(), + inner_dp=self.get_fsdp_group(is_expert_parallel).size(), ) - self._hybrid_fsdp_group_ranks = mesh.tolist() + setattr(self, _hybrid_fsdp_group_name, mesh.tolist()) # Find the index for the current rank in the hybrid group - return self._hybrid_fsdp_group_ranks.index(self.hybrid_fsdp_group.rank()) + return getattr(self, _hybrid_fsdp_group_name).index( + self.get_dp_group(is_expert_parallel).rank() + ) class GlobalMemoryBuffer: From 4a239726348eb43df6ea18c98fab122230bf7745 Mon Sep 17 00:00:00 2001 From: Li Tao Date: Wed, 4 Feb 2026 12:24:27 +0800 Subject: [PATCH 050/231] [Main] Add the missing part to support 1F1B overlap for Qwen3-Next (#2997) --- megatron/core/ssm/gated_delta_net.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/megatron/core/ssm/gated_delta_net.py b/megatron/core/ssm/gated_delta_net.py index 70e749724dc..7b1149a781d 100644 --- a/megatron/core/ssm/gated_delta_net.py +++ b/megatron/core/ssm/gated_delta_net.py @@ -508,6 +508,19 @@ def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None, tp_gr return sharded_state_dict + def backward_dw(self): + """Execute weight gradient computation for all linear layers.""" + self._backward_in_proj() + self._backward_out_proj() + + def _backward_in_proj(self): + """Computes weight gradients of input projection layer.""" + 
self.in_proj.backward_dw() + + def _backward_out_proj(self): + """Computes weight gradients of output projection layer.""" + self.out_proj.backward_dw() + def _split_tensor_factory( orig_sh_ten: ShardedTensor, split_sections: List[int], split_names: List[str], split_dim: int From c036e77bb5123e495e400b5009587c903ea44448 Mon Sep 17 00:00:00 2001 From: Parth Mannan <38387286+parthmannan@users.noreply.github.com> Date: Wed, 4 Feb 2026 00:01:26 -0800 Subject: [PATCH 051/231] Missing import fix (#3241) --- megatron/core/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/core/utils.py b/megatron/core/utils.py index dac1beb7075..467f4ff3cce 100644 --- a/megatron/core/utils.py +++ b/megatron/core/utils.py @@ -2082,8 +2082,8 @@ def get_thd_batch_on_this_cp_rank( max_seqlen_kv=int(max_seqlen[0].item()), ) - cp_size = get_context_parallel_world_size() if cp_size is None else cp_size - cp_rank = get_context_parallel_rank() if cp_rank is None else cp_rank + cp_size = parallel_state.get_context_parallel_world_size() if cp_size is None else cp_size + cp_rank = parallel_state.get_context_parallel_rank() if cp_rank is None else cp_rank if cp_size > 1: # slice batch along sequence dimension for context parallelism assert tex is not None and is_te_min_version("1.10.0"), ( "Please update Transformer Engine to >= 1.10 to use " From 43db8c1cf98f59d7c2fb84fbbb39c1ab05a68c6c Mon Sep 17 00:00:00 2001 From: Keshav Santhanam Date: Wed, 4 Feb 2026 03:31:50 -0800 Subject: [PATCH 052/231] Miscellaneous inference cleanup (Replay of !2955) (#3232) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Keshav Santhanam Co-authored-by: oliver könig --- .../inference/gpt/gpt_dynamic_inference.py | 339 ++++------------ .../gpt_dynamic_inference_with_coordinator.py | 117 ++---- .../inference/gpt/gpt_static_inference.py | 71 +--- examples/inference/gpt/utils.py | 232 +++-------- examples/rl/README.md | 2 +- 
.../rl/model_configs/llama3p1_8b_instruct.sh | 3 +- examples/rl/model_configs/nemotron5_56b.sh | 2 +- examples/rl/model_configs/nemotron5_8b.sh | 2 +- .../rl/model_configs/nemotron5p5_12b_H.sh | 2 +- examples/rl/model_configs/nemotron6_3b_moe.sh | 2 +- .../rl/model_configs/qwen3_30b_a3b_moe.sh | 2 +- examples/rl/model_configs/qwen3_32b.sh | 2 +- examples/rl/model_configs/qwen3_4b.sh | 2 +- examples/rl/model_configs/qwen3_8b.sh | 2 +- examples/rl/model_configs/qwen_2p5_32b.sh | 2 +- examples/rl/model_configs/qwen_2p5_3b.sh | 2 +- .../rl/model_configs/qwen_2p5_distill_7b.sh | 2 +- examples/rl/model_configs/qwen_2p5_math_7b.sh | 2 +- megatron/core/inference/config.py | 186 +++++++++ .../attention_context/mamba_metadata.py | 26 +- .../core/inference/contexts/base_context.py | 8 +- .../inference/contexts/dynamic_context.py | 367 ++++++------------ .../core/inference/contexts/static_context.py | 14 +- .../core/inference/engines/dynamic_engine.py | 101 ++--- .../core/inference/engines/static_engine.py | 72 ++-- .../abstract_model_inference_wrapper.py | 220 +++-------- .../gpt/gpt_inference_wrapper.py | 23 +- .../inference_wrapper_config.py | 66 ---- .../t5/t5_inference_wrapper.py | 7 +- .../simple_text_generation_controller.py | 5 - .../text_generation_controller.py | 90 ++--- megatron/core/models/gpt/gpt_model.py | 4 +- megatron/core/models/mamba/mamba_model.py | 4 +- megatron/core/ssm/mamba_layer.py | 5 +- .../core/transformer/transformer_config.py | 9 + megatron/core/utils.py | 70 ++-- megatron/inference/__init__.py | 1 + megatron/inference/utils.py | 320 +++++++++++++++ megatron/rl/inference/megatron.py | 167 +------- megatron/training/arguments.py | 18 +- .../model_config.yaml | 1 + .../model_config.yaml | 1 + .../model_config.yaml | 1 + .../model_config.yaml | 1 + .../model_config.yaml | 1 + .../model_config.yaml | 1 + .../model_config.yaml | 1 + .../recipes/h100/gpt-static-inference.yaml | 6 +- .../recipes/h100/moe-static-inference.yaml | 6 +- 
.../contexts/test_dynamic_context.py | 122 ++++-- .../inference/engines/test_dynamic_engine.py | 71 +--- .../inference/engines/test_static_engine.py | 20 +- .../gpt/test_gpt_inference_wrapper.py | 61 +-- .../t5/test_t5_inference_wrapper.py | 19 +- .../test_model_inference_wrapper_config.py | 21 - .../inference/test_inference_config.py | 17 + .../inference/test_wandb_logging.py | 73 ++-- ...oder_decoder_text_generation_controller.py | 19 +- ....py => test_text_generation_controller.py} | 43 +- .../test_vlm_text_generation_controller.py | 19 +- tests/unit_tests/models/test_gpt_model.py | 21 +- .../models/test_gpt_model_batch_invariant.py | 80 ++-- tests/unit_tests/models/test_mamba_model.py | 29 +- .../unit_tests/models/test_mamba_moe_model.py | 2 + tools/run_dynamic_text_generation_server.py | 42 +- tools/run_inference_performance_test.py | 180 ++------- tools/run_text_generation_server.py | 27 +- train_rl.py | 3 + 68 files changed, 1372 insertions(+), 2087 deletions(-) create mode 100644 megatron/core/inference/config.py delete mode 100644 megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py delete mode 100644 megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py create mode 100644 megatron/inference/__init__.py create mode 100644 megatron/inference/utils.py delete mode 100644 tests/unit_tests/inference/model_inference_wrappers/test_model_inference_wrapper_config.py create mode 100644 tests/unit_tests/inference/test_inference_config.py rename tests/unit_tests/inference/text_generation_controllers/{test_simple_text_generation_controller.py => test_text_generation_controller.py} (96%) diff --git a/examples/inference/gpt/gpt_dynamic_inference.py b/examples/inference/gpt/gpt_dynamic_inference.py index 88b744b3ac0..7fcac70c11a 100644 --- a/examples/inference/gpt/gpt_dynamic_inference.py +++ b/examples/inference/gpt/gpt_dynamic_inference.py @@ -1,40 +1,31 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# pylint: disable=bad-builtin + import hashlib import io import json -import math import os -import pickle import sys import warnings -import torch -from argparse import ArgumentParser from collections import defaultdict -from functools import partial +from typing import Dict, List, Optional + +import torch from tqdm import tqdm -from typing import Dict, List, Tuple, Optional sys.path.append( os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)) ) -import megatron from examples.inference.gpt.utils import ( Request, - add_common_inference_args, build_dynamic_engine_setup_prefix, build_requests, get_curr_time, get_global_peak_memory_stats_bytes, ) -from megatron.core.inference.contexts.dynamic_context import ( - ContextOverflowError, - DynamicInferenceContext, -) -from megatron.core.inference.contexts.attention_context.mamba_metadata import ( - MambaInferenceStateConfig, -) +from megatron.core.inference.contexts.dynamic_context import DynamicInferenceContext from megatron.core.inference.engines import DynamicInferenceEngine, EngineSuspendedError from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, @@ -44,194 +35,26 @@ TextGenerationController, ) from megatron.core.tokenizers.text.utils.build_tokenizer import build_tokenizer -from megatron.core.transformer.module import MegatronModule -from megatron.core.utils import get_mamba_inference_state_config_from_model +from megatron.inference.utils import ( + add_inference_args, + get_inference_config_from_model_and_args, + get_model_for_inference, +) sys.path.append( os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)) ) -from megatron.training import get_args, get_model as _get_model, get_tokenizer, initialize_megatron -from megatron.training.checkpointing import load_checkpoint -from model_provider import model_provider -from gpt_builders import gpt_builder -from 
mamba_builders import mamba_builder +import logging +import megatron from megatron.core.utils import configure_nvtx_profiling -import logging +from megatron.training import get_args, get_tokenizer, initialize_megatron torch.serialization.add_safe_globals([io.BytesIO]) torch.serialization.add_safe_globals([megatron.core.rerun_state_machine.RerunState]) torch.serialization.add_safe_globals([megatron.core.rerun_state_machine.RerunDiagnostic]) -def add_dynamic_inference_args(parser: ArgumentParser) -> ArgumentParser: - """Dynamic inference arguments.""" - - add_common_inference_args(parser) - - group = parser.add_argument_group(title='Dynamic inference') - group.add_argument( - "--inference-ckpt-non-strict", - action="store_true", - help="Load checkpoint with `strict=False`.", - ) - group.add_argument( - "--termination-id", type=int, default=None, - help="Termination ID that overrides `tokenizer.eod`.", - ) - group.add_argument( - "--suspend-resume-interval", type=int, default=None, - help="Suspend and resume the dynamic engine every " - "`suspend_resume_interval` steps. This is used to tet the suspend/resume " - "system.", - ) - group.add_argument( - "--inference-repeat-n", type=int, default=1, - help="Repeat inference iterations N times for benchmarking." - ) - group.add_argument( - "--throughput-check-only", - action='store_true', - default=False, - help="If true, only run throughput check without verifying outputs." - ) - - return parser - - -def get_model() -> MegatronModule: - """Initialize model and load checkpoint.""" - - args = get_args() - - if args.model_provider == "gpt": - model_builder = gpt_builder - elif args.model_provider == "mamba": - model_builder = mamba_builder - else: - raise ValueError(f"Invalid model provider {args.model_provider}") - - # Build model. - model = _get_model( - partial(model_provider, model_builder), - wrap_with_ddp=False - ) - - # Load checkpoint. 
- assert args.load is not None - args.exit_on_missing_checkpoint = True - load_checkpoint( - ddp_model=model, - optimizer=None, - opt_param_scheduler=None, - strict=not args.inference_ckpt_non_strict, - ) - - # No virtual PP. - assert len(model) == 1, "Above condition should have caught this" - model = model[0] - - # Eval mode. - model.eval() - - return model - - -def get_inference_context( - requests: List[Request], - sampling_params: Optional[SamplingParams] = None, - calculate_max_sequence_length_from_requests: bool = True, - mamba_inference_state_config: Optional[MambaInferenceStateConfig] = None, -): - """The inference context manages the KV cache and other inference state.""" - - args = get_args() - - # Max sequence length. - if calculate_max_sequence_length_from_requests: - max_gen_length = sampling_params.num_tokens_to_generate - max_context_length = max(len(r.prompt_tokens) for r in requests) - max_sequence_length = max_context_length + max_gen_length - else: - max_sequence_length = args.inference_max_seq_length - - metrics_writer = None - if args.inference_logging_step_interval > 0 and args.inference_wandb_logging: - metrics_writer = get_wandb_writer() - - # Inference context. 
- context = DynamicInferenceContext( - params_dtype=args.params_dtype, - num_layers=args.num_layers // args.pipeline_model_parallel_size, - kv_channels=args.kv_channels, - num_attention_heads=( - args.num_query_groups if args.group_query_attention else args.num_attention_heads - ), - max_sequence_length=max_sequence_length, - num_cuda_graphs=( - args.inference_dynamic_batching_num_cuda_graphs - if args.cuda_graph_impl == "local" - else None - ), - block_size_tokens=args.inference_dynamic_batching_block_size, - buffer_size_gb=args.inference_dynamic_batching_buffer_size_gb, - paused_buffer_size_gb=args.inference_dynamic_batching_paused_buffer_size_gb, - max_requests=args.inference_dynamic_batching_max_requests, - max_tokens=args.inference_dynamic_batching_max_tokens, - tensor_model_parallel_size=args.tensor_model_parallel_size, - pipeline_model_parallel_size=args.pipeline_model_parallel_size, - materialize_only_last_token_logits=not args.return_log_probs, - mamba_inference_state_config=mamba_inference_state_config, - cache_mla_latent=args.multi_latent_attention and args.cache_mla_latents, - kv_lora_rank=args.kv_lora_rank if args.multi_latent_attention else None, - qk_pos_emb_head_dim=args.qk_pos_emb_head_dim, - use_cuda_graphs_for_non_decode_steps=not args.decode_only_cuda_graphs, - use_flashinfer_fused_rope=args.use_flashinfer_fused_rope, - unified_memory_level=args.inference_dynamic_batching_unified_memory_level, - cuda_graph_max_tokens=args.inference_dynamic_batching_cuda_graph_max_tokens, - cuda_graph_mixed_prefill_count=args.inference_dynamic_batching_cuda_graph_mixed_prefill_count, - metrics_writer=metrics_writer, - offload_kv_cache=args.rl_offload_kv_cache_during_training - ) - - return context - - -def get_inference_controller( - model: MegatronModule, context: DynamicInferenceContext -) -> TextGenerationController: - """Buid text generation controller, which manages the model inference context. - - Args: - model (MegatronModule): Megatron GPT model. 
- context (DynamicInferenceContext): Context for managing KV cache blocks. - - Return: - (TextGenerationController) Inference text generation controller. - """ - - args = get_args() - if args.legacy_tokenizer: - tokenizer = get_tokenizer() - else: - tokenizer = build_tokenizer(args) - - # Wrap model in inference wrapper. - model = GPTInferenceWrapper(model, args, context) - - # Note: the following is taken from AbstractModelInferenceWrapper.prep_model_for_inference(). - from megatron.core import parallel_state - - model.model_is_pipeline_parallel = not ( - parallel_state.is_pipeline_first_stage() and parallel_state.is_pipeline_last_stage() - ) - - # Text generation controller. - controller = TextGenerationController(model, tokenizer) - - return controller - - def run_inference( requests: List[Request], engine: DynamicInferenceEngine, @@ -284,11 +107,7 @@ def _add_request(): """ nonlocal num_requests_added _request = requests[num_requests_added] - engine.add_request( - num_requests_added, - _request.prompt_text, - _request.sampling_params, - ) + engine.add_request(num_requests_added, _request.prompt_text, _request.sampling_params) _request.time_start = get_curr_time() _request.state = "started" num_requests_added += 1 @@ -305,10 +124,9 @@ def _add_request(): _add_request() else: # Add deterministic number of requests (generally used for debugging). - for i in range(min( - args.incoming_requests_per_step, - num_requests_total - num_requests_added, - )): + for i in range( + min(args.incoming_requests_per_step, num_requests_total - num_requests_added) + ): _add_request() add_times.append(get_curr_time() - add_start) @@ -318,11 +136,12 @@ def _add_request(): result = engine.step_modern() except EngineSuspendedError as e: result = e - pass # ignore error in order to call 'engine.resume()' below. + pass # ignore error in order to call 'engine.resume()' below. 
attempted_step_count += 1 - # After step, we lost track of last iteration's is_decode_only, so we need to get it from the engine - is_decode_only = engine.is_decode_only + # After step, we lost track of last iteration's is_decode_only, + # so we need to get it from the engine + is_decode_only = engine.is_decode_only # Test suspending and resuming engine. if args.suspend_resume_interval is not None: @@ -335,9 +154,9 @@ def _add_request(): # Resume, 0+ attempted steps later. if ( attempted_step_count > 0 - and - (attempted_step_count - args.suspend_resume_interval // 2) - % args.suspend_resume_interval == 0 + and (attempted_step_count - args.suspend_resume_interval // 2) + % args.suspend_resume_interval + == 0 ): print("**** step %d/%d ... resume." % (engine.step_count, attempted_step_count)) engine.resume() @@ -349,7 +168,9 @@ def _add_request(): # Record cuda_graph_request_count. cuda_graph_request_count = result["cuda_graph_request_count"] if args.cuda_graph_impl == "local" and cuda_graph_request_count is not None: - cuda_graph_request_count_map[cuda_graph_request_count] = cuda_graph_request_count_map.get(cuda_graph_request_count, 0) + 1 + cuda_graph_request_count_map[cuda_graph_request_count] = ( + cuda_graph_request_count_map.get(cuda_graph_request_count, 0) + 1 + ) # Update requests. active_request_ids = result["active_request_ids"] @@ -408,29 +229,29 @@ def _add_request(): engine.resume() return { - "step_times" : step_times, - "add_times" : add_times, - "output_times" : output_times, - "total_output_tokens" : total_output_tokens, - "cuda_graph_request_count_map" : cuda_graph_request_count_map, + "step_times": step_times, + "add_times": add_times, + "output_times": output_times, + "total_output_tokens": total_output_tokens, + "cuda_graph_request_count_map": cuda_graph_request_count_map, } @torch.inference_mode() def main(): - + """Run dynamic inference.""" # Initialize Megatron. 
initialize_megatron( - extra_args_provider=add_dynamic_inference_args, + extra_args_provider=add_inference_args, args_defaults={'no_load_rng': True, 'no_load_optim': True}, ) # Start Nsight profiler. if os.environ.get("NSIGHT_PREFIX"): torch.cuda.cudart().cudaProfilerStart() - - level_str = os.getenv("LOG_LEVEL", "INFO").upper() - level = getattr(logging, level_str, logging.INFO) + + level_str = os.getenv("LOG_LEVEL", "INFO").upper() + level = getattr(logging, level_str, logging.INFO) logging.basicConfig(level=level, force=True) configure_nvtx_profiling(True) @@ -456,42 +277,36 @@ def main(): termination_id=args.termination_id if args.termination_id is not None else tokenizer.eod, top_n_logprobs=args.top_n_logprobs, stop_words=args.stop_words, - ) - - model = get_model() + ) - mamba_inference_state_config = get_mamba_inference_state_config_from_model(model) + model = get_model_for_inference() # Requests, context, controller. requests = build_requests(args, tokenizer, sampling_params) - context = get_inference_context( - requests, - sampling_params, - mamba_inference_state_config=mamba_inference_state_config, - ) - controller = get_inference_controller(model, context) + inference_config = get_inference_config_from_model_and_args(model, args) + + # Calculate max_sequence_length from requests + max_gen_length = sampling_params.num_tokens_to_generate + max_context_length = max(len(r.prompt_tokens) for r in requests) + inference_config.max_sequence_length = max_context_length + max_gen_length + context = DynamicInferenceContext(model.config, inference_config) + wrapped_model = GPTInferenceWrapper(model, context) + controller = TextGenerationController(wrapped_model, tokenizer) # Validate all context_length's <= max_tokens. 
- if args.disable_chunked_prefill: + if not args.enable_chunked_prefill: invalid_prompt_length_map = {} for request_idx, request in enumerate(requests): if len(request.prompt_tokens) > context.max_tokens: invalid_prompt_length_map[request_idx] = len(request.prompt_tokens) - assert not invalid_prompt_length_map, ( - "request idxs with prompts longer than context.max_tokens: " - ", ".join(f"{k}({v})" for k, v in invalid_prompt_length_map.items()) + assert ( + not invalid_prompt_length_map + ), "request idxs with prompts longer than context.max_tokens: " ", ".join( + f"{k}({v})" for k, v in invalid_prompt_length_map.items() ) # Inference engine. - engine = DynamicInferenceEngine( - controller, - context, - enable_cuda_graph=args.cuda_graph_impl == "local", - random_seed=args.seed, - track_paused_request_events=args.inference_dynamic_batching_track_paused_request_events, - enable_chunked_prefill=not args.disable_chunked_prefill, - inference_logging_step_interval=args.inference_logging_step_interval, - ) + engine = DynamicInferenceEngine(controller, context) setup_prefix = build_dynamic_engine_setup_prefix(args, model, context, requests) print("~~~") @@ -522,14 +337,13 @@ def main(): # Validate all requests finished. for request in requests: - assert request.state == "finished", ( - f"request.state == '{request.state}' != 'finished'." - ) + assert request.state == "finished", f"request.state == '{request.state}' != 'finished'." peak_mem_stats = get_global_peak_memory_stats_bytes() # Print unique prompts + outputs. 
if torch.distributed.get_rank() == 0: + def escape_str(s): return s.replace("\n", "\\n") @@ -547,7 +361,10 @@ def escape_str(s): # ---- Prompt summary line ---- prompt_len = len(requests[request_idxs[0]].prompt_tokens) escaped_prompt_text = escape_str(prompt_text) - print(f"\n{unique_idx+1}/{len(unique_prompt_map)} [n {len(request_idxs)}, l {prompt_len}] {escaped_prompt_text}") + print( + f"\n{unique_idx+1}/{len(unique_prompt_map)}" + f"[n {len(request_idxs)}, l {prompt_len}] {escaped_prompt_text}" + ) # ---- Group all outputs for this prompt ---- output_map = defaultdict(list) @@ -567,16 +384,17 @@ def escape_str(s): # Use hash of prompt + generated text in case engine was # suspended and resumed, which misaligns boundary between # prompt and generated tokens. - o_hash = hashlib.sha256( - (prompt_text + output_text).encode() - ).hexdigest()[:6] + o_hash = hashlib.sha256((prompt_text + output_text).encode()).hexdigest()[:6] o_len = len(requests[output_request_idxs[0]].output_tokens) escaped_output_text = escape_str(output_text) else: o_hash = "--" o_len = 0 escaped_output_text = "--" - print(f" >>>> [n {len(output_request_idxs)}, {o_len} tokens, hash {o_hash}{', ' if evicted else ''}] {escaped_output_text}") + print( + f" >>>> [n {len(output_request_idxs)}, {o_len} tokens, hash {o_hash}" + f"{', ' if evicted else ''}] {escaped_output_text}" + ) text_hashes.append(o_hash) # Write results to JSON. Primarily used for functional testing. 
@@ -592,14 +410,16 @@ def escape_str(s): "generated_text": req.output_text, "generated_tokens": req.output_tokens, "latency": req.time_end - req.time_start, - "cuda_graph_request_count_map" : result["cuda_graph_request_count_map"], - "step_count" : engine.step_count, - "top_n_logprobs" : getattr(req, 'generated_top_n_logprobs', None), - "prompt_top_n_logprobs" : getattr(req, 'prompt_top_n_logprobs', None), + "cuda_graph_request_count_map": result["cuda_graph_request_count_map"], + "step_count": engine.step_count, + "top_n_logprobs": getattr(req, 'generated_top_n_logprobs', None), + "prompt_top_n_logprobs": getattr(req, 'prompt_top_n_logprobs', None), } if req.sampling_params.return_log_probs: result_dict["prompt_logprobs"] = getattr(req, 'prompt_log_probs', None) - result_dict["generated_logprobs"] = getattr(req, 'generated_log_probs', None) + result_dict["generated_logprobs"] = getattr( + req, 'generated_log_probs', None + ) result_dict["logprobs"] = getattr(req, 'logprobs', None) json_results[req.request_id] = result_dict @@ -631,7 +451,7 @@ def escape_str(s): d_count = len(d_times) p_mean = p_total / p_count - d_mean = d_total / d_count if d_count != 0 else 0. + d_mean = d_total / d_count if d_count != 0 else 0.0 # Commented out for now as the step/add/output times are not calculated correctly. # print( @@ -643,18 +463,13 @@ def escape_str(s): # f"mean [ p {p_mean:.3f}s, d {d_mean:.3f}s ], " # f"count [ p {p_count}, d {d_count} ]." 
# ) - capture_str = ( - f"{engine.capture_stats['time']:.2f} sec" - if engine.capture_stats else - "--" - ) + capture_str = f"{engine.capture_stats['time']:.2f} sec" if engine.capture_stats else "--" print( - f"{setup_prefix} … " - f"throughput: {throughput:.3f} tok/s … ", + f"{setup_prefix} … " f"throughput: {throughput:.3f} tok/s … ", f"total time: {total_time:.3f}s … " f"mem {peak_alloc_gb:.1f}/{peak_resvd_gb:.1f} GB … " f"steps: {engine.step_count:d} … " - f"capture {capture_str}" + f"capture {capture_str}", ) print("~~~") diff --git a/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py b/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py index cbb7a1aa745..ab84ee5bf5c 100644 --- a/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py +++ b/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py @@ -2,43 +2,33 @@ import asyncio import json +import logging import os import time -import torch -import torch.distributed as dist +import warnings from collections import defaultdict -from tqdm import tqdm from typing import List -import warnings -import logging -from examples.inference.gpt.gpt_dynamic_inference import ( - add_dynamic_inference_args, - get_inference_context, - get_inference_controller, - get_model, -) -from examples.inference.gpt.utils import ( - Request, - build_dynamic_engine_setup_prefix, - build_requests, - add_common_inference_args -) +import torch +import torch.distributed as dist -from megatron.core import parallel_state +from examples.inference.gpt.utils import Request, build_dynamic_engine_setup_prefix, build_requests from megatron.core.inference.engines import DynamicInferenceEngine from megatron.core.inference.inference_client import InferenceClient from megatron.core.inference.inference_request import DynamicInferenceRequestRecord from megatron.core.inference.sampling_params import SamplingParams -from megatron.core.utils import get_mamba_inference_state_config_from_model - +from 
megatron.inference.utils import ( + add_inference_args, + get_dynamic_inference_engine, + get_model_for_inference, +) from megatron.training import get_args, get_tokenizer, initialize_megatron -from megatron.training.arguments import parse_args # pylint: disable=line-too-long logging.basicConfig(level=logging.INFO, force=True) + async def main( engine: DynamicInferenceEngine, requests: List[Request], @@ -51,12 +41,11 @@ async def main( "Sampling parameters are specified per request.", DeprecationWarning, ) - + # once you call engine.start_listening_to_data_parallel_coordinator, # the engine will start accepting requests from the data parallel coordinator. # and processing them in an asyncio coroutine. # leaving inference_coordinator_port as None will find a free port automatically. - dp_addr = await engine.start_listening_to_data_parallel_coordinator( inference_coordinator_port=port, launch_inference_coordinator=True, @@ -69,14 +58,11 @@ async def main( # Since the client doesn't directly call engine.async_step here, we test # the suspend-resume system ~4 times. suspend_resume_interval = max(1, len(requests) // 4) - suspend_idxs = set(range( - suspend_resume_interval, - len(requests) + 1, - suspend_resume_interval, - )) + suspend_idxs = set( + range(suspend_resume_interval, len(requests) + 1, suspend_resume_interval) + ) resume_idxs = set( - min(len(requests), i + suspend_resume_interval // 2) - for i in suspend_idxs + min(len(requests), i + suspend_resume_interval // 2) for i in suspend_idxs ) else: suspend_idxs = set() @@ -98,7 +84,10 @@ async def main( current_time = time.time_ns() / 10**9 if args.incoming_requests_per_step is None: # Only add requests that have arrived at the current time. 
- while num_requests_added < num_requests_total and requests[num_requests_added].time_arrival <= current_time: + while ( + num_requests_added < num_requests_total + and requests[num_requests_added].time_arrival <= current_time + ): request = requests[num_requests_added] # These add-request calls will queue up the request on a zmq socket and return # instantaneously. They will return an asyncio future which can be awaited for @@ -114,10 +103,9 @@ async def main( else: # Add deterministic number of requests (generally used for debugging). - for i in range(min( - args.incoming_requests_per_step, - num_requests_total - num_requests_added - )): + for i in range( + min(args.incoming_requests_per_step, num_requests_total - num_requests_added) + ): # Change sampling parameters to force different generation lengths. request = requests[num_requests_added] n = request.sampling_params.num_tokens_to_generate @@ -135,7 +123,7 @@ async def main( break # Relinquish control since there are no more requests to add at the moment. This allows the engine to run. await asyncio.sleep(0) - + # While we wait for the requests to complete, the engine runs in the background. results: List[DynamicInferenceRequestRecord] = await asyncio.gather(*futures) @@ -170,16 +158,19 @@ async def main( req = record.merge() unique_prompt_map[req.prompt].append(req) for idx, (prompt_text, reqs) in enumerate(unique_prompt_map.items()): - print(f"%d/%d. prompt '%s' ... [%d] output '%s'." % ( - idx, - len(unique_prompt_map), - prompt_text.replace("\n", "\\n"), - len(reqs), - reqs[0].generated_text.replace("\n", "\\n"), - )) + print( + f"%d/%d. prompt '%s' ... [%d] output '%s'." + % ( + idx, + len(unique_prompt_map), + prompt_text.replace("\n", "\\n"), + len(reqs), + reqs[0].generated_text.replace("\n", "\\n"), + ) + ) # kill the engines and suspend the client - # Right now, we can only call stop when all requests are done. + # Right now, we can only call stop when all requests are done. 
# Todo: Make this explicit in the Client class.... await client.stop_engines() client.stop() @@ -190,11 +181,11 @@ async def main( if __name__ == "__main__": - # enable inference mode in the very beginning as some fp-8 optimizations + # enable inference mode in the very beginning as some fp8 optimizations # check for it. with torch.inference_mode(): initialize_megatron( - extra_args_provider=add_dynamic_inference_args, + extra_args_provider=add_inference_args, args_defaults={'no_load_rng': True, 'no_load_optim': True}, ) @@ -213,34 +204,16 @@ async def main( ), ) - # Requests, context, conroller. - model = get_model() - mamba_inference_state_config = get_mamba_inference_state_config_from_model(model) + model = get_model_for_inference() + requests = ( build_requests(args, tokenizer, sampling_params) if dist.get_rank() == 0 else None ) - context = get_inference_context( - None, - None, - calculate_max_sequence_length_from_requests=False, - mamba_inference_state_config=mamba_inference_state_config, - ) - - controller = get_inference_controller(model, context) - - # Inference engine. - engine = DynamicInferenceEngine( - controller, - context, - enable_cuda_graph=args.cuda_graph_impl == "local", - random_seed=args.seed, - enable_chunked_prefill=not args.disable_chunked_prefill, - inference_logging_step_interval=args.inference_logging_step_interval, - ) + engine = get_dynamic_inference_engine(model=model) if dist.get_rank() == 0: - setup_prefix = build_dynamic_engine_setup_prefix(args, model, context, requests) + setup_prefix = build_dynamic_engine_setup_prefix(args, model, engine.context, requests) print("~~~") print(setup_prefix) print("~~~") @@ -249,13 +222,7 @@ async def main( if os.environ.get("NSIGHT_PREFIX"): torch.cuda.cudart().cudaProfilerStart() - asyncio.run( - main( - engine, - requests, - args.inference_coordinator_port, - ) - ) + asyncio.run(main(engine, requests, args.inference_coordinator_port)) # Stop Nsight profiler. 
if os.environ.get("NSIGHT_PREFIX"): diff --git a/examples/inference/gpt/gpt_static_inference.py b/examples/inference/gpt/gpt_static_inference.py index 03a60927ab2..298ebfebd86 100644 --- a/examples/inference/gpt/gpt_static_inference.py +++ b/examples/inference/gpt/gpt_static_inference.py @@ -1,21 +1,11 @@ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. import os -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) -from model_provider import model_provider -from gpt_builders import gpt_builder -from mamba_builders import mamba_builder -import torch import sys import time -import warnings -from functools import partial from argparse import Namespace import torch -import tqdm from megatron.core.inference.contexts import StaticInferenceContext from megatron.core.inference.engines import StaticInferenceEngine @@ -23,17 +13,12 @@ from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, ) -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.text_generation_controllers.text_generation_controller import ( TextGenerationController, ) from megatron.core.tokenizers.text.utils.build_tokenizer import build_tokenizer from megatron.core.transformer.module import MegatronModule -from pretrain_gpt import model_provider as gpt_model_provider -from pretrain_mamba import model_provider as mamba_model_provider sys.path.append( os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)) @@ -41,18 +26,18 @@ import asyncio import json -from typing import Any, AsyncIterator, List +from typing import List -from examples.inference.gpt.utils import add_common_inference_args, build_requests -from megatron.core import mpu -from megatron.training import get_args, get_model, 
get_tokenizer, print_rank_0 -from megatron.training.checkpointing import load_checkpoint +from examples.inference.gpt.utils import build_requests +from megatron.inference.utils import add_inference_args, get_model_for_inference +from megatron.training import get_args, get_tokenizer, print_rank_0 from megatron.training.initialize import initialize_megatron + def add_static_inference_args(parser): """Static inference arguments.""" - add_common_inference_args(parser) + add_inference_args(parser) group = parser.add_argument_group(title='Static inference') group.add_argument( @@ -83,30 +68,16 @@ def get_inference_engine(args: Namespace, model: MegatronModule) -> StaticInfere tokenizer = get_tokenizer() else: tokenizer = build_tokenizer(args) - inference_wrapper_config = InferenceWrapperConfig( - hidden_size=args.hidden_size, - inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold, - fp32_residual_connection=args.fp32_residual_connection, - params_dtype=args.params_dtype, - padded_vocab_size=args.padded_vocab_size, - inference_max_requests=args.inference_max_batch_size, - inference_max_seq_length=args.inference_max_seq_length, - nccl_all_reduce_for_prefill=args.nccl_all_reduce_for_prefill, - fp8=args.fp8, - moe_pad_experts_for_cuda_graph_inference = args.moe_pad_experts_for_cuda_graph_inference - ) - - inference_context = StaticInferenceContext.from_config(inference_wrapper_config) - - inference_wrapped_model = GPTInferenceWrapper( - model, inference_wrapper_config, inference_context + inference_context = StaticInferenceContext( + args.inference_max_requests, args.inference_max_seq_length ) + inference_wrapped_model = GPTInferenceWrapper(model, inference_context) text_generation_controller = TextGenerationController( inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer ) engine_kwargs = { - "text_generation_controller" : text_generation_controller, - "legacy" : args.use_legacy_static_engine, + "text_generation_controller": 
text_generation_controller, + "legacy": args.use_legacy_static_engine, } if not args.use_legacy_static_engine: engine_kwargs["buffer_size_gb"] = args.inference_dynamic_batching_buffer_size_gb @@ -165,22 +136,7 @@ def main(): args = get_args() - if args.max_batch_size is not None: - warnings.warn( - f"`--max-batch-size` has been deprecated in favor of `--inference-max-requests`." - ) - args.inference_max_batch_size = max(args.max_batch_size, args.inference_max_batch_size) - - # Set up model and load checkpoint - if args.model_provider == "gpt": - model_builder = gpt_builder - elif args.model_provider == "mamba": - model_builder = mamba_builder - else: - raise ValueError(f"Invalid model provider {args.model_provider}") - model = get_model(partial(model_provider, model_builder), wrap_with_ddp=False) - load_checkpoint(model, None, None, strict=False) - model = model[0] + model = get_model_for_inference() inference_engine = get_inference_engine(args, model) @@ -276,7 +232,7 @@ def main(): ) ), len(requests), - args.inference_max_batch_size, + args.inference_max_requests, stats["allocated_bytes.all.peak"] / (1024**3), stats["reserved_bytes.all.peak"] / (1024**3), latency, @@ -293,6 +249,5 @@ def main(): torch.distributed.destroy_process_group() - if __name__ == "__main__": main() diff --git a/examples/inference/gpt/utils.py b/examples/inference/gpt/utils.py index a04b856c0a6..b7a3977605c 100644 --- a/examples/inference/gpt/utils.py +++ b/examples/inference/gpt/utils.py @@ -1,158 +1,23 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
import copy -import json import itertools +import json import random import time -import torch from argparse import ArgumentParser, Namespace -from tqdm import tqdm +from functools import partial from typing import Any, List, Optional -from megatron.core.inference.inference_request import DynamicInferenceRequest +import torch +from tqdm import tqdm + from megatron.core.inference.contexts import DynamicInferenceContext from megatron.core.inference.contexts.dynamic_context import get_mem_size_str -from megatron.core.transformer.module import MegatronModule - +from megatron.core.inference.inference_request import DynamicInferenceRequest from megatron.core.inference.sampling_params import SamplingParams - - -def add_common_inference_args(parser: ArgumentParser) -> ArgumentParser: - """Common inference arguments.""" - - group = parser.add_argument_group(title='Common inference') - - group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.') - group.add_argument("--top_k", type=int, default=1, help='Top k sampling.') - group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.') - group.add_argument( - "--return-log-probs", - action='store_true', - default=False, - help='Return the log probabilities of the final output tokens', - ) - group.add_argument( - "--prompts", - metavar='N', - type=str, - nargs='+', - help='Input prompts with each prompt within quotes and seperated by space', - ) - group.add_argument( - "--num-tokens-to-prompt", - type=int, - nargs="+", - default=[64, 1024], - help='Number of tokens to use for simulated prompts. 
This should be a ' - 'space-separated pair of integers, and the generated prompt lengths will ' - 'be uniformly sampled within this range.', - ) - group.add_argument( - "--num-tokens-to-generate", - type=int, - default=30, - help='Number of tokens to generate for each prompt', - ) - group.add_argument( - "--num-tokens-from-file", - action='store_true', - default=False, - help='Use per-prompt num_tokens_to_generate from prompt file', - ) - group.add_argument( - "--top-n-logprobs", - type=int, - default=0, - help='Return the top n logprobs for the generated tokens and their corresponding token as a dictionary', - ) - group.add_argument( - "--incoming-requests-per-step", - type=int, default=None, - help="Add a deterministic number of requests per step. This arg is " - "prioritized over `--incoming-requests-per-sec` below (which is non-" - "deterministic). Note that the number of requests added per step is " - "additionally limited by the inference context's `max_requests`, " - "`max_tokens`, and KV buffer size.", - ) - group.add_argument( - "--incoming-requests-per-sec", - type=float, - default=100.0, - help="Simulated number of requests per second. Set to -1 to add all requests together.", - ) - group.add_argument( - "--incoming-requests-duration", - type=float, - default=10.0, - help="Total amount of time to simulate that requests are " - "arriving. Multiply this value with " - "`--incoming-requests-per-sec` to get the approximate " - "total number of requests. Set to -1 to add all requests together.", - ) - group.add_argument( - "--model-provider", - choices=["mamba", "gpt"], - default="gpt", - help="Model provider", - ) - group.add_argument( - "--skip-prompt-log-probs", - action='store_true', - default=False, - help='Skip prompt log probs.', - ) - group.add_argument( - "--stop-words", - metavar='WORD', - type=str, - nargs='+', - default=None, - help='Stop words to terminate generation. Each word should be quoted and ' - 'separated by space. 
Example: --stop-words "\\n\\n" "END" "###"', - ) - group.add_argument( - "--output-path", - type=str, - default=None, - help="Path to save generations as JSON", - ) - group.add_argument( - "--output-every-n-results", - type=int, - default=1, - help="To minimize the output file size of larger runs, only write the " - "results of every `n` requests.", - ) - group.add_argument( - "--prompt-file", - help='Jsonl file containing input prompts, where each item (i.e., line) ' - 'contains the field \'text\' where the value is the prompt. All other ' - 'fields within each item are ignored, and may be customized for each ' - 'application.', - ) - group.add_argument( - "--prompt-file-num-truncate", - type=int, - help='Number of samples to use from the loaded prompt file (see ' - '`--prompt-file` above). The first `--prompt-file-num-truncate` samples ' - 'will be used, in order.', - ) - group.add_argument( - "--use-flashinfer-fused-rope", - action='store_true', - default=False, - help='Use flashinfer fused rope implementation.', - ) - group.add_argument( - "--no-record-throughput", - action='store_false', - dest="record_throughput", - help="Disable throughput recording in --output-file" - - ) - - return parser +from megatron.core.transformer.module import MegatronModule +from megatron.training import get_args def get_default_sampling_params(termination_id: int = None): @@ -162,9 +27,10 @@ def get_default_sampling_params(termination_id: int = None): top_p=0.0, return_log_probs=False, num_tokens_to_generate=30, - termination_id = termination_id, + termination_id=termination_id, ) + def get_curr_time() -> float: """Get synchronized time across ranks.""" curr_time = torch.cuda.LongTensor([time.time_ns()]) @@ -188,7 +54,13 @@ class Request: tokenizer (Any): Tokenizer for tokenizing the prompt. 
""" - def __init__(self, prompt_text: str, time_offset: float, tokenizer: Any, sampling_params: SamplingParams = None): + def __init__( + self, + prompt_text: str, + time_offset: float, + tokenizer: Any, + sampling_params: SamplingParams = None, + ): self.prompt_text = prompt_text self.prompt_tokens = tokenizer.tokenize(prompt_text) self.output_text = None @@ -198,7 +70,11 @@ def __init__(self, prompt_text: str, time_offset: float, tokenizer: Any, samplin self.time_start = None self.time_end = None self.state = "not-started" - self.sampling_params: SamplingParams = sampling_params if sampling_params is not None else get_default_sampling_params(tokenizer.eod) + self.sampling_params: SamplingParams = ( + sampling_params + if sampling_params is not None + else get_default_sampling_params(tokenizer.eod) + ) self.sampling_params = copy.deepcopy(self.sampling_params) def __str__(self) -> str: @@ -225,10 +101,10 @@ def get_time_offsets( # if num_requests is not None: incoming_requests_duration = num_requests / incoming_requests_per_sec - incoming_requests_duration *= 2 # extra margin, to accomodate time sampling + incoming_requests_duration *= 2 # extra margin, to accomodate time sampling random.seed(seed) - + import simpy # Guard against this import in test case # Generate random time offsets. @@ -241,14 +117,14 @@ def arrival(r): env = simpy.Environment() env.process(arrival(incoming_requests_per_sec)) env.run(incoming_requests_duration) - + # Ensure at least a single request. if len(time_offsets) == 0: time_offsets = [0.0] # Ensure first time is 0. time_offsets = [to - time_offsets[0] for to in time_offsets] - + # Truncate to num_requests. 
assert len(time_offsets) >= num_requests time_offsets = time_offsets[:num_requests] @@ -257,7 +133,7 @@ def arrival(r): def get_cli_requests( - args: Namespace, tokenizer: Any, sampling_params: Optional[SamplingParams] = None + args: Namespace, tokenizer: Any, sampling_params: Optional[SamplingParams] = None ) -> list[Request]: # Get time offsets. @@ -269,7 +145,7 @@ def get_cli_requests( ) # Init requests. - requests = [Request(p, t, tokenizer, sampling_params) for p,t in zip(args.prompts, t_offsets)] + requests = [Request(p, t, tokenizer, sampling_params) for p, t in zip(args.prompts, t_offsets)] return requests @@ -289,18 +165,14 @@ def get_synthetic_requests( # Build prompts with expected lengths. assert ( len(args.num_tokens_to_prompt) == 2 - and - args.num_tokens_to_prompt[1] >= args.num_tokens_to_prompt[0] + and args.num_tokens_to_prompt[1] >= args.num_tokens_to_prompt[0] ) max_prompt_length = args.num_tokens_to_prompt[1] max_prompt_text = "hi " * max_prompt_length max_prompt_tokens = tokenizer.tokenize(max_prompt_text) - prompt_lengths = [ - random.randint(*args.num_tokens_to_prompt) - for _ in time_offsets - ] - prompt_tokens_list = [ max_prompt_tokens[:l] for l in prompt_lengths ] - prompt_texts = [ tokenizer.detokenize(tt) for tt in prompt_tokens_list ] + prompt_lengths = [random.randint(*args.num_tokens_to_prompt) for _ in time_offsets] + prompt_tokens_list = [max_prompt_tokens[:l] for l in prompt_lengths] + prompt_texts = [tokenizer.detokenize(tt) for tt in prompt_tokens_list] # Init requests. assert len(prompt_texts) == len(time_offsets) @@ -340,16 +212,15 @@ def get_requests_from_file( # Get time offsets. time_offsets: list[float] = get_time_offsets( - args.seed, - args.incoming_requests_per_step, - args.incoming_requests_per_sec, - len(prompts), + args.seed, args.incoming_requests_per_step, args.incoming_requests_per_sec, len(prompts) ) # Init requests. 
requests = [ Request(p, t, tokenizer, sp) - for p, t, sp in tqdm(zip(prompts, time_offsets, sampling_params_list), "init requests", total=len(prompts)) + for p, t, sp in tqdm( + zip(prompts, time_offsets, sampling_params_list), "init requests", total=len(prompts) + ) ] return requests @@ -411,19 +282,21 @@ def build_dynamic_engine_setup_prefix( # Prompt description prompt_src_str = ( - "cli" if args.prompts else - "file" if args.prompt_file else - f"synth({', '.join(map(str, args.num_tokens_to_prompt))})" + "cli" + if args.prompts + else ( + "file" + if args.prompt_file + else f"synth({', '.join(map(str, args.num_tokens_to_prompt))})" + ) ) request_str = ( - f"requests: {prompt_src_str}, " - f"n {len(requests):d}, g {args.num_tokens_to_generate:d}, " + f"requests: {prompt_src_str}, " f"n {len(requests):d}, g {args.num_tokens_to_generate:d}, " ) request_str += ( - f"dur {args.incoming_requests_duration:.1e} " - f"r/sec {args.incoming_requests_per_sec:.1e}" - if args.incoming_requests_per_step is None else - f"r/step {args.incoming_requests_per_step}" + f"dur {args.incoming_requests_duration:.1e} " f"r/sec {args.incoming_requests_per_sec:.1e}" + if args.incoming_requests_per_step is None + else f"r/step {args.incoming_requests_per_step}" ) # Buffer limits config @@ -433,14 +306,7 @@ def build_dynamic_engine_setup_prefix( f"[r {context.max_requests}, t {context.max_tokens}]" ) - parts = [ - get_model_size_str(model), - "dynamic", - cg_str, - uvm_str, - request_str, - buffer_limits_str, - ] + parts = [get_model_size_str(model), "dynamic", cg_str, uvm_str, request_str, buffer_limits_str] return " | ".join(parts) @@ -456,4 +322,4 @@ def get_global_peak_memory_stats_bytes() -> dict: t = torch.tensor([peak_alloc], device="cuda", dtype=torch.int64) torch.distributed.all_reduce(t, op=torch.distributed.ReduceOp.MAX) peak_alloc = int(t[0].item()) - return {"mem-max-allocated-bytes": peak_alloc} \ No newline at end of file + return {"mem-max-allocated-bytes": peak_alloc} diff 
--git a/examples/rl/README.md b/examples/rl/README.md index 34b6fafa517..9c2de3ec088 100644 --- a/examples/rl/README.md +++ b/examples/rl/README.md @@ -94,7 +94,7 @@ MODEL_OPTIONS="\ --ckpt-format torch \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-batch-size $MAX_INFERENCE_BS \ + --inference-max-requests $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --untie-embeddings-and-output-weights \ --disable-bias-linear \ diff --git a/examples/rl/model_configs/llama3p1_8b_instruct.sh b/examples/rl/model_configs/llama3p1_8b_instruct.sh index 24d285a6cf7..5398dad1a4e 100644 --- a/examples/rl/model_configs/llama3p1_8b_instruct.sh +++ b/examples/rl/model_configs/llama3p1_8b_instruct.sh @@ -77,7 +77,7 @@ MODEL_OPTIONS="\ --ckpt-format torch_dist \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-batch-size $MAX_INFERENCE_BS \ + --inference-max-requests $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --add-qkv-bias \ --normalization RMSNorm \ @@ -101,6 +101,7 @@ MODEL_OPTIONS="\ --max-position-embeddings 131072 \ --tokenizer-type HuggingFaceTokenizer \ --tokenizer-model unsloth/Meta-Llama-3.1-8B-Instruct \ + --legacy-tokenizer \ --langrl-inference-server-type "inplace_megatron_chat" \ --langrl-inference-server-conversation-template "unsloth/Meta-Llama-3.1-8B-Instruct" \ --lr 3e-7 \ diff --git a/examples/rl/model_configs/nemotron5_56b.sh b/examples/rl/model_configs/nemotron5_56b.sh index fd2cc4f7212..741cd054b73 100644 --- a/examples/rl/model_configs/nemotron5_56b.sh +++ b/examples/rl/model_configs/nemotron5_56b.sh @@ -58,7 +58,7 @@ MODEL_OPTIONS="\ --calculate-per-token-loss \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-batch-size $MAX_INFERENCE_BS \ + --inference-max-requests $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --fp8-format hybrid \ --fp8-amax-history-len 1 \ diff --git 
a/examples/rl/model_configs/nemotron5_8b.sh b/examples/rl/model_configs/nemotron5_8b.sh index 7b8947ae763..753d4e493a2 100644 --- a/examples/rl/model_configs/nemotron5_8b.sh +++ b/examples/rl/model_configs/nemotron5_8b.sh @@ -58,7 +58,7 @@ MODEL_OPTIONS="\ --calculate-per-token-loss \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-batch-size $MAX_INFERENCE_BS \ + --inference-max-requests $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --hybrid-override-pattern M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M- \ --spec megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \ diff --git a/examples/rl/model_configs/nemotron5p5_12b_H.sh b/examples/rl/model_configs/nemotron5p5_12b_H.sh index 9e97051e087..adbcc8d03f0 100644 --- a/examples/rl/model_configs/nemotron5p5_12b_H.sh +++ b/examples/rl/model_configs/nemotron5p5_12b_H.sh @@ -65,7 +65,7 @@ MODEL_OPTIONS="\ --calculate-per-token-loss \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-batch-size $MAX_INFERENCE_BS \ + --inference-max-requests $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --fp8-recipe blockwise \ --fp8-format e4m3 \ diff --git a/examples/rl/model_configs/nemotron6_3b_moe.sh b/examples/rl/model_configs/nemotron6_3b_moe.sh index eff4f6cf0b3..7d98f4eda63 100644 --- a/examples/rl/model_configs/nemotron6_3b_moe.sh +++ b/examples/rl/model_configs/nemotron6_3b_moe.sh @@ -85,7 +85,7 @@ MODEL_OPTIONS="\ --rl-importance-sampling-truncation-coef 10.0 \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-batch-size $MAX_INFERENCE_BS \ + --inference-max-requests $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --distributed-timeout-minutes 60 \ --use-mcore-models \ diff --git a/examples/rl/model_configs/qwen3_30b_a3b_moe.sh b/examples/rl/model_configs/qwen3_30b_a3b_moe.sh index 775a9587ba4..eb55ba35cc6 100644 --- 
a/examples/rl/model_configs/qwen3_30b_a3b_moe.sh +++ b/examples/rl/model_configs/qwen3_30b_a3b_moe.sh @@ -37,7 +37,7 @@ ENV_DEPENDENT="\ MODEL_OPTIONS=" --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ ---inference-max-batch-size $MAX_INFERENCE_BS \ +--inference-max-requests $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --no-use-tokenizer-model-from-checkpoint-args \ --seq-length 8192 \ diff --git a/examples/rl/model_configs/qwen3_32b.sh b/examples/rl/model_configs/qwen3_32b.sh index cd153a04f3c..c06c5f55b53 100644 --- a/examples/rl/model_configs/qwen3_32b.sh +++ b/examples/rl/model_configs/qwen3_32b.sh @@ -38,7 +38,7 @@ MODEL_OPTIONS="\ --ckpt-format torch_dist \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-batch-size $MAX_INFERENCE_BS \ + --inference-max-requests $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --untie-embeddings-and-output-weights \ --num-layers 64 \ diff --git a/examples/rl/model_configs/qwen3_4b.sh b/examples/rl/model_configs/qwen3_4b.sh index da238511fd3..6f6c6b6bf57 100644 --- a/examples/rl/model_configs/qwen3_4b.sh +++ b/examples/rl/model_configs/qwen3_4b.sh @@ -38,7 +38,7 @@ MODEL_OPTIONS="\ --ckpt-format torch_dist \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-batch-size $MAX_INFERENCE_BS \ + --inference-max-requests $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --num-layers 36 \ --hidden-size 2560 \ diff --git a/examples/rl/model_configs/qwen3_8b.sh b/examples/rl/model_configs/qwen3_8b.sh index 6758cd84c3d..54ff7385331 100644 --- a/examples/rl/model_configs/qwen3_8b.sh +++ b/examples/rl/model_configs/qwen3_8b.sh @@ -38,7 +38,7 @@ MODEL_OPTIONS="\ --ckpt-format torch_dist \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-batch-size $MAX_INFERENCE_BS \ + --inference-max-requests $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ 
--untie-embeddings-and-output-weights \ --num-layers 36 \ diff --git a/examples/rl/model_configs/qwen_2p5_32b.sh b/examples/rl/model_configs/qwen_2p5_32b.sh index d82972ba477..2a2a9ae2420 100644 --- a/examples/rl/model_configs/qwen_2p5_32b.sh +++ b/examples/rl/model_configs/qwen_2p5_32b.sh @@ -59,7 +59,7 @@ MODEL_OPTIONS="\ --ckpt-format torch_dist \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-batch-size $MAX_INFERENCE_BS \ + --inference-max-requests $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --untie-embeddings-and-output-weights \ --disable-bias-linear \ diff --git a/examples/rl/model_configs/qwen_2p5_3b.sh b/examples/rl/model_configs/qwen_2p5_3b.sh index 246afae6ad2..f3250f39ecc 100644 --- a/examples/rl/model_configs/qwen_2p5_3b.sh +++ b/examples/rl/model_configs/qwen_2p5_3b.sh @@ -62,7 +62,7 @@ MODEL_OPTIONS="\ --ckpt-format torch_dist \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-batch-size $MAX_INFERENCE_BS \ + --inference-max-requests $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --disable-bias-linear \ --add-qkv-bias \ diff --git a/examples/rl/model_configs/qwen_2p5_distill_7b.sh b/examples/rl/model_configs/qwen_2p5_distill_7b.sh index 149ac77965f..1438bca0726 100644 --- a/examples/rl/model_configs/qwen_2p5_distill_7b.sh +++ b/examples/rl/model_configs/qwen_2p5_distill_7b.sh @@ -44,7 +44,7 @@ MODEL_OPTIONS="\ --ckpt-format torch \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-batch-size $MAX_INFERENCE_BS \ + --inference-max-requests $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --untie-embeddings-and-output-weights \ --disable-bias-linear \ diff --git a/examples/rl/model_configs/qwen_2p5_math_7b.sh b/examples/rl/model_configs/qwen_2p5_math_7b.sh index 1d631fa80a5..b598bb127bd 100644 --- a/examples/rl/model_configs/qwen_2p5_math_7b.sh +++ 
b/examples/rl/model_configs/qwen_2p5_math_7b.sh @@ -58,7 +58,7 @@ MODEL_OPTIONS="\ --ckpt-format torch \ --seq-length $MAX_SEQ_LENGTH \ --inference-max-seq-length $MAX_SEQ_LENGTH \ - --inference-max-batch-size $MAX_INFERENCE_BS \ + --inference-max-requests $MAX_INFERENCE_BS \ --pretrained-checkpoint $CHECKPOINT \ --untie-embeddings-and-output-weights \ --disable-bias-linear \ diff --git a/megatron/core/inference/config.py b/megatron/core/inference/config.py new file mode 100644 index 00000000000..5970b4f14f6 --- /dev/null +++ b/megatron/core/inference/config.py @@ -0,0 +1,186 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +from dataclasses import dataclass +from typing import List, Optional, Tuple + +import torch + +from megatron.core.process_groups_config import ProcessGroupCollection +from megatron.core.transformer.module import MegatronModule +from megatron.core.utils import get_attr_wrapped_model + + +@dataclass +class MambaInferenceStateConfig: + """ + Config for initializing Mamba model inference state tensors. + + Note that we maintain separate metadata for decode, regular prefill, and + chunked prefill requests because the Mamba kernels do not yet support mixing + these. Once the kernels have been updated we can simplify this code. + """ + + layer_type_list: List[str] + """ + A list of strings that indicates the layer type (Mamba / Attention / MLP) for each layer. + See `megatron/core/ssm/mamba_hybrid_layer_allocation.py` for the list of symbols. 
+ """ + + mamba_conv_states_shape: Tuple[int] + """Mamba conv states shape per request.""" + + mamba_ssm_states_shape: Tuple[int] + """Mamba ssm states shape per request.""" + + @classmethod + def from_model(cls, model: MegatronModule) -> Optional["MambaInferenceStateConfig"]: + """Returns Mamba inference state config from the model if it is a hybrid model.""" + from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols + + decoder = get_attr_wrapped_model(model, "decoder") + layer_type_list = getattr(decoder, "layer_type_list", None) + if layer_type_list is not None and Symbols.MAMBA in layer_type_list: + (mamba_conv_states_shape, mamba_ssm_states_shape) = ( + decoder.mamba_state_shapes_per_request() + ) + return cls( + layer_type_list=layer_type_list, + mamba_conv_states_shape=mamba_conv_states_shape, + mamba_ssm_states_shape=mamba_ssm_states_shape, + ) + return None + + +@dataclass +class InferenceConfig: + """ + Config for inference. + + NOTE: Must remain mutually exclusive with the `TransformerConfig`. + """ + + # ================================= + # KV cache config + # ================================= + block_size_tokens: int = 256 + """Size of KV cache block size.""" + + buffer_size_gb: int = 20 + """ + Buffer size reserved on the GPU for the KV cache. + If `unified_memory_level` >= 1, then CPU memory is additionally utilized, resulting in a total + buffer size of `buffer_size_gb + paused_buffer_size_gb`. + """ + + paused_buffer_size_gb: Optional[int] = None + """ + Portion of buffer reserved for paused requests. Active requests are paused when there are not + enough active blocks available to continue generating a request. The total buffer size + (active + paused) depends on `unified_memory_level` (uvm): + - uvm 0: buffer_size_gb (paused buffer is inclusive) + - uvm 1: buffer_size_gb + paused_buffer_size_gb + """ + + max_requests: Optional[int] = None + """ + Max number of active requests to use for decode-only forward passes. 
+ This is primarily limited by the combination of `buffer_size_gb` and `max_sequence_length`. + """ + + max_tokens: Optional[int] = None + """ + Max number of tokens to use for forward passes. This is primarily limited by prefill activation + memory usage. (Defaults to 16384). + """ + + unified_memory_level: int = 0 + """ + Sets unified memory usage within the dynamic inference context. + The levels are: + 0) no unified memory (default) + 1) allocate `memory_buffer` in unified memory. + Eventually, additional levels will be included to control other tensors within the context. + """ + + offload_kv_cache: bool = False + """If True, offload KV cache during RL training.""" + + # ================================= + # CUDA graph config + # ================================= + num_cuda_graphs: Optional[int] = None + """ + Maximum number of cuda graphs to capture, where the cuda graph batch sizes range from 1 to + `max_requests`. Due to rounding, the actual number of cuda graphs may not equal this argument. + """ + + cuda_graph_mixed_prefill_count: Optional[int] = 16 + """ + The number of mixed prefill graphs to capture if mixed prefill/decode graphs are enabled. + """ + + use_cuda_graphs_for_non_decode_steps: bool = True + """ + Whether to use CUDA graphs for non-decode steps. + """ + + persist_cuda_graphs: bool = False + """ + Whether to persist CUDA graphs when the engine is suspended. + If False and `unified_memory_level` is 0, CUDA graphs are deleted on `suspend()` + and re-captured on `resume()` to save memory. 
+ """ + + # ================================= + # Model config + # ================================= + max_sequence_length: int = 2560 + """Max possible sequence length (prompt + output) that will occur.""" + + mamba_inference_state_config: Optional[MambaInferenceStateConfig] = None + """The Mamba inference state config if the model is a hybrid model.""" + + pg_collection: Optional[ProcessGroupCollection] = None + """A `ProcessGroupCollection` for distributed execution.""" + + use_flashinfer_fused_rope: Optional[bool] = False + """ + If True, use flashinfer's fused rope implementation. + If None, defaults to using flash-infer if available. + """ + + materialize_only_last_token_logits: bool = True + """ + Whether to only materialize logits for the last token. This should be set to False + if returning log probs. + """ + + # ================================= + # Engine config + # ================================= + enable_chunked_prefill: bool = False + """Whether to enable chunked prefill.""" + + # ================================= + # Logging config + # ================================= + track_paused_request_events: bool = False + """ + Whether to track paused request events. If True, `add_event_pause()` is called on + requests when they are paused during bookkeeping. + """ + + metrics_writer: Optional["WandbModule"] = None + """Wandb module for writing metrics.""" + + logging_step_interval: int = 0 + """ + The step interval at which to log inference metrics to wandb. + Defaults to 0, which means no logging. + """ + + request_metadata_types: Optional[List[Tuple[str, torch.dtype, bool]]] = None + """ + A list of the per-request metadata types to track. Each entry is a tuple + consisting of the string label, the target dtype, and whether to store the data on GPU. 
+ """ diff --git a/megatron/core/inference/contexts/attention_context/mamba_metadata.py b/megatron/core/inference/contexts/attention_context/mamba_metadata.py index 6cf45aeb9e1..13179483f59 100644 --- a/megatron/core/inference/contexts/attention_context/mamba_metadata.py +++ b/megatron/core/inference/contexts/attention_context/mamba_metadata.py @@ -1,36 +1,12 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -from dataclasses import dataclass -from typing import List, Optional, Tuple +from typing import Optional import torch from megatron.core.inference.batch_dimensions_utils import InferenceBatchDimensions -@dataclass -class MambaInferenceStateConfig: - """ - Config for initializing Mamba model inference state tensors. - - Note that we maintain separate metadata for decode, regular prefill, and - chunked prefill requests because the Mamba kernels do not yet support mixing - these. Once the kernels have been updated we can simplify this code. - """ - - layer_type_list: List[str] - """ - A list of strings that indicates the layer type (Mamba / Attention / MLP) for each layer. - See `megatron/core/ssm/mamba_hybrid_layer_allocation.py` for the list of symbols. - """ - - mamba_conv_states_shape: Tuple[int] - """Mamba conv states shape per request.""" - - mamba_ssm_states_shape: Tuple[int] - """Mamba ssm states shape per request.""" - - class MambaMetadata: """Manages the metadata tensors required for Mamba layers during inference.""" diff --git a/megatron/core/inference/contexts/base_context.py b/megatron/core/inference/contexts/base_context.py index 3dfec6de3ad..4f03726fe3d 100644 --- a/megatron/core/inference/contexts/base_context.py +++ b/megatron/core/inference/contexts/base_context.py @@ -2,6 +2,8 @@ import abc +from megatron.core.inference.config import InferenceConfig + class BaseInferenceContext(abc.ABC): """Base class for inference contexts. 
@@ -10,13 +12,11 @@ class BaseInferenceContext(abc.ABC): Extend this class for any future contexts types. """ - def __init__(self, materialize_only_last_token_logits: bool): + def __init__(self, inference_config: InferenceConfig): """ Args: - materialize_only_last_token_logits (bool): - If True, only the last-token logits will be extracted during decode """ - self.materialize_only_last_token_logits = materialize_only_last_token_logits + self.config = inference_config @abc.abstractmethod def is_static_batching(self) -> bool: diff --git a/megatron/core/inference/contexts/dynamic_context.py b/megatron/core/inference/contexts/dynamic_context.py index 5dc2d503097..e1b55363b37 100644 --- a/megatron/core/inference/contexts/dynamic_context.py +++ b/megatron/core/inference/contexts/dynamic_context.py @@ -4,22 +4,19 @@ import math import warnings from contextlib import nullcontext -from typing import TYPE_CHECKING, List, Optional, Sequence, Tuple +from typing import List, Optional, Sequence, Tuple -import torch -import torch.nn.functional as F -from packaging.version import Version as PkgVersion -from torch import Tensor +import torch # type: ignore +import torch.nn.functional as F # type: ignore +from torch import Tensor # type: ignore from megatron.core import parallel_state from megatron.core.inference.batch_dimensions_utils import ( CUDAGraphBatchDimensionBuilder, InferenceBatchDimensions, ) +from megatron.core.inference.config import InferenceConfig from megatron.core.inference.inference_request import DynamicInferenceRequest -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.unified_memory import ( UnifiedMemoryUnsupportedError, @@ -28,13 +25,13 @@ from megatron.core.inference.utils import tensor_swap from megatron.core.models.common.embeddings.rope_utils import apply_rotary_pos_emb from 
megatron.core.package_info import __version__ as mcore_version -from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.ssm.mamba_hybrid_layer_allocation import get_layer_maps_from_layer_type_list -from megatron.core.transformer import TransformerConfig +from megatron.core.transformer import MLATransformerConfig, TransformerConfig +from megatron.core.utils import deprecate_args from megatron.core.utils import divide as core_divide -from megatron.core.utils import get_attr_wrapped_model, get_pg_size, internal_api +from megatron.core.utils import get_pg_size, internal_api -from .attention_context.mamba_metadata import MambaInferenceStateConfig, MambaMetadata +from .attention_context.mamba_metadata import MambaMetadata from .attention_context.mha_metadata import GraphedMHAMetadata, NonGraphedMHAMetadata from .base_context import BaseInferenceContext from .dynamic_block_allocator import BlockAllocator @@ -45,14 +42,7 @@ triton_append_key_value_cache = None try: - from packaging.version import Version as PkgVersion - - HAVE_PACKAGING = True -except: - HAVE_PACKAGING = False - -try: - import flashinfer # pylint: disable=unused-import + import flashinfer # type: ignore # pylint: disable=unused-import HAVE_FLASHINFER = True except ImportError: @@ -66,16 +56,36 @@ except ImportError: HAVE_TORCH_MEMORY_SAVER = False -try: - import wandb # pylint: disable=unused-import - - HAVE_WANDB = True -except ImportError: - HAVE_WANDB = False - wandb = None - -if TYPE_CHECKING: - import wandb as WandbModule +DEPRECATED_ARGS = [ + "params_dtype", + "num_layers", + "kv_channels", + "num_attention_heads", + "max_sequence_length", + "buffer_size_gb", + "paused_buffer_size_gb", + "max_requests", + "max_tokens", + "block_size_tokens", + "tensor_model_parallel_size", + "pipeline_model_parallel_size", + "pg_collection", + "cache_mla_latent", + "kv_lora_rank", + "qk_pos_emb_head_dim", + "num_cuda_graphs", + "materialize_only_last_token_logits", + 
"mamba_inference_state_config", + "use_cuda_graphs_for_non_decode_steps", + "use_flashinfer_fused_rope", + "unified_memory_level", + "cuda_graph_max_tokens", + "cuda_graph_mixed_prefill_count", + "metrics_writer", + "request_metadata_types", + "persist_cuda_graphs", + "offload_kv_cache", +] class ContextOverflowError(Exception): @@ -213,130 +223,45 @@ class DynamicInferenceContext(BaseInferenceContext): given step, any unassigned blocks equate to unused space. Args: - params_dtype (torch.dtype): Dtype used for KV cache. - num_layers (int): Number of layers on this pipeline parallel rank. - kv_channels (int): Hidden dimension per attention head. - num_attention_heads (int): Number of attention heads. - max_sequence_length (int): Max possible sequence length (prompt + output) - that will occur. - buffer_size_gb (float): Buffer size reserved on the GPU for the KV cache. - if `unified_memory_level` >= 1, then CPU memory is additionally - utilized, resulting in a total buffer size of `buffer_size_gb + - paused_buffer_size_gb`. - paused_buffer_size_gb (float | None): Portion of buffer reserved for - paused requests. Active requests are paused when there are not enough - active blocks available to continue generating a request. The total - buffer size (active + paused) depends on `unified_memory_level` (uvm): - - uvm 0: buffer_size_gb (paused buffer is inclusive) - - uvm 1: buffer_size_gb + paused_buffer_size_gb - max_requests (int): Max number of active requests to use for - decode-only forward passes. This value is primarily limited by the - combination of `buffer_size_gb` and `max_sequence_length`. - max_tokens (int): Max number of tokens to use for forward passes. This is - primarily limited by prefill activation memory usage. (Defaults to - 16384). - block_size_tokens (int): Size of KV cache block size. - tensor_model_parallel_size (Optional[int]): Tensor model parallel size. 
- num_cuda_graphs (Optional[int]): Maximum number of cuda graphs to capture, - where the cuda graph batch sizes range from 1 to `max_requests` - (as computed below). Due to rounding, the actual number of cuda graphs - may not equal this argument. - materialize_only_last_token_logits (Optional[bool]): Whether to only - materialize logits for the last token. This should be set to False - if returning log probs. - mamba_inference_state_config (Optional[MambaInferenceStateConfig]): The Mamba - inference state config if the model is a hybrid model. - use_cuda_graphs_for_non_decode_steps (bool): If True, use cuda graphs for non-decode - engine steps. - unified_memory_level (Optional[int]): Set unified memory usage within the - dynamic inference context. The levels are: 0) no unified memory, 1) - allocate `memory_buffer` in unified memory. Eventually, additional - levels will be included to control other tensors within the context. - use_flashinfer_fused_rope (bool): If True, use flashinfer's fused rope implementation. - If None, defaults to using flash-infer if available. - metrics_writer (Optional['WandbModule']): Wandb module for writing metrics. - request_metadata_types (Optional[List[Tuple[str, torch.dtype, bool]]]): A list of the - per-request metadata types to track. Each entry is a tuple consisting of the string - label, the target dtype, and whether to store the data on GPU. + model_config (TransformerConfig): Model config. + inference_config (InferenceConfig): Inference config. 
""" DEFAULT_MAX_TOKENS = 16384 TOKEN_ROUNDER = 64 REQUEST_ROUNDER = 4 - def __init__( - self, - *, - params_dtype: torch.dtype, - num_layers: int, - kv_channels: int, - num_attention_heads: int, - max_sequence_length: int, - buffer_size_gb: float, - paused_buffer_size_gb: float | None = None, - max_requests: int = None, - max_tokens: int = DEFAULT_MAX_TOKENS, - block_size_tokens: int = 256, - tensor_model_parallel_size: Optional[int] = None, - pipeline_model_parallel_size: Optional[int] = None, - pg_collection: Optional[ProcessGroupCollection] = None, - cache_mla_latent: bool = False, - kv_lora_rank: Optional[int] = None, - qk_pos_emb_head_dim: Optional[int] = None, - num_cuda_graphs: Optional[int] = None, - materialize_only_last_token_logits: Optional[bool] = True, - mamba_inference_state_config: Optional[MambaInferenceStateConfig] = None, - use_cuda_graphs_for_non_decode_steps: bool = True, - use_flashinfer_fused_rope: bool = False, - unified_memory_level: Optional[int] = 0, - cuda_graph_max_tokens: Optional[int] = None, - cuda_graph_mixed_prefill_count: Optional[int] = 16, - metrics_writer: Optional['WandbModule'] = None, - request_metadata_types: Optional[List[Tuple[str, torch.dtype, bool]]] = None, - persist_cuda_graphs: Optional[bool] = False, - offload_kv_cache: Optional[bool] = False, - ): - super().__init__(materialize_only_last_token_logits=materialize_only_last_token_logits) - - self.cache_mla_latent = cache_mla_latent + @deprecate_args( + *DEPRECATED_ARGS, + message=( + "Argument `{name}` has been deprecated. 
" + "Only pass `model_config` and `inference_config`" + ), + ) + def __init__(self, model_config: TransformerConfig, inference_config: InferenceConfig): + super().__init__(inference_config=inference_config) + + self.cache_mla_latent = ( + isinstance(model_config, MLATransformerConfig) and model_config.cache_mla_latents + ) if self.cache_mla_latent: assert ( - block_size_tokens == 64 + inference_config.block_size_tokens == 64 ), "Flash MLA requires a block size of 64. Set --inference-dynamic-batching-block-size 64 to fix this assert" - # give deprecated args warning for cuda_graph_max_tokens - if cuda_graph_max_tokens is not None: - warnings.warn( - "`cuda_graph_max_tokens` is deprecated and will be removed in a future release. " - "The context now automatically sets the max tokens for cuda graphs based on " - "`max_requests`.", - DeprecationWarning, - ) - - self.metrics_writer = metrics_writer - # Per partition num heads and hidden size. - projection_size = kv_channels * num_attention_heads - if tensor_model_parallel_size is None: - tp_size = ( - get_pg_size(pg_collection.tp) - if pg_collection is not None - else parallel_state.get_tensor_model_parallel_world_size() - ) + num_attention_heads = model_config.num_query_groups or model_config.num_attention_heads + projection_size = model_config.kv_channels * num_attention_heads + pg_collection = inference_config.pg_collection + if pg_collection is not None: + tp_size = get_pg_size(pg_collection.tp) + pp_size = get_pg_size(pg_collection.pp) else: - tp_size = tensor_model_parallel_size + tp_size = model_config.tensor_model_parallel_size + pp_size = model_config.pipeline_model_parallel_size self.hidden_size_per_attention_head = core_divide(projection_size, num_attention_heads) self.num_attention_heads_per_partition = core_divide(num_attention_heads, tp_size) - if pipeline_model_parallel_size is None: - pp_size = ( - get_pg_size(pg_collection.pp) - if pg_collection is not None - else 
parallel_state.get_pipeline_model_parallel_world_size() - ) - else: - pp_size = pipeline_model_parallel_size - # Cache the PP group we should use for PP collectives inside the context. # If the model provides a pg_collection with a pp group, prefer it. # Otherwise: @@ -357,6 +282,7 @@ def __init__( self.expert_model_parallel_group = None # Mamba states. + mamba_inference_state_config = inference_config.mamba_inference_state_config self.is_hybrid_model = mamba_inference_state_config is not None if self.is_hybrid_model: mamba_conv_states_shape = mamba_inference_state_config.mamba_conv_states_shape @@ -381,7 +307,7 @@ def __init__( self.layer_map = attention_layer_map | mamba_layer_map else: # The layer map is the identity function for pure Transformer models. - self.num_attention_layers = num_layers + self.num_attention_layers = model_config.num_layers // pp_size self.num_mamba_layers = 0 (self.mamba_conv_states_shape, self.mamba_ssm_states_shape) = (None, None) self.layer_map = {i: i for i in range(self.num_attention_layers)} @@ -392,11 +318,11 @@ def __init__( ) # Block size tokens, bytes. - dtype_size_bytes = params_dtype.itemsize - self.block_size_tokens = block_size_tokens + dtype_size_bytes = model_config.params_dtype.itemsize + self.block_size_tokens = inference_config.block_size_tokens if self.cache_mla_latent: # one vector c_t (rank) + optional RoPE phase slice - self.kv_reduced_dim = kv_lora_rank + qk_pos_emb_head_dim + self.kv_reduced_dim = model_config.kv_lora_rank + model_config.qk_pos_emb_head_dim self.block_size_bytes = ( dtype_size_bytes * self.num_attention_layers @@ -422,9 +348,9 @@ def __init__( mamba_states_memory_per_request *= dtype_size_bytes # Unified memory. 
- self.unified_memory_level = unified_memory_level - self.persist_cuda_graphs = persist_cuda_graphs - if unified_memory_level > 0: + self.unified_memory_level = inference_config.unified_memory_level + self.persist_cuda_graphs = inference_config.persist_cuda_graphs + if self.unified_memory_level > 0: try: self.unified_memory_mempool = create_unified_mempool() except UnifiedMemoryUnsupportedError: @@ -435,9 +361,11 @@ def __init__( self.unified_memory_level = 0 # Initialize block allocator. - buffer_size_bytes = int(buffer_size_gb * 1024**3) + buffer_size_bytes = int(inference_config.buffer_size_gb * 1024**3) paused_buffer_size_bytes = ( - 0 if paused_buffer_size_gb is None else int(paused_buffer_size_gb * 1024**3) + 0 + if inference_config.paused_buffer_size_gb is None + else int(inference_config.paused_buffer_size_gb * 1024**3) ) # TODO: Add parameter to control fraction of memory assigned to KV cache # versus Mamba state. @@ -453,14 +381,17 @@ def __init__( # (i.e., divergence in the scheduling behavior). if pp_size > 1: block_count_tensor = torch.tensor( - block_count, dtype=torch.int32, device=torch.cuda.current_device() + (block_count, paused_block_count), + dtype=torch.int32, + device=torch.cuda.current_device(), ) torch.distributed.all_reduce( block_count_tensor, op=torch.distributed.ReduceOp.MIN, group=self.pipeline_parallel_group, ) - block_count = block_count_tensor.item() + block_count = block_count_tensor[0].item() + paused_block_count = block_count_tensor[1].item() self.block_allocator = BlockAllocator( context=self, @@ -471,13 +402,14 @@ def __init__( ) # Track request metadata. + request_metadata_types = inference_config.request_metadata_types if request_metadata_types is None: request_metadata_types = DynamicInferenceRequest.get_metadata_types() self.request_metadata_types = request_metadata_types # Initialize context state. 
- self.params_dtype = params_dtype - self.max_sequence_length = max_sequence_length + self.params_dtype = model_config.params_dtype + self.max_sequence_length = inference_config.max_sequence_length # Request and token counts. self.total_request_count = 0 @@ -497,16 +429,16 @@ def __init__( self.max_kv_block_count = math.ceil(self.max_sequence_length / self.block_size_tokens) # Set max_requests, max_tokens. - if max_requests is None: + if inference_config.max_requests is None: # Maximize compute utilization by defaulting to 1 block per request. self.max_requests = self.block_allocator.total_count - 1 # -1 for dummy block self.max_requests = self.max_requests // tp_size * tp_size self.max_requests = self.max_requests // self.REQUEST_ROUNDER * self.REQUEST_ROUNDER else: # User can control request overflow via max_requests. - self.max_requests = max_requests + self.max_requests = inference_config.max_requests - self.max_tokens = max_tokens or self.DEFAULT_MAX_TOKENS + self.max_tokens = inference_config.max_tokens or self.DEFAULT_MAX_TOKENS assert self.max_tokens >= self.max_requests, ( f"max_tokens ({self.max_tokens}) must be >= " @@ -538,37 +470,39 @@ def __init__( ) # CUDA graph config list + self.use_cuda_graphs_for_non_decode_steps = ( + inference_config.use_cuda_graphs_for_non_decode_steps + ) self.cuda_graph_batch_dimensions_list, self.cuda_graph_token_counts = ( CUDAGraphBatchDimensionBuilder.generate_cuda_graph_batch_dimensions_list( tp_size=tp_size, - num_cuda_graphs=num_cuda_graphs, + num_cuda_graphs=inference_config.num_cuda_graphs, cuda_graph_max_tokens=self.max_requests, - cuda_graph_mixed_prefill_count=cuda_graph_mixed_prefill_count, + cuda_graph_mixed_prefill_count=inference_config.cuda_graph_mixed_prefill_count, max_requests=self.max_requests, max_tokens=self.max_tokens, max_sequence_length=self.max_sequence_length, - use_cuda_graphs_for_non_decode_steps=use_cuda_graphs_for_non_decode_steps, + 
use_cuda_graphs_for_non_decode_steps=self.use_cuda_graphs_for_non_decode_steps, ) ) # Whether to offload the KV cache. Determines where the KV cache is allocated within memory. - self.offload_kv_cache = offload_kv_cache + self.offload_kv_cache = inference_config.offload_kv_cache assert not ( self.offload_kv_cache and self.unified_memory_level ), "The KV cache should not be instantiated in unified memory when it is offloaded during training." self._using_cuda_graph_this_step = False - self.use_cuda_graphs_for_non_decode_steps = use_cuda_graphs_for_non_decode_steps # Deal with chunked prefill self.chunked_prefill_request_id = -1 self.has_explicit_chunked_prefill_req = False # FlashInfer. - if use_flashinfer_fused_rope is True: + if inference_config.use_flashinfer_fused_rope is True: assert HAVE_FLASHINFER, "flashinfer is not installed" - elif use_flashinfer_fused_rope is None: - use_flashinfer_fused_rope = HAVE_FLASHINFER - self.use_flashinfer_fused_rope = use_flashinfer_fused_rope + elif inference_config.use_flashinfer_fused_rope is None: + inference_config.use_flashinfer_fused_rope = HAVE_FLASHINFER + self.use_flashinfer_fused_rope = inference_config.use_flashinfer_fused_rope # Allocate GPU state. 
self.is_tensor_state_allocated = False @@ -756,14 +690,7 @@ def deallocate_all_tensors(self): @classmethod def round_up_tokens(cls, value, tp_size=None): - """Round up to nearest multiple of `TOKEN_ROUNDER` (above) that is also divisible by tensor model parallel size.""" - if not HAVE_PACKAGING: - raise ImportError( - "`packaging` is required for this functionality, please install it with `pip install packaging`" - ) - if PkgVersion(mcore_version) < PkgVersion("0.13"): - return cls.round_up(value) - + """Round up to nearest multiple of `TOKEN_ROUNDER` that is also divisible by tensor model parallel size.""" # Make sure divisible by TP size if tp_size is None: # Check if parallel state is initialized before trying to get TP size @@ -775,72 +702,9 @@ def round_up_tokens(cls, value, tp_size=None): return token_rounder * int(math.ceil(int(value) / token_rounder)) - @classmethod - def from_config( - cls, - inference_config: InferenceWrapperConfig, - model, - max_batch_size: int, - buffer_size_gb: float = 40, - num_cuda_graphs: int = None, - mamba_inference_state_config: Optional[MambaInferenceStateConfig] = None, - unified_memory_level: int = 0, - ): - """ - Instantiate a `DynamicInferenceContext` from a `TransformerConfig` and an `InferenceWrapperConfig`. - """ - # TODO: Add other necessary configs from inference_config - - # Max sequence length. - position_embedding_type = get_attr_wrapped_model(model, "position_embedding_type") - model_max_seq_len = get_attr_wrapped_model(model, "max_sequence_length") - inf_max_seq_len = inference_config.inference_max_seq_length - - if position_embedding_type == "learned_absolute": - # When using absolute position embeddings, it is critical that the - # context's `max_sequence_length` is less than or equal to the model's - # `max_sequence_length`. Otherwise, the context's `position_ids` will - # contain ids greater than the dimension of the position embedding - # tensor, which will result in an index error. 
- if inf_max_seq_len: - max_sequence_length = min(model_max_seq_len, inf_max_seq_len) - else: - max_sequence_length = model_max_seq_len - assert max_batch_size <= model_max_seq_len - else: - max_sequence_length = ( - inference_config.inference_max_seq_length or model_config.max_sequence_length - ) - max_sequence_length = max(max_sequence_length, max_batch_size) - - # Context. - model_config = model.config - return cls( - params_dtype=inference_config.params_dtype, - num_layers=model_config.num_layers // model_config.pipeline_model_parallel_size, - kv_channels=model_config.kv_channels, - num_attention_heads=model_config.num_query_groups, - tensor_model_parallel_size=model_config.tensor_model_parallel_size, - pipeline_model_parallel_size=model_config.pipeline_model_parallel_size, - max_sequence_length=max_sequence_length, - buffer_size_gb=buffer_size_gb, - materialize_only_last_token_logits=False, - num_cuda_graphs=num_cuda_graphs, - use_flashinfer_fused_rope=None, - mamba_inference_state_config=mamba_inference_state_config, - unified_memory_level=unified_memory_level, - ) - @classmethod def round_up_requests(cls, value, tp_size=None): - """Round up to nearest multiple of `REQUEST_ROUNDER` (above) that is also divisible by tensor model parallel size.""" - if not HAVE_PACKAGING: - raise ImportError( - "`packaging` is required for this functionality, please install it with `pip install packaging`" - ) - if PkgVersion(mcore_version) < PkgVersion("0.13"): - return cls.round_up(value) - + """Round up to nearest multiple of `REQUEST_ROUNDER` that is also divisible by tensor model parallel size.""" # Make sure divisible by TP size if tp_size is None: # Check if parallel state is initialized before trying to get TP size @@ -852,16 +716,6 @@ def round_up_requests(cls, value, tp_size=None): return request_rounder * int(math.ceil(int(value) / request_rounder)) - @classmethod - def round_up(cls, value): - """Deprecated in favor of round_up_tokens and round_up_requests.""" - 
warnings.warn( - "`round_up` is deprecated in favor of `round_up_tokens` or `round_up_requests` " - "and will be removed in `megatron-core` 0.14." - ) - ROUNDER = getattr(cls, "ROUNDER", 64) - return ROUNDER * int(math.ceil(int(value) / ROUNDER)) - def is_static_batching(self) -> bool: """Is static batching? False.""" return False @@ -882,6 +736,7 @@ def has_unfinished_requests(self) -> bool: def cu_query_lengths(self) -> Tuple[Tensor, int]: """Cumulative query sequence lengths.""" + assert self.active_attn_metadata is not None return ( self.active_attn_metadata["mha_metadata"].state_data["cu_query_seq_lengths"], self.active_attn_metadata["mha_metadata"].state_data["max_seqlen_q"], @@ -889,6 +744,7 @@ def cu_query_lengths(self) -> Tuple[Tensor, int]: def cu_kv_lengths(self) -> Tuple[Tensor, Tensor, int]: """Cumulative key/value sequence lengths.""" + assert self.active_attn_metadata is not None return ( self.active_attn_metadata["mha_metadata"].state_data["cu_kv_seq_lengths"], self.active_attn_metadata["mha_metadata"].state_data["kv_seq_lengths"], @@ -958,18 +814,20 @@ def append_key_value_cache(self, layer_number: int, key: Tensor, value: Tensor) : self.padded_active_token_count ] - def key_value_cache(self, layer_number: int) -> Tuple[Tensor, Tensor]: + def key_value_cache(self, layer_number: int) -> Tuple[Tensor, Optional[Tensor], Tensor]: """Read from KV cache. Args: layer_number (int): Layer number. Return: - (Tuple[Tensor, Tensor]) The key and value pointer tensors that point - to blocks within the block-level memory buffer. + (Tuple[Tensor, Tensor, Tensor]) The key and value pointer tensors that point + to blocks within the block-level memory buffer as well as the block table. 
""" attention_layer_number = self.layer_map[layer_number - 1] + assert self.active_attn_metadata is not None + if self.cache_mla_latent: return ( self.memory_buffer[attention_layer_number], @@ -1386,9 +1244,9 @@ def initialize_attention_state( ] = 0 self.active_attn_metadata = ( - self.graph_attn_metadata + self.graph_attn_metadata # type: ignore[assignment] if self.using_cuda_graph_this_step() - else self.non_graph_attn_metadata + else self.non_graph_attn_metadata # type: ignore[assignment] ) # Update cu_query_seq_lengths, max_seqlen_q. @@ -1413,6 +1271,7 @@ def initialize_attention_state( has_explicit_chunked_prefill_req=False, ) + assert self.active_attn_metadata is not None self.active_attn_metadata["mha_metadata"].update( request_query_lengths=query_lengths_view, request_kv_length_offsets=request_kv_length_offsets_view, @@ -1545,7 +1404,7 @@ def last_token_logits(self, logits: Tensor) -> Tensor: return last_token_logits - def check_availability(self, req: DynamicInferenceRequest) -> (bool, bool, bool): + def check_availability(self, req: DynamicInferenceRequest) -> Tuple[bool, bool, bool]: """ Check if the request can be added to the context. """ @@ -1784,7 +1643,7 @@ def resume_paused_requests( active_request_count: int, newly_paused_request_ids: torch.Tensor, next_tokens: torch.Tensor, - ) -> tuple[int, int, torch.Tensor]: + ) -> tuple[int, torch.Tensor]: """Resume as many paused requests as we have space for in the active buffer. Args: @@ -1863,7 +1722,7 @@ def resume_paused_requests( def evict_overflow_paused_requests( self, active_request_count: int, next_tokens: torch.Tensor - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> Optional[tuple[torch.Tensor, torch.Tensor]]: """Evict requests that overflow the paused buffer. 
Args: diff --git a/megatron/core/inference/contexts/static_context.py b/megatron/core/inference/contexts/static_context.py index 8c83d2f09b3..a15b33c414a 100644 --- a/megatron/core/inference/contexts/static_context.py +++ b/megatron/core/inference/contexts/static_context.py @@ -1,8 +1,6 @@ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) +from megatron.core.inference.config import InferenceConfig from .base_context import BaseInferenceContext @@ -19,7 +17,8 @@ class StaticInferenceContext(BaseInferenceContext): def __init__( self, max_batch_size: int, max_sequence_length: int, use_flashinfer_fused_rope: bool = None ): - super().__init__(materialize_only_last_token_logits=True) + config = InferenceConfig(materialize_only_last_token_logits=True) + super().__init__(inference_config=config) self.max_sequence_length = max_sequence_length self.max_batch_size = max_batch_size self.sequence_len_offset = 0 @@ -27,13 +26,6 @@ def __init__( self.key_value_memory_dict = {} self.decode_mode = False - @classmethod - def from_config(cls, config: InferenceWrapperConfig) -> "StaticInferenceContext": - """Initialize context from a config.""" - max_batch_size = config.inference_max_requests - max_sequence_length = config.inference_max_seq_length - return cls(max_batch_size, max_sequence_length) - def swap_key_value_dict(self, batch_idx): "swap between batches" if len(self.key_value_memory_dict) == 0: diff --git a/megatron/core/inference/engines/dynamic_engine.py b/megatron/core/inference/engines/dynamic_engine.py index 29257b6f982..134ce3b124d 100644 --- a/megatron/core/inference/engines/dynamic_engine.py +++ b/megatron/core/inference/engines/dynamic_engine.py @@ -42,6 +42,7 @@ from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.transformer.cuda_graphs import delete_cuda_graphs from megatron.core.utils import ( + 
deprecate_args, experimental_api, get_asyncio_loop, get_pg_rank, @@ -89,6 +90,14 @@ except ImportError: HAVE_PSUTIL = False +DEPRECATED_ARGS = [ + "enable_cuda_graph", + "random_seed", + "track_paused_request_events", + "enable_chunked_prefill", + "inference_logging_step_interval", + "pg_collection", +] from megatron.core.inference.contexts.dynamic_context import HAVE_TORCH_MEMORY_SAVER if HAVE_TORCH_MEMORY_SAVER: @@ -136,24 +145,13 @@ class DynamicInferenceEngine(AbstractEngine): outputs and detokenizer the output tokens. inference_context (DynamicInferenceContext): Context for managing in-flight batching and a dynamic block-level KV cache (similar to paged attention). - random_seed (Optional[int]): Use a random seed if you want deterministic - results. Defaults to None. - inference_logging_step_interval (int): The step interval at which to log - inference metrics to wandb. Defaults to 0, which means no logging. """ - def __init__( - self, - controller: TextGenerationController, - context: DynamicInferenceContext, - enable_cuda_graph: Optional[bool] = None, - random_seed: Optional[int] = None, - *, - track_paused_request_events: bool = False, - enable_chunked_prefill: bool = True, - inference_logging_step_interval: int = 0, - pg_collection: Optional[ProcessGroupCollection] = None, - ): + @deprecate_args( + *DEPRECATED_ARGS, + message="Argument `{name}` has been deprecated. Only pass `controller` and `context`", + ) + def __init__(self, controller: TextGenerationController, context: DynamicInferenceContext): assert isinstance( controller, TextGenerationController @@ -161,40 +159,28 @@ def __init__( assert isinstance( context, DynamicInferenceContext ), f"context must be a DynamicInferenceContext, got {type(context)}" - assert isinstance(random_seed, int), f"random_seed must be an int, got {type(random_seed)}" - - # Deprecate `enable_cuda_graph`. 
- if enable_cuda_graph is not None: - warnings.warn( - "The `enable_cuda_graph` argument is deprecated and will be " - "removed in `megatron-core 0.15`. `enable_cuda_graph` is now " - "read directly from the transformer config object." - ) - self.enable_cuda_graph = enable_cuda_graph - else: - self.enable_cuda_graph = ( - controller.inference_wrapped_model.model.config.enable_cuda_graph - ) - if pg_collection is not None: - self.pg_collection = pg_collection + model_config = controller.inference_wrapped_model.model.config + inference_config = context.config + + if inference_config.pg_collection is not None: + self.pg_collection = inference_config.pg_collection else: self.pg_collection = ProcessGroupCollection.use_mpu_process_groups() # Initialization options. self.controller = controller self.context = context - self.random_seed = random_seed - self.track_paused_request_events = track_paused_request_events - self.enable_chunked_prefill = enable_chunked_prefill - self.inference_logging_step_interval = inference_logging_step_interval - self.unified_memory_level = context.unified_memory_level - self.persist_cuda_graphs = context.persist_cuda_graphs - - if enable_cuda_graph is not None: - self.cuda_graph_impl = "local" if enable_cuda_graph else "none" - else: - self.cuda_graph_impl = controller.inference_wrapped_model.model.config.cuda_graph_impl + self.track_paused_request_events = inference_config.track_paused_request_events + self.enable_chunked_prefill = inference_config.enable_chunked_prefill + self.metrics_writer = inference_config.metrics_writer + self.logging_step_interval = inference_config.logging_step_interval + self.unified_memory_level = inference_config.unified_memory_level + self.persist_cuda_graphs = inference_config.persist_cuda_graphs + self.materialize_only_last_token_logits = ( + inference_config.materialize_only_last_token_logits + ) + self.cuda_graph_impl = model_config.cuda_graph_impl # Initialize engine. 
self.reset() @@ -205,12 +191,12 @@ def __init__( ) # Configure wandb to use separate step counter for inference metrics (only once) - if self.inference_logging_step_interval > 0 and self.context.metrics_writer is not None: + if self.logging_step_interval > 0 and self.metrics_writer is not None: logging.info( f"\033[1;93m[INFERENCE]\033[0m " f"\033[1;95mLogging inference metrics to wandb (rank {self.rank})\033[0m" ) - if HAVE_WANDB and self.context.metrics_writer.__name__ == "wandb": + if HAVE_WANDB and self.metrics_writer.__name__ == "wandb": # Make all inference/* metrics use inference_step as their x-axis # This allows inference and training to have independent step counters context.metrics_writer.define_metric( @@ -288,8 +274,6 @@ def create_cuda_graphs(self, reset_context: bool = True): context = self.context controller = self.controller - config = controller.inference_wrapped_model.inference_wrapper_config - time_start = time.time() mem_stats_start = torch.cuda.memory_stats() @@ -735,7 +719,7 @@ def _add_request( request.sampling_params.return_log_probs and not request.sampling_params.skip_prompt_log_probs ): - assert not self.context.materialize_only_last_token_logits, ( + assert not self.materialize_only_last_token_logits, ( "Prompt log probs cannot be calculated if only last token logits are materialized. " "Set materialize_only_last_token_logits to False in DynamicInferenceContext " "or skip_prompt_log_probs to True in SamplingParams." 
@@ -928,7 +912,7 @@ def post_process_requests( # For chunked prefill with materialize_only_last_token_logits, discard intermediate log probs if ( request_id == self.context.chunked_prefill_request_id - and self.context.materialize_only_last_token_logits + and self.materialize_only_last_token_logits ): request.prompt_log_probs = [] request.generated_log_probs = [] @@ -1208,10 +1192,10 @@ async def async_forward(self) -> Tuple[Dict, Dict, float, int]: range_pop() if ( - self.inference_logging_step_interval > 0 + self.logging_step_interval > 0 and self.step_count > 0 - and self.step_count % self.inference_logging_step_interval == 0 - and self.context.metrics_writer is not None + and self.step_count % self.logging_step_interval == 0 + and self.metrics_writer is not None ): kvcache_util_stats = self.context.get_kvcache_utilization_stats() else: @@ -1344,18 +1328,13 @@ async def async_bookkeep( else: metrics[f'inference/{key}'] = value - if HAVE_WANDB and self.context.metrics_writer.__name__ == "wandb": - self.context.metrics_writer.log(metrics, commit=True) + if HAVE_WANDB and self.metrics_writer.__name__ == "wandb": + self.metrics_writer.log(metrics, commit=True) else: - raise ValueError( - f"Unsupported metrics writer type: {type(self.context.metrics_writer)}" - ) + raise ValueError(f"Unsupported metrics writer type: {type(self.metrics_writer)}") # Print context state. 
- if ( - self.inference_logging_step_interval > 0 - and step_count % self.inference_logging_step_interval == 0 - ): + if self.logging_step_interval > 0 and step_count % self.logging_step_interval == 0: mem = torch.cuda.memory_stats() step_type = "decode" if context_state["is_decode_only"] else "non-decode" output_str = ( diff --git a/megatron/core/inference/engines/static_engine.py b/megatron/core/inference/engines/static_engine.py index d4c61965d2b..5ae37d5967e 100644 --- a/megatron/core/inference/engines/static_engine.py +++ b/megatron/core/inference/engines/static_engine.py @@ -8,7 +8,8 @@ import torch from megatron.core.inference.async_stream import AsyncStream -from megatron.core.inference.contexts.dynamic_context import DynamicInferenceContext +from megatron.core.inference.config import InferenceConfig, MambaInferenceStateConfig +from megatron.core.inference.contexts import DynamicInferenceContext, StaticInferenceContext from megatron.core.inference.engines.abstract_engine import AbstractEngine from megatron.core.inference.engines.dynamic_engine import DynamicInferenceEngine from megatron.core.inference.inference_request import InferenceRequest @@ -17,7 +18,7 @@ from megatron.core.inference.text_generation_controllers.text_generation_controller import ( TextGenerationController, ) -from megatron.core.utils import get_asyncio_loop, get_mamba_inference_state_config_from_model +from megatron.core.utils import get_asyncio_loop try: from tqdm import tqdm @@ -42,8 +43,6 @@ class StaticInferenceEngine(AbstractEngine): controller that will be used to define how to preprocess prompts, generate outputs and detokenizer the output tokens. max_batch_size (int, optional): The maximum number of requests to process at once. - Will be set from the InferenceWrapperConfig in `text_generation_controller` by - default. random_seed (int, optional): Use a random seed if you want deterministic results. Defaults to None. 
""" @@ -69,53 +68,55 @@ def __init__( DeprecationWarning, ) - inference_wrapper_config = ( - text_generation_controller.inference_wrapped_model.inference_wrapper_config - ) self.controller = text_generation_controller + self.inference_wrapped_model = self.controller.inference_wrapped_model + self.config = self.inference_wrapped_model.config self.random_seed = random_seed or 1234 - inference_max_batch_size = inference_wrapper_config.inference_max_requests + # Store original context in case we need to fall back to legacy static engine + original_context = self.inference_wrapped_model.inference_context + assert original_context is not None + assert isinstance(original_context, StaticInferenceContext) + if max_batch_size is None: - max_batch_size = inference_max_batch_size - elif max_batch_size > inference_max_batch_size: + max_batch_size = original_context.max_batch_size + elif max_batch_size > original_context.max_batch_size: warnings.warn( f"Engine `max_batch_size` ({max_batch_size}) > " - f"`inference_max_requests` in `inference_wrapper_config` " - f"({inference_max_batch_size}); setting `max_batch_size` to " - f"{inference_max_batch_size}", + f"`context.max_batch_size` in `inference_wrapped_model.inference_context` " + f"({original_context.max_batch_size}); setting `max_batch_size` to " + f"{original_context.max_batch_size}", UserWarning, ) - max_batch_size = inference_max_batch_size + max_batch_size = original_context.max_batch_size self.scheduler = Scheduler(max_batch_size=max_batch_size) - # Store original context in case we need to fall back to legacy static engine - original_context = text_generation_controller.inference_wrapped_model.inference_context - - mamba_inference_state_config = get_mamba_inference_state_config_from_model( - text_generation_controller.inference_wrapped_model.model + mamba_inference_state_config = MambaInferenceStateConfig.from_model( + self.inference_wrapped_model.model ) try: if not legacy: - dynamic_context = 
DynamicInferenceContext.from_config( - inference_config=inference_wrapper_config, - model=text_generation_controller.inference_wrapped_model.model, - max_batch_size=max_batch_size, - buffer_size_gb=buffer_size_gb, - num_cuda_graphs=1, - mamba_inference_state_config=mamba_inference_state_config, + dynamic_context = DynamicInferenceContext( + model_config=self.config, + inference_config=InferenceConfig( + max_sequence_length=original_context.max_sequence_length, + buffer_size_gb=buffer_size_gb, + mamba_inference_state_config=mamba_inference_state_config, + max_requests=max_batch_size, + num_cuda_graphs=1, + block_size_tokens=256, + unified_memory_level=0, + ), ) + self.controller.inference_wrapped_model.inference_context = dynamic_context self.controller.inference_wrapped_model.prep_model_for_inference() self.controller._init_dynamic_sampling_tensors() self.dynamic_engine = DynamicInferenceEngine( - controller=self.controller, - random_seed=self.random_seed, - context=dynamic_context, - enable_cuda_graph=True, + controller=self.controller, context=dynamic_context ) except Exception as e: # Get exception details for better debugging @@ -229,13 +230,20 @@ def generate_using_dynamic_engine( if prompts: if add_BOS: sampling_params.add_BOS = True - return self.dynamic_engine.generate(prompts=prompts, sampling_params=sampling_params) + request_records = self.dynamic_engine.generate( + prompts=prompts, sampling_params=sampling_params + ) elif inference_requests: prompts = [request.prompt for request in inference_requests] sampling_params = inference_requests[0].sampling_params if add_BOS: sampling_params.add_BOS = True - return self.dynamic_engine.generate(prompts=prompts, sampling_params=sampling_params) + request_records = self.dynamic_engine.generate( + prompts=prompts, sampling_params=sampling_params + ) + + # Return the underlying `InferenceRequest` objects from the `DynamicInferenceRequestRecord`s. 
+ return [record.merge() for record in request_records] def generate_using_legacy_static_engine( self, diff --git a/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py b/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py index 6a17de685bf..b5b24f1f5fe 100644 --- a/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +++ b/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py @@ -1,8 +1,6 @@ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. import abc -import math -import warnings from typing import Any, Dict, Iterable, Optional, Union import torch @@ -15,103 +13,67 @@ send_to_next_pipeline_rank, ) from megatron.core.inference.contexts import BaseInferenceContext -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) from megatron.core.models.gpt.gpt_model import GPTModel from megatron.core.process_groups_config import ProcessGroupCollection -from megatron.core.utils import get_attr_wrapped_model, get_model_config +from megatron.core.utils import deprecate_args, get_attr_wrapped_model, get_model_config + +DEPRECATED_ARGS = ["inference_wrapper_config", "pg_collection"] -# pylint: disable=line-too-long class AbstractModelInferenceWrapper(abc.ABC): """Abstract inference wrapper Extend this to create a version for your model. - The wrapper prepares the model for inference, provides the required input data and runs the forward pass. + The wrapper prepares the model for inference, provides the required input data and + runs the forward pass. Args: model (Union[GPTModel, LegacyGPTModel]): The actual GPT model (MCore or MLM). - inference_wrapper_config (InferenceWrapperConfig): Has info like - hidden size, vocab size etc. inference_context (BaseInferenceContext): Context for managing KV cache and other inference params. 
- pg_collection (ProcessGroupCollection): Process groups for model communication. """ + @deprecate_args(*DEPRECATED_ARGS) def __init__( self, model: Union['LegacyGPTModel', GPTModel], # type: ignore[name-defined] - inference_wrapper_config: InferenceWrapperConfig, - inference_context: Optional[BaseInferenceContext] = None, - pg_collection: Optional[ProcessGroupCollection] = None, + inference_context: BaseInferenceContext, ): assert not isinstance( model, Iterable ), 'interleaving schedule is not supported for inference' self.model = model - self.inference_wrapper_config = inference_wrapper_config + self.config = get_model_config(self.model) self.pipeline_communication_dtype = ( - torch.float - if self.inference_wrapper_config.fp32_residual_connection - else self.inference_wrapper_config.params_dtype + torch.float if self.config.fp32_residual_connection else self.config.params_dtype ) - model_config = get_model_config(self.model) - self.sequence_parallel = model_config.sequence_parallel - - if inference_context is None: - warnings.warn( - "`inference_context` must be passed in as an argument starting in `megatron-core` 0.13." 
- ) - from megatron.core.inference.contexts import StaticInferenceContext - - inference_context = StaticInferenceContext.from_config(inference_wrapper_config) + self.sequence_parallel = self.config.sequence_parallel self.inference_context = inference_context - if pg_collection is None: + # Get the inference pg_collection from the config if it exists; otherwise the training + # pg_collection might be used during RL + if (pg_collection := self.inference_context.config.pg_collection) is None: pg_collection = ProcessGroupCollection.use_mpu_process_groups() self.tp_group = pg_collection.tp self.pp_group = pg_collection.pp self.tp_size = torch.distributed.get_world_size(self.tp_group) - if self.inference_wrapper_config.fp8 is not None: + if self.config.fp8 is not None: self.model = prepare_model_for_fp8_inference(self.model) - @property - def inference_params(self): - """Getter for deprecated `inference_params`.""" - warnings.warn( - "`inference_params` renamed to `inference_context`, and will be removed in `megatron-core` 0.13." - ) - return self.inference_context + # TODO(ksanthanam): Add support for fp4 - @inference_params.setter - def inference_params(self, value): - """Setter for deprecated `inference_params`.""" - warnings.warn( - "`inference_params` renamed to `inference_context`, and will be removed in `megatron-core` 0.13." - ) - self.inference_context = value - - def prep_model_for_inference(self, prompts_tokens: Optional[torch.Tensor] = None): + def prep_model_for_inference(self): """A utility function for preparing model for inference The function gets called once before the auto regressive inference loop. It puts the model in eval mode. - Args: - prompts_tokens (torch.Tensor, optional): Deprecated, will be removed in `megatron-core` 0.13 """ - if prompts_tokens is not None: - warnings.warn( - "Passing `prompts_tokens` is deprecated and this argument will be ignored." - "This parameter will be removed in `megatron-core` 0.13." 
- ) - self.model.eval() # For TP only model both is_pp_first_stage and _is_pp_last_stage returns True @@ -137,7 +99,9 @@ def prep_inference_input(self, prompt_tokens) -> Dict[str, Any]: def get_batch_for_context_window(self, *args, **kwargs) -> Dict[str, Any]: """Returns the input data for inference - This function gets called iteratively in the inference loop . It can be used to extract relevant input from the prompt tokens, attention mask etc. required for each step in inference. + This function gets called iteratively in the inference loop. + It can be used to extract relevant input from the prompt tokens, attention mask etc. + required for each step in inference. """ raise NotImplementedError() @@ -183,15 +147,16 @@ def _get_batch_size_and_seq_len( self, tokens: torch.Tensor, recv_buffer_seq_len: Optional[int] = None ): """ - Returns the batch size and sequence length based on the tokens tensor and recv_buffer_seq_len. + Returns the batch size and sequence length based on the tokens tensor and + recv_buffer_seq_len. Args: tokens (torch.Tensor): The input tensor of shape (batch_size, seq_len). recv_buffer_seq_len (int, optional): An optional recv buffer sequence length. Returns: - tuple: A tuple (batch_size, seq_len), where batch_size is the first dimension of tokens - and seq_len is either the second dimension or recv_buffer_seq_len. + tuple: A tuple (batch_size, seq_len), where batch_size is the first dimension of + tokens and seq_len is either the second dimension or recv_buffer_seq_len. """ batch_size = tokens.shape[0] seq_len = recv_buffer_seq_len if recv_buffer_seq_len is not None else tokens.shape[1] @@ -204,7 +169,7 @@ def _allocate_recv_buffer(self, batch_size, seq_len): # sequence parallelism. Static batching does not support sequence parallelism # except for the MoE layers which is handled separately. 
seq_len = seq_len // self.tp_size - recv_size = (seq_len, batch_size, self.inference_wrapper_config.hidden_size) + recv_size = (seq_len, batch_size, self.config.hidden_size) return torch.empty( recv_size, dtype=self.pipeline_communication_dtype, device=torch.cuda.current_device() ) @@ -214,10 +179,12 @@ def forward_pass_without_pipeline_parallel( ) -> torch.Tensor: """Utility to carry out simple forward pass for TP or no model parallel models - Runs a very simple forward pass for model. Used in the case of models without any parallelism or only tensor parallelism. + Runs a very simple forward pass for model. Used in the case of models without any + parallelism or only tensor parallelism. Args: - inference_input (Dict[str, Any]): A dict containg the inputs for the gpt model [tokens, position ids, attention mask] + inference_input (Dict[str, Any]): A dict containg the inputs for the gpt model + [tokens, position ids, attention mask] Returns: torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size] @@ -228,16 +195,18 @@ def forward_pass_without_pipeline_parallel( return logits - def forward_pass_with_pipeline_parallel_small_input_batch( + def forward_pass_with_pipeline_parallel( self, inference_input: Dict[str, Any], recv_buffer_seq_len: Optional[int] = None ) -> torch.Tensor: - """Utility to carry out forward pass for PP models with very small inputs + """Utility to carry out forward pass for PP models - If a model is pipeline parallel, yet, the input global batch is very small, we compute a foward pass on the entire global batch, rather than splitting it up into micro batches and doing something more complex as in the forward_pass_with_pipeline_parallel_large_input_batch method + TODO: Add support for asynchronous microbatches Args: - inference_input (Dict[str, Any]): A dict containing the inputs for the gpt model [tokens, position ids, attention mask] - recv_buffer_seq_len (int): An optional sequence length for the pipeline parallel recv 
buffer. + inference_input (Dict[str, Any]): A dict containing the inputs for the gpt model + [tokens, position ids, attention mask] + recv_buffer_seq_len (int): An optional sequence length for the pipeline parallel + recv buffer. Returns: torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size] @@ -268,98 +237,8 @@ def forward_pass_with_pipeline_parallel_small_input_batch( logits = output_tensor # Explicitly cast logits to expected dtype - logits = logits.to(self.inference_wrapper_config.params_dtype) - - return logits - - def forward_pass_with_pipeline_parallel_large_input_batch( - self, inference_input: Dict[str, Any], recv_buffer_seq_len=None - ) -> torch.Tensor: - """Utility to carry out forward pass PP models. - - Runs the forward pass for models which are pipeline parallel. - This is more complex than forward_pass_with_pipeline_parallel_small_input_batch because - this splits the global batch into small micro batches and runs them through the model. - - Args: - inference_input (Dict[str, Any]): A dict containg the inputs for the gpt model [tokens, position ids, attention mask] - recv_buffer_seq_len (int): An optional sequence length for the pipeline parallel recv buffer. - - Returns: - torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size] - """ - tokens = inference_input["tokens"] - position_ids = inference_input["position_ids"] - attention_mask = inference_input["attention_mask"] - materialize_only_last_token_logits = ( - self.inference_context.materialize_only_last_token_logits - ) - - micro_batch_size = max( - 1, - self.inference_wrapper_config.inference_batch_times_seqlen_threshold // tokens.size(1), - ) - batch_size, seq_len = self._get_batch_size_and_seq_len(tokens, recv_buffer_seq_len) - # Round up to account for the last partial micro batch if present - num_micro_batches = math.ceil(batch_size / micro_batch_size) - - logits = None - # Preallocate memory for output logits. 
- if is_pipeline_last_stage(self.pp_group): - logits_seq_len = 1 if materialize_only_last_token_logits else seq_len - logits = torch.empty( - (batch_size, logits_seq_len, self.inference_wrapper_config.padded_vocab_size), - dtype=self.pipeline_communication_dtype, - device=torch.cuda.current_device(), - ) - - recv_buffer = None - if not is_pipeline_first_stage(self.pp_group): - recv_buffer = self._allocate_recv_buffer(micro_batch_size, seq_len) - for micro_batch_index in range(num_micro_batches): - start = micro_batch_index * micro_batch_size - end = min(start + micro_batch_size, batch_size) - tokens2use = tokens[start:end, ...] - position_ids2use = position_ids[start:end, ...] - current_micro_batch_size = end - start - - # Need to change recv buffer shape for the last partial microbatch (if exists) - if current_micro_batch_size != micro_batch_size: - recv_buffer = self._allocate_recv_buffer(current_micro_batch_size, seq_len) - - if not is_pipeline_first_stage(self.pp_group): - recv_from_prev_pipeline_rank_(recv_buffer, self.pp_group) - - self.model.set_input_tensor(recv_buffer) - - output_tensor = self._forward( - { - "tokens": tokens2use, - "position_ids": position_ids2use, - "attention_mask": attention_mask, - "inference_context": self.inference_context, - } - ) - - if not is_pipeline_last_stage(self.pp_group): - send_to_next_pipeline_rank(output_tensor, self.pp_group) - - self.inference_context.batch_size_offset += current_micro_batch_size - - if is_pipeline_last_stage(self.pp_group): - assert logits is not None - logits[start:end, ...] 
= output_tensor - - # Explicitly cast logits to expected dtype - if is_pipeline_last_stage(self.pp_group): - assert logits is not None - logits = logits.to(self.inference_wrapper_config.params_dtype) - - # Once done with all micro batches, we reset batch size offset and seq len offset - self.inference_context.increment_sequence_len_offset(seq_len) - self.inference_context.reset_batch_size_offset() + logits = logits.to(self.config.params_dtype) - # NOTE: Only returns the logits on the last pipeline stage return logits @torch.inference_mode() @@ -368,14 +247,18 @@ def run_one_forward_step( ) -> torch.Tensor: """The forward pass of the model for inference - Appropriate utility is called for the forward pass depending on the type of model parallelism used + Appropriate utility is called for the forward pass depending on the type of model + parallelism used Args: - inference_input (Dict[str, Any]): A dict containing the inputs for the gpt model [tokens, position ids, attention mask] - recv_buffer_seq_len (int): An optional sequence length for the pipeline parallel recv buffer. + inference_input (Dict[str, Any]): A dict containing the inputs for the gpt model + [tokens, position ids, attention mask] + recv_buffer_seq_len (int): An optional sequence length for the pipeline parallel + recv buffer. Returns: - torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size]. The logits are returned only in the last pipeline stage for PP models. + torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size]. + The logits are returned only in the last pipeline stage for PP models. 
""" # Check if we are in a PP model if not (is_pipeline_first_stage(self.pp_group) and is_pipeline_last_stage(self.pp_group)): @@ -383,19 +266,6 @@ def run_one_forward_step( current_batch_size, seq_len = self._get_batch_size_and_seq_len( tokens, recv_buffer_seq_len ) - # If input batch is large, we need to split into micro batches and run the forward pass - if ( - current_batch_size * seq_len - > self.inference_wrapper_config.inference_batch_times_seqlen_threshold - and self.inference_wrapper_config.inference_batch_times_seqlen_threshold != -1 - ): - return self.forward_pass_with_pipeline_parallel_large_input_batch( - inference_input, recv_buffer_seq_len - ) - else: - # If input batch is very small we can do a simple forward pass on the entire global batch - return self.forward_pass_with_pipeline_parallel_small_input_batch( - inference_input, recv_buffer_seq_len - ) + return self.forward_pass_with_pipeline_parallel(inference_input, recv_buffer_seq_len) else: return self.forward_pass_without_pipeline_parallel(inference_input) diff --git a/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py b/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py index ba89fbc2f6c..67cfc1bae48 100644 --- a/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +++ b/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py @@ -7,14 +7,12 @@ from megatron.core.inference.model_inference_wrappers.abstract_model_inference_wrapper import ( AbstractModelInferenceWrapper, ) -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) from megatron.core.inference.utils import get_attention_mask from megatron.core.models.gpt import GPTModel -from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.transformer.enums import AttnBackend -from megatron.core.utils import get_model_config +from megatron.core.utils 
import deprecate_args, get_model_config + +DEPRECATED_ARGS = ["inference_wrapper_config", "pg_collection"] # pylint: disable=line-too-long @@ -25,22 +23,13 @@ class GPTInferenceWrapper(AbstractModelInferenceWrapper): Args: model (GPTModel): The GPT model (MCore or legacy) - inference_wrapper_config (InferenceWrapperConfig): Has info like hidden size, vocab - size, etc. inference_context (BaseInferenceContext): Manages KV cache, and tracks sequence/token/batch offsets. - pg_collection (ProcessGroupCollection): Process groups for model communication. - If not provided, defaults to global parallel state groups. """ - def __init__( - self, - model: GPTModel, - inference_wrapper_config: InferenceWrapperConfig, - inference_context: Optional[BaseInferenceContext] = None, - pg_collection: Optional[ProcessGroupCollection] = None, - ): - super().__init__(model, inference_wrapper_config, inference_context, pg_collection) + @deprecate_args(*DEPRECATED_ARGS) + def __init__(self, model: GPTModel, inference_context: Optional[BaseInferenceContext] = None): + super().__init__(model, inference_context) def prep_inference_input(self, prompts_tokens: torch.Tensor) -> Dict[str, Any]: """Prepares the inference input data. diff --git a/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py b/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py deleted file mode 100644 index 5d89085add2..00000000000 --- a/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
-from dataclasses import dataclass -from typing import Optional - -import torch - - -@dataclass -class InferenceWrapperConfig: - """Config for the model inference wrapper - - NOTE : All the arguments here are obtained from arguments.py file - """ - - hidden_size: int - """Receive happens between the layers during PP with size [seq_len, batch_size, hidden_size]""" - - params_dtype: torch.dtype - """Can be torch.float or torch.half if --fp16 is used, or torch.bfloat16 if --bf16 is used""" - - inference_batch_times_seqlen_threshold: int - """if (batch-size * sequence-length) is smaller than this threshold then we will not pipeline - the batch.""" - - padded_vocab_size: int - """The final padded vocab size (Padded to make it divisible by - --make-vocab-size-divisible-by value)""" - - inference_max_requests: int = 8 - """ Maximum number of requests for inference (prefill & decode). Necessary for CUDA graphs. """ - - inference_max_seq_length: int = 2560 - """ Maximum sequence length for inference (prefill & decode). Necessary for CUDA graphs. """ - - fp32_residual_connection: bool = False - """Move residual connections to fp32. Obtained from arguments.py""" - - nccl_all_reduce_for_prefill: bool = False - """When using symmetric all reduce kernels we keep the default all reduces for nccl. - This can be more effecient for large prefill sizes""" - - fp8: Optional[str] = None - """If set, enables the use of FP8 precision through Transformer Engine. There are 2 predefined - choices (1) 'e4m3' uniformly uses e4m3 for all FP8 tensors, (2) 'hybrid' uses e4m3 for all FP8 - activation and weight tensors and e5m2 for all FP8 output activation gradient tensors.""" - - moe_pad_experts_for_cuda_graph_inference: bool = False - """Some MoE routers have a D2H sync that will break cuda graphs. If this flag is set the router - will switch to dropping and padding during decode time which does not have a D2H sync. 
The - capacity factor is set to the max that an expert could see during inference so no tokens are - actually dropped. """ - - def add_attributes(self, attribute_value_pair: dict): - """Utility to add more attributes to inference params - - Use this method to pass in a custom dictionary to add more configs to the instance created. - Use as follows: - c = InferenceWrapperConfig - c.add_attributes({'precision':'fp32'}) - - Args: - attribute_value_pair (dict): A dictionary containing attributes as the key names and - corresponding values. - """ - for key, value in attribute_value_pair.items(): - setattr(self, key, value) diff --git a/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py b/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py index 2ae1e2ade6f..c773ab507a3 100644 --- a/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +++ b/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py @@ -11,9 +11,6 @@ from megatron.core.inference.model_inference_wrappers.abstract_model_inference_wrapper import ( AbstractModelInferenceWrapper, ) -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) from megatron.core.models.T5 import T5Model from megatron.core.utils import get_attr_wrapped_model @@ -27,7 +24,6 @@ class T5InferenceWrapper(AbstractModelInferenceWrapper): Args: model (T5Model): The T5 model (MCore or legacy) - inference_wrapper_config (InferenceWrapperConfig): The command line arguments that were passed inference_context (BaseInferenceContext): Manages KV cache, and tracks sequence/token/batch offsets. 
use_local (bool): Whether the T5 model's transformer impl @@ -37,11 +33,10 @@ class T5InferenceWrapper(AbstractModelInferenceWrapper): def __init__( self, model: T5Model, - inference_wrapper_config: InferenceWrapperConfig, inference_context: Optional[BaseInferenceContext] = None, use_local: bool = False, ): - super().__init__(model, inference_wrapper_config, inference_context) + super().__init__(model, inference_context) self.use_local = use_local def prep_inference_input( diff --git a/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py b/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py deleted file mode 100644 index 340cadb48a9..00000000000 --- a/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. - -from megatron.core.inference.text_generation_controllers.text_generation_controller import ( # noqa: F401 # pylint: disable=unused-import - TextGenerationController as SimpleTextGenerationController, -) diff --git a/megatron/core/inference/text_generation_controllers/text_generation_controller.py b/megatron/core/inference/text_generation_controllers/text_generation_controller.py index a5233983ed0..617883414d4 100644 --- a/megatron/core/inference/text_generation_controllers/text_generation_controller.py +++ b/megatron/core/inference/text_generation_controllers/text_generation_controller.py @@ -11,21 +11,22 @@ import torch import torch.nn.functional as F from torch import Tensor -from torch.distributed import ProcessGroup +from megatron.core import parallel_state from megatron.core.inference.async_stream import AsyncStream from megatron.core.inference.communication_utils import ( broadcast_from_last_pipeline_stage, - is_pipeline_first_stage, is_pipeline_last_stage, ) from megatron.core.inference.contexts.dynamic_context import MaxSequenceLengthOverflowError +from 
megatron.core.inference.contexts.static_context import StaticInferenceContext from megatron.core.inference.inference_request import InferenceRequest, Status from megatron.core.inference.model_inference_wrappers.abstract_model_inference_wrapper import ( AbstractModelInferenceWrapper, ) from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.utils import get_attention_mask, set_decode_expert_padding +from megatron.core.models.multimodal.llava_model import LLaVAModel from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.moe.moe_layer import BaseMoELayer from megatron.core.transformer.utils import set_model_to_sequence_parallel @@ -52,28 +53,32 @@ class TextGenerationController: inference_wrapped_model (AbstractModelInferenceWrapper): A model that is wrapped using the specs given in the abstract_model_inference_wrapper.py tokenizer (_type_): Tokenizer used for tokenizing and detokenizing the prompts - pp_group (ProcessGroup): Process group for pipeline parallelism """ - def __init__( - self, - inference_wrapped_model: AbstractModelInferenceWrapper, - tokenizer, - pp_group: ProcessGroup = None, - ): + def __init__(self, inference_wrapped_model: AbstractModelInferenceWrapper, tokenizer): self.inference_wrapped_model = inference_wrapped_model + self.model_config = self.inference_wrapped_model.model.config + inference_config = self.inference_wrapped_model.inference_context.config self.tokenizer = tokenizer - self.pp_group = pp_group + pg_collection = inference_config.pg_collection + if pg_collection is not None: + self.pp_group = pg_collection.pp + else: + self.pp_group = parallel_state.get_pipeline_model_parallel_group() + + self.model_is_pipeline_parallel = self.model_config.pipeline_model_parallel_size > 1 - # For models without pipeline parallelism, is_first_stage and is_last_stage returns True - self.model_is_pipeline_parallel = not ( - is_pipeline_first_stage(self.pp_group) and 
is_pipeline_last_stage(self.pp_group) - ) + # Use padded vocab size because tokenizer vocab size might pad to nearest power of 2. + # TODO(ksanthanam): Consider deprecating this check if LLaVAModel is no longer used + unwrapped_model = unwrap_model(self.inference_wrapped_model.model) + if isinstance(unwrapped_model, LLaVAModel): + self.vocab_size = unwrapped_model.language_model.vocab_size + else: + self.vocab_size = unwrapped_model.vocab_size - model_config = get_model_config(self.inference_wrapped_model.model) self.sampling_rng = torch.Generator(device=torch.cuda.current_device()) - self.sampling_rng.manual_seed(model_config.inference_sampling_seed) + self.sampling_rng.manual_seed(self.model_config.inference_sampling_seed) if self.inference_wrapped_model.inference_context.is_dynamic_batching(): self._init_dynamic_sampling_tensors() @@ -98,9 +103,7 @@ def _init_dynamic_sampling_tensors(self): self._get_stop_word_finished_ids_callback = None device = torch.cuda.current_device() - logits_dtype = self.inference_wrapped_model.inference_wrapper_config.params_dtype - # Use padded vocab size because tokenizer vocab size might pad to nearest power of 2. - vocab_size = self.inference_wrapped_model.inference_wrapper_config.padded_vocab_size + logits_dtype = self.inference_wrapped_model.config.params_dtype self._sampling_backend = "torch" self._sampled_tokens_cuda = torch.empty(max_requests, dtype=torch.int64, device=device) @@ -505,7 +508,6 @@ def _dynamic_step_context_init( position_ids (Tensor): The active position IDs. 
""" context = self.inference_wrapped_model.inference_context - inference_wrapper_config = self.inference_wrapped_model.inference_wrapper_config active_request_slice = slice(context.paused_request_count, context.total_request_count) # Remove Float16Module wrapper if it exists @@ -517,11 +519,11 @@ def _dynamic_step_context_init( # If using symmetric kernels and we are using using nccl # for prefill turn off symmetric kernels - symmetric_ar_type = model_config.symmetric_ar_type - nccl_all_reduce_for_prefill = inference_wrapper_config.nccl_all_reduce_for_prefill + symmetric_ar_type = self.model_config.symmetric_ar_type + nccl_all_reduce_for_prefill = self.model_config.nccl_all_reduce_for_prefill # Turning on/off MoE padding for cuda-graphs moe_pad_experts_for_cuda_graph_inference = ( - inference_wrapper_config.moe_pad_experts_for_cuda_graph_inference + self.model_config.moe_pad_experts_for_cuda_graph_inference ) if moe_pad_experts_for_cuda_graph_inference: if context.using_cuda_graph_this_step(): @@ -569,8 +571,6 @@ def _dynamic_step_forward_logits(self, input_ids: Tensor, position_ids: Tensor) input_ids (Tensor): The input token IDs. position_ids (Tensor): The position IDs. 
""" - inference_wrapper_config = self.inference_wrapped_model.inference_wrapper_config - context = self.inference_wrapped_model.inference_context active_request_count = context.total_request_count - context.paused_request_count @@ -582,18 +582,17 @@ def _dynamic_step_forward_logits(self, input_ids: Tensor, position_ids: Tensor) if self.model_is_pipeline_parallel: logits_seq_len = ( active_request_count - if context.materialize_only_last_token_logits + if context.config.materialize_only_last_token_logits else input_ids.shape[1] ) - vocab_size = inference_wrapper_config.padded_vocab_size - logits_shape = [1, logits_seq_len, vocab_size] + logits_shape = [1, logits_seq_len, self.vocab_size] if is_pipeline_last_stage(self.pp_group): assert logits is not None and torch.Size(logits_shape) == logits.shape logits = broadcast_from_last_pipeline_stage( logits_shape, - dtype=inference_wrapper_config.params_dtype, + dtype=self.model_config.params_dtype, tensor=logits, pp_group=self.pp_group, ) @@ -639,7 +638,7 @@ def _dynamic_step_sample_logits(self, logits: Tensor): # Last token logits. context = self.inference_wrapped_model.inference_context - if context.materialize_only_last_token_logits: + if context.config.materialize_only_last_token_logits: # When materialize_only_last_token_logits is true, last_token_logits is # already called in the forward pass of GPT. 
last_token_logits = logits.squeeze(0) @@ -684,7 +683,7 @@ def _dynamic_step_calculate_log_probs(self, logits: Tensor) -> Optional[Tensor]: return context.calculate_log_probs( logits, self._sampled_tokens_cuda[:active_request_count], - only_last_token_logits=context.materialize_only_last_token_logits, + only_last_token_logits=context.config.materialize_only_last_token_logits, ) def _dynamic_step_calculate_top_n_logprobs( @@ -712,7 +711,7 @@ def _dynamic_step_calculate_top_n_logprobs( active_request_slice = slice(context.paused_request_count, context.total_request_count) # Handle decode-only mode (only last token) - if context.materialize_only_last_token_logits or context.is_decode_only(): + if context.config.materialize_only_last_token_logits or context.is_decode_only(): # In decode mode or when only last token logits are materialized, # logits already represent only the last tokens log_probs = log_probs_tensor[:active_request_count] @@ -1024,9 +1023,10 @@ def generate_all_output_tokens_static_batch( # Pad batch tokens if necessary batch_size = len(active_requests) max_sequence_length = max_prompt_length_in_batch + sampling_params.num_tokens_to_generate - inference_wrapper_config = self.inference_wrapped_model.inference_wrapper_config - inference_max_batch_size = inference_wrapper_config.inference_max_requests - inference_max_sequence_length = inference_wrapper_config.inference_max_seq_length + context = self.inference_wrapped_model.inference_context + assert isinstance(context, StaticInferenceContext) + inference_max_batch_size = context.max_batch_size + inference_max_sequence_length = context.max_sequence_length padded_batch_size = inference_max_batch_size if enable_cuda_graph else batch_size if padded_batch_size > inference_max_batch_size: raise ValueError( @@ -1066,10 +1066,6 @@ def generate_all_output_tokens_static_batch( batch_size, device=torch.cuda.current_device() ).cuda() - # Use padded vocab size because tokenizer vocab size might not include padding - # 
to nearest power of 2 - vocab_size = inference_wrapper_config.padded_vocab_size - # Check whether early termination is enabled no_early_termination = getattr(sampling_params, "no_early_termination", False) termination_id = -1 if no_early_termination else self.tokenizer.eod @@ -1130,14 +1126,14 @@ def generate_all_output_tokens_static_batch( # If using symmetric kernels and we are using using nccl # for prefill turn off symmetric kernels - symmetric_ar_type = model_config.symmetric_ar_type - nccl_all_reduce_for_prefill = inference_wrapper_config.nccl_all_reduce_for_prefill + symmetric_ar_type = self.model_config.symmetric_ar_type + nccl_all_reduce_for_prefill = self.model_config.nccl_all_reduce_for_prefill if symmetric_ar_type is not None and nccl_all_reduce_for_prefill: unwrapped_model.set_symmetric_ar(None) # Turning off MoE padding for prefill moe_pad_experts_for_cuda_graph_inference = ( - inference_wrapper_config.moe_pad_experts_for_cuda_graph_inference + self.model_config.moe_pad_experts_for_cuda_graph_inference ) if moe_pad_experts_for_cuda_graph_inference: set_decode_expert_padding(unwrapped_model, False) @@ -1191,7 +1187,7 @@ def generate_all_output_tokens_static_batch( or not (sampling_params.return_log_probs or sampling_params.top_n_logprobs > 0) ) inference_context = self.inference_wrapped_model.inference_context - inference_context.materialize_only_last_token_logits = ( + inference_context.config.materialize_only_last_token_logits = ( materialize_only_last_token_logits ) @@ -1212,14 +1208,14 @@ def generate_all_output_tokens_static_batch( if self.model_is_pipeline_parallel: context_length = context_end_position - context_start_position logits_seq_len = 1 if materialize_only_last_token_logits else context_length - logits_shape = [batch_size, logits_seq_len, vocab_size] + logits_shape = [batch_size, logits_seq_len, self.vocab_size] if is_pipeline_last_stage(self.pp_group): assert logits is not None and torch.Size(logits_shape) == logits.shape # 
TODO(ksanthanam): Evaluate whether it makes more sense to sample on 1 rank # and then broadcast the sampled tokens rather than broadcasting the raw logits. logits = broadcast_from_last_pipeline_stage( - [batch_size, logits_seq_len, vocab_size], - dtype=inference_wrapper_config.params_dtype, + [batch_size, logits_seq_len, self.vocab_size], + dtype=self.model_config.params_dtype, tensor=logits, pp_group=self.pp_group, ) @@ -1248,7 +1244,7 @@ def generate_all_output_tokens_static_batch( sampled_logits = self.sample_from_logits( last_token_logits, sampling_params, - vocab_size, + self.vocab_size, generation_started=generation_started, top_n_logprobs_dict=top_n_logprobs_dict, logits=logits_for_top_n_prompt_logprobs, diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index e287344c13d..f44aed613e7 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -661,7 +661,7 @@ def _postprocess( ) sequence_parallel_override = False - if in_inference_mode and inference_context.materialize_only_last_token_logits: + if in_inference_mode and inference_context.config.materialize_only_last_token_logits: if inference_context.is_static_batching(): hidden_states = hidden_states[-1:, :, :] else: @@ -691,7 +691,7 @@ def _postprocess( assert ( in_inference_mode and inference_context.is_dynamic_batching() - and inference_context.materialize_only_last_token_logits + and inference_context.config.materialize_only_last_token_logits ) self.output_layer.sequence_parallel = True diff --git a/megatron/core/models/mamba/mamba_model.py b/megatron/core/models/mamba/mamba_model.py index 8d45e1d0147..6d43f5583df 100644 --- a/megatron/core/models/mamba/mamba_model.py +++ b/megatron/core/models/mamba/mamba_model.py @@ -267,7 +267,7 @@ def forward( output_weight = self.shared_embedding_or_output_weight() sequence_parallel_override = False - if in_inference_mode and inference_context.materialize_only_last_token_logits: + if 
in_inference_mode and inference_context.config.materialize_only_last_token_logits: if inference_context.is_static_batching(): hidden_states = hidden_states[-1:, :, :] else: @@ -297,7 +297,7 @@ def forward( assert ( in_inference_mode and inference_context.is_dynamic_batching() - and inference_context.materialize_only_last_token_logits + and inference_context.config.materialize_only_last_token_logits ) self.output_layer.sequence_parallel = True diff --git a/megatron/core/ssm/mamba_layer.py b/megatron/core/ssm/mamba_layer.py index 2494126d3ab..bc5ad42d005 100644 --- a/megatron/core/ssm/mamba_layer.py +++ b/megatron/core/ssm/mamba_layer.py @@ -194,6 +194,9 @@ def _should_call_local_cudagraph(self, *args, **kwargs): and kwargs.get('inference_context') is not None and CudaGraphScope.full_iteration not in self.config.cuda_graph_scope ): - using_cuda_graph = kwargs['inference_context'].using_cuda_graph_this_step() + context = kwargs['inference_context'] + using_cuda_graph = (context.is_static_batching() and context.is_decode_only()) or ( + not context.is_static_batching() and context.using_cuda_graph_this_step() + ) return using_cuda_graph return False diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index eaae585905e..48b04c35134 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -696,6 +696,12 @@ class TransformerConfig(ModelParallelConfig): the expert capacity length, effective only after the moe_expert_capacity_factor is set. The default setting is False.""" + moe_pad_experts_for_cuda_graph_inference: bool = False + """moe_pad_experts_for_cuda_graph_inference (bool): If True, the router will switch to dropping + and padding during decode time which does not have a D2H sync. The capacity factor is set to the + max that an expert could see during inference so no tokens are actually dropped. 
The default + setting is False.""" + moe_token_drop_policy: Literal['probs', 'position'] = "probs" """The policy to drop tokens. Can be either "probs" or "position". If "probs", the tokens with the lowest probabilities will be dropped. If "position", tokens at the end of each batch will @@ -830,6 +836,9 @@ class TransformerConfig(ModelParallelConfig): which is no use of symmetric memory. """ + nccl_all_reduce_for_prefill: bool = False + """If True, use NCCL all-reduce kernels when symmetric all-reduce is enabled.""" + use_inference_optimized_layers: bool = False """If True, use inference optimized transformer layers during inference.""" diff --git a/megatron/core/utils.py b/megatron/core/utils.py index 467f4ff3cce..636c76f2a84 100644 --- a/megatron/core/utils.py +++ b/megatron/core/utils.py @@ -497,17 +497,6 @@ def divide(numerator, denominator): return numerator // denominator -def deprecate_inference_params(inference_context, inference_params): - """Print warning for deprecated `inference_params`.""" - if inference_context is None and inference_params is not None: - warnings.warn( - "`inference_params` renamed to `inference_context`, and will be " - "removed in `megatron-core` 0.13." - ) - return inference_params - return inference_context - - def get_tensor_model_parallel_group_if_none(tp_group, is_expert=False, check_initialized=True): """Issue a deprecation warning if tp_group is None and return the default tp group.""" # TODO(zijiey): remove this function later. 
@@ -2406,25 +2395,6 @@ async def wrapper(*args, **kwargs): return _decorate if func is None else _decorate(func) -def get_mamba_inference_state_config_from_model(model) -> Optional["MambaInferenceStateConfig"]: - """Returns Mamba inference state config from the model if it is a hybrid model.""" - from megatron.core.inference.contexts.attention_context.mamba_metadata import ( - MambaInferenceStateConfig, - ) - from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols - - decoder = get_attr_wrapped_model(model, "decoder") - layer_type_list = getattr(decoder, "layer_type_list", None) - if layer_type_list is not None and Symbols.MAMBA in layer_type_list: - (mamba_conv_states_shape, mamba_ssm_states_shape) = decoder.mamba_state_shapes_per_request() - return MambaInferenceStateConfig( - layer_type_list=layer_type_list, - mamba_conv_states_shape=mamba_conv_states_shape, - mamba_ssm_states_shape=mamba_ssm_states_shape, - ) - return None - - # ============================================================================ # Backward Compatibility Decorators # ============================================================================ @@ -2559,3 +2529,43 @@ class ExperimentalModel: """ func._experimental_api = True return func + + +def deprecate_args( + *deprecated_keys, message="Argument '{name}' has been deprecated and should not be used." +): + """ + Intercepts specific keyword arguments to raise a custom TypeError. + + Args: + *deprecated_keys: Strings representing the argument names to block. + message: Custom error message string. Use {name} as a placeholder. 
+ """ + + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + # Check if any deprecated key is present in kwargs + found_deprecated = set(deprecated_keys) & set(kwargs.keys()) + + if found_deprecated: + bad_key = list(found_deprecated)[0] + raise TypeError(message.format(name=bad_key)) + + # Send args to the real function + return func(*args, **kwargs) + + return wrapper + + return decorator + + +def deprecate_inference_params(inference_context, inference_params): + """Print warning for deprecated `inference_params`.""" + if inference_context is None and inference_params is not None: + warnings.warn( + "`inference_params` renamed to `inference_context`, and will be " + "removed in `megatron-core` 0.13." + ) + return inference_params + return inference_context diff --git a/megatron/inference/__init__.py b/megatron/inference/__init__.py new file mode 100644 index 00000000000..26496bfed70 --- /dev/null +++ b/megatron/inference/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/inference/utils.py b/megatron/inference/utils.py new file mode 100644 index 00000000000..145af726c4f --- /dev/null +++ b/megatron/inference/utils.py @@ -0,0 +1,320 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ +import logging +from argparse import ArgumentParser +from functools import partial +from typing import Optional + +from gpt_builders import gpt_builder +from mamba_builders import mamba_builder +from megatron.core.inference.config import InferenceConfig, MambaInferenceStateConfig +from megatron.core.inference.contexts import DynamicInferenceContext +from megatron.core.inference.engines import DynamicInferenceEngine +from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( + GPTInferenceWrapper, +) +from megatron.core.inference.text_generation_controllers.text_generation_controller import ( + TextGenerationController, +) +from megatron.core.tokenizers.text.utils.build_tokenizer import build_tokenizer +from megatron.core.transformer.module import MegatronModule +from megatron.core.utils import get_attr_wrapped_model, log_single_rank +from megatron.training import get_args +from megatron.training import get_model as _get_model +from megatron.training import get_tokenizer, get_wandb_writer +from megatron.training.checkpointing import load_checkpoint +from model_provider import model_provider + +logger = logging.getLogger(__name__) + + +def get_model_for_inference() -> MegatronModule: + """Initialize model and load checkpoint for inference.""" + + args = get_args() + + if args.model_provider == "gpt": + model_builder = gpt_builder + elif args.model_provider == "mamba": + model_builder = mamba_builder + else: + raise ValueError(f"Invalid model provider {args.model_provider}") + + # Build model. + model = _get_model(partial(model_provider, model_builder), wrap_with_ddp=False) + + # Load checkpoint. + assert args.load is not None + args.exit_on_missing_checkpoint = True + load_checkpoint( + ddp_model=model, + optimizer=None, + opt_param_scheduler=None, + strict=not args.inference_ckpt_non_strict, + ) + + # No virtual PP. + assert len(model) == 1, "Above condition should have caught this" + model = model[0] + + # Eval mode. 
+ model.eval() + + return model + + +def add_inference_args(parser: ArgumentParser) -> ArgumentParser: + """Add inference command line arguments to the parser.""" + + group = parser.add_argument_group(title='Inference') + + group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.') + group.add_argument("--top_k", type=int, default=1, help='Top k sampling.') + group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.') + group.add_argument( + "--return-log-probs", + action='store_true', + default=False, + help='Return the log probabilities of the final output tokens', + ) + group.add_argument( + "--prompts", + metavar='N', + type=str, + nargs='+', + help='Input prompts with each prompt within quotes and seperated by space', + ) + group.add_argument( + "--num-tokens-to-prompt", + type=int, + nargs="+", + default=[64, 1024], + help='Number of tokens to use for simulated prompts. This should be a ' + 'space-separated pair of integers, and the generated prompt lengths will ' + 'be uniformly sampled within this range.', + ) + group.add_argument( + "--num-tokens-to-generate", + type=int, + default=30, + help='Number of tokens to generate for each prompt', + ) + group.add_argument( + "--num-tokens-from-file", + action='store_true', + default=False, + help='Use per-prompt num_tokens_to_generate from prompt file', + ) + group.add_argument( + "--top-n-logprobs", + type=int, + default=0, + help=( + "Return the top n logprobs for the generated tokens and their " + "corresponding token as a dictionary" + ), + ) + group.add_argument( + "--incoming-requests-per-step", + type=int, + default=None, + help="Add a deterministic number of requests per step. This arg is " + "prioritized over `--incoming-requests-per-sec` below (which is non-" + "deterministic). 
Note that the number of requests added per step is " + "additionally limited by the inference context's `max_requests`, " + "`max_tokens`, and KV buffer size.", + ) + group.add_argument( + "--incoming-requests-per-sec", + type=float, + default=100.0, + help="Simulated number of requests per second. Set to -1 to add all requests together.", + ) + group.add_argument( + "--incoming-requests-duration", + type=float, + default=10.0, + help="Total amount of time to simulate that requests are " + "arriving. Multiply this value with " + "`--incoming-requests-per-sec` to get the approximate " + "total number of requests. Set to -1 to add all requests together.", + ) + group.add_argument( + "--model-provider", choices=["mamba", "gpt"], default="gpt", help="Model provider" + ) + group.add_argument( + "--skip-prompt-log-probs", action='store_true', default=False, help='Skip prompt log probs.' + ) + group.add_argument( + "--stop-words", + metavar='WORD', + type=str, + nargs='+', + default=None, + help='Stop words to terminate generation. Each word should be quoted and ' + 'separated by space. Example: --stop-words "\\n\\n" "END" "###"', + ) + group.add_argument( + "--output-path", type=str, default=None, help="Path to save generations as JSON" + ) + group.add_argument( + "--output-every-n-results", + type=int, + default=1, + help="To minimize the output file size of larger runs, only write the " + "results of every `n` requests.", + ) + group.add_argument( + "--prompt-file", + help='Jsonl file containing input prompts, where each item (i.e., line) ' + 'contains the field \'text\' where the value is the prompt. All other ' + 'fields within each item are ignored, and may be customized for each ' + 'application.', + ) + group.add_argument( + "--prompt-file-num-truncate", + type=int, + help='Number of samples to use from the loaded prompt file (see ' + '`--prompt-file` above). 
The first `--prompt-file-num-truncate` samples ' + 'will be used, in order.', + ) + group.add_argument( + "--use-flashinfer-fused-rope", + action='store_true', + default=False, + help='Use flashinfer fused rope implementation.', + ) + group.add_argument( + "--no-record-throughput", + action='store_false', + dest="record_throughput", + help="Disable throughput recording in --output-file", + ) + group.add_argument( + "--inference-ckpt-non-strict", + action="store_true", + help="Load checkpoint with `strict=False`.", + ) + group.add_argument( + "--termination-id", + type=int, + default=None, + help="Termination ID that overrides `tokenizer.eod`.", + ) + group.add_argument( + "--suspend-resume-interval", + type=int, + default=None, + help="Suspend and resume the dynamic engine every " + "`suspend_resume_interval` steps. This is used to tet the suspend/resume " + "system.", + ) + group.add_argument( + "--inference-repeat-n", + type=int, + default=1, + help="Repeat inference iterations N times for benchmarking.", + ) + group.add_argument( + "--throughput-check-only", + action='store_true', + default=False, + help="If true, only run throughput check without verifying outputs.", + ) + + return parser + + +def get_inference_config_from_model_and_args(model: MegatronModule, args): + """Returns a `InferenceConfig` constructed from the model and command line arguments.""" + + # Max sequence length. + position_embedding_type = get_attr_wrapped_model(model, "position_embedding_type") + model_max_seq_len = get_attr_wrapped_model(model, "max_sequence_length") + inf_max_seq_len = args.inference_max_seq_length + max_batch_size = args.inference_dynamic_batching_max_requests + + if position_embedding_type == "learned_absolute": + # When using absolute position embeddings, it is critical that the + # context's `max_sequence_length` is less than or equal to the model's + # `max_sequence_length`. 
Otherwise, the context's `position_ids` will + # contain ids greater than the dimension of the position embedding + # tensor, which will result in an index error. + if inf_max_seq_len: + max_sequence_length = min(model_max_seq_len, inf_max_seq_len) + else: + max_sequence_length = model_max_seq_len + assert max_batch_size is None or max_batch_size <= model_max_seq_len + else: + max_sequence_length = inf_max_seq_len + if args.inference_dynamic_batching_max_requests is not None: + max_sequence_length = max(max_sequence_length, max_batch_size) + + mamba_inference_state_config = MambaInferenceStateConfig.from_model(model) + pg_collection = get_attr_wrapped_model(model, "pg_collection") + + # Get inference logging configuration from args + log_inference_wandb = args.inference_wandb_logging + inference_logging_step_interval = args.inference_logging_step_interval + + # Get metrics writer if logging is enabled and on the logging rank + # Use the same rank convention as training (last rank logs) + metrics_writer = None + if ( + inference_logging_step_interval > 0 + and log_inference_wandb + and args.rank == (args.world_size - 1) + ): + metrics_writer = get_wandb_writer() + if metrics_writer is None: + log_single_rank( + logger, + logging.WARNING, + "WARNING: --rl-inference-logging-step-interval is set but no metrics writer " + "wandb module is available. 
Inference logging will be disabled.", + ) + + return InferenceConfig( + block_size_tokens=args.inference_dynamic_batching_block_size, + buffer_size_gb=args.inference_dynamic_batching_buffer_size_gb, + paused_buffer_size_gb=args.inference_dynamic_batching_paused_buffer_size_gb, + num_cuda_graphs=( + args.inference_dynamic_batching_num_cuda_graphs + if args.cuda_graph_impl == "local" + else None + ), + max_requests=args.inference_dynamic_batching_max_requests, + max_tokens=args.inference_dynamic_batching_max_tokens, + unified_memory_level=args.inference_dynamic_batching_unified_memory_level, + offload_kv_cache=args.rl_offload_kv_cache_during_training, + cuda_graph_mixed_prefill_count=args.inference_dynamic_batching_cuda_graph_mixed_prefill_count, # pylint: disable=line-too-long + use_cuda_graphs_for_non_decode_steps=not args.decode_only_cuda_graphs, + persist_cuda_graphs=args.rl_training_cuda_graphs, + max_sequence_length=max_sequence_length, + mamba_inference_state_config=mamba_inference_state_config, + pg_collection=pg_collection, + use_flashinfer_fused_rope=args.use_flashinfer_fused_rope, + materialize_only_last_token_logits=not args.return_log_probs, + track_paused_request_events=args.inference_dynamic_batching_track_paused_request_events, + enable_chunked_prefill=args.enable_chunked_prefill, + metrics_writer=metrics_writer, + logging_step_interval=args.inference_logging_step_interval, + ) + + +def get_dynamic_inference_engine(model: Optional[MegatronModule] = None) -> DynamicInferenceEngine: + """Builds a `DynamicInferenceEngine`.""" + args = get_args() + if model is None: + model = get_model_for_inference() + if args.legacy_tokenizer: + tokenizer = get_tokenizer() + else: + tokenizer = build_tokenizer(args) + + inference_config = get_inference_config_from_model_and_args(model, args) + context = DynamicInferenceContext(model.config, inference_config) + inference_wrapped_model = GPTInferenceWrapper(model, context) + controller = 
TextGenerationController(inference_wrapped_model, tokenizer) + engine = DynamicInferenceEngine(controller, context) + return engine diff --git a/megatron/rl/inference/megatron.py b/megatron/rl/inference/megatron.py index 4e9364b3ae9..602ff4f7450 100644 --- a/megatron/rl/inference/megatron.py +++ b/megatron/rl/inference/megatron.py @@ -7,7 +7,6 @@ import torch.distributed as dist from pydantic import PrivateAttr -from megatron.core import parallel_state from megatron.core.inference.contexts.dynamic_context import DynamicInferenceContext from megatron.core.inference.engines.abstract_engine import AbstractEngine from megatron.core.inference.engines.dynamic_engine import DynamicInferenceEngine @@ -16,23 +15,13 @@ from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, ) -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) from megatron.core.inference.sampling_params import SamplingParams -from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import ( - SimpleTextGenerationController, +from megatron.core.inference.text_generation_controllers.text_generation_controller import ( + TextGenerationController, ) from megatron.core.models.gpt.gpt_model import GPTModel -from megatron.core.pipeline_parallel.utils import is_pp_first_stage, is_pp_last_stage -from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols from megatron.core.transformer.module import MegatronModule -from megatron.core.utils import ( - get_attr_wrapped_model, - get_mamba_inference_state_config_from_model, - get_pg_size, - log_single_rank, -) +from megatron.core.utils import get_attr_wrapped_model, log_single_rank from megatron.training import get_wandb_writer from megatron.training.global_vars import get_args, get_tokenizer @@ -66,134 +55,20 @@ def get_static_inference_engine(args: Namespace, model: MegatronModule) -> Abstr """ tokenizer = 
get_tokenizer() - inference_wrapper_config = InferenceWrapperConfig( - hidden_size=args.hidden_size, - inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold, - fp32_residual_connection=args.fp32_residual_connection, - params_dtype=args.params_dtype, - padded_vocab_size=args.padded_vocab_size, - inference_max_seq_length=args.inference_max_seq_length, - inference_max_requests=( - args.inference_max_batch_size if args.inference_max_batch_size is not None else 1 - ), - nccl_all_reduce_for_prefill=args.nccl_all_reduce_for_prefill, - ) - - inference_wrapped_model = GPTInferenceWrapper(model, inference_wrapper_config) + inference_wrapped_model = GPTInferenceWrapper(model) pg_collection = get_attr_wrapped_model(model, "pg_collection") pp_group = pg_collection.pp - text_generation_controller = SimpleTextGenerationController( - inference_wrapped_model=inference_wrapped_model, - tokenizer=tokenizer, - pp_group=pp_group, + text_generation_controller = TextGenerationController( + inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer, pp_group=pp_group ) return MCoreEngine( text_generation_controller=text_generation_controller, max_batch_size=( - args.inference_max_batch_size if args.inference_max_batch_size is not None else 1 + args.inference_max_requests if args.inference_max_requests is not None else 1 ), ) -## This code is copied from tools/run_text_generation_server.py -def get_dynamic_inference_engine( - args: Namespace, - model: MegatronModule, - inference_logging_step_interval: int = 0, - metrics_writer = None -) -> AbstractEngine: - """Get the relevant backend for running inference. - - This function will automatically choose the TRTLLMBackend when possible, - and default to Mcore backend if the user does not specify any backends. - TRTLLMBackend is not implmented yet. - - Args: - args (Namespace): The user arguments parsed from command line - model (MegatronModule): The megatron model. 
- inference_logging_step_interval (int): Step interval for logging inference metrics. - metrics_writer: Metrics writer (wandb module) for logging. - - Returns: - AbstractBackend: The chosen backend - """ - tokenizer = get_tokenizer() - - enable_cuda_graph = args.cuda_graph_impl == "local" - - mamba_inference_state_config = get_mamba_inference_state_config_from_model(model) - - # DynamicInferenceContext must use the inference model's TP / PP size, not the - # training TP / PP size from global args. The inference model may have a custom - # ProcessGroupCollection with a different TP / PP size. - pg_collection = get_attr_wrapped_model(model, "pg_collection") - tp_group = getattr(pg_collection, 'tp', None) if pg_collection is not None else None - if tp_group is not None: - inference_tp_size = get_pg_size(tp_group) - else: - inference_tp_size = args.tensor_model_parallel_size - pp_group = getattr(pg_collection, 'pp', None) if pg_collection is not None else None - if pp_group is not None: - inference_pp_size = get_pg_size(pp_group) - else: - inference_pp_size = args.pipeline_model_parallel_size - - # Inference context. - inference_context = DynamicInferenceContext( - params_dtype=args.params_dtype, - num_layers=args.num_layers // inference_pp_size, - kv_channels=args.kv_channels, - num_attention_heads=( - args.num_query_groups if args.group_query_attention else args.num_attention_heads - ), - max_sequence_length=args.inference_max_seq_length, - num_cuda_graphs=( - args.inference_dynamic_batching_num_cuda_graphs if enable_cuda_graph else None - ), - block_size_tokens=args.inference_dynamic_batching_block_size, - buffer_size_gb=args.inference_dynamic_batching_buffer_size_gb, - max_requests=args.inference_dynamic_batching_max_requests, - max_tokens=args.inference_dynamic_batching_max_tokens, - pg_collection=pg_collection, # TP/PP sizes are derived from the model's pg_collection. 
- materialize_only_last_token_logits=True, - mamba_inference_state_config=mamba_inference_state_config, - cache_mla_latent=args.multi_latent_attention and args.cache_mla_latents, - kv_lora_rank=args.kv_lora_rank if args.multi_latent_attention else None, - qk_pos_emb_head_dim=args.qk_pos_emb_head_dim, - use_cuda_graphs_for_non_decode_steps=not args.decode_only_cuda_graphs, - use_flashinfer_fused_rope=None, - unified_memory_level=args.inference_dynamic_batching_unified_memory_level, - cuda_graph_max_tokens=args.inference_dynamic_batching_cuda_graph_max_tokens, - cuda_graph_mixed_prefill_count=args.inference_dynamic_batching_cuda_graph_mixed_prefill_count, - metrics_writer=metrics_writer, - persist_cuda_graphs=args.rl_training_cuda_graphs, - offload_kv_cache=args.rl_offload_kv_cache_during_training - ) - - inference_wrapped_model = GPTInferenceWrapper(model, args, inference_context, pg_collection=pg_collection) - - inference_wrapped_model.model_is_pipeline_parallel = not ( - is_pp_first_stage(pg_collection.pp) and is_pp_last_stage(pg_collection.pp) - ) - - pp_group = getattr(pg_collection, "pp", None) - text_generation_controller = SimpleTextGenerationController( - inference_wrapped_model=inference_wrapped_model, - tokenizer=tokenizer, - pp_group=pp_group, - ) - - return DynamicInferenceEngine( - controller=text_generation_controller, - context=inference_context, - random_seed=args.seed, - track_paused_request_events=args.inference_dynamic_batching_track_paused_request_events, - enable_chunked_prefill=not args.disable_chunked_prefill, - inference_logging_step_interval=inference_logging_step_interval, - pg_collection=pg_collection, - ) - - class MegatronLocal(InferenceServer, ReturnsTokens, ReturnsRaw): """Interface to use MCoreEngine directly as an inference engine.""" @@ -246,6 +121,9 @@ async def base_generate(self, request: InferenceRequest): @classmethod async def launch(cls, model: GPTModel, **kwargs): + # Import here to avoid circular imports + from 
megatron.inference.utils import get_dynamic_inference_engine + args = get_args() tokenizer = get_tokenizer() @@ -256,30 +134,7 @@ async def launch(cls, model: GPTModel, **kwargs): "WARNING: Tokenizer has no BOS token so prompt will not have BOS token", ) - # Get inference logging configuration from args - log_inference_wandb = args.inference_wandb_logging - inference_logging_step_interval = args.inference_logging_step_interval - - # Get metrics writer if logging is enabled and on the logging rank - # Use the same rank convention as training (last rank logs) - metrics_writer = None - if ( - inference_logging_step_interval > 0 - and log_inference_wandb - and args.rank == (args.world_size - 1) - ): - metrics_writer = get_wandb_writer() - if metrics_writer is None: - log_single_rank( - logger, - logging.WARNING, - "WARNING: --rl-inference-logging-step-interval is set but no metrics writer " - "wandb module is available. Inference logging will be disabled.", - ) - - inference_engine: DynamicInferenceEngine = get_dynamic_inference_engine( - args, model, inference_logging_step_interval, metrics_writer - ) + inference_engine: DynamicInferenceEngine = get_dynamic_inference_engine(model=model) dp_addr = await inference_engine.start_listening_to_data_parallel_coordinator( inference_coordinator_port=41521, launch_inference_coordinator=True, ) diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 28603b02ed5..131392d2c3d 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -1453,13 +1453,10 @@ def _add_inference_args(parser): dest='use_legacy_static_engine') group.add_argument('--inference-max-requests', type=int, default=8, help='Maximum number of requests for inference.', - dest='inference_max_batch_size') + dest='inference_max_requests') group.add_argument('--inference-max-seq-length', type=int, default=2560, help='Maximum sequence length expected for inference (prefill + decode).', dest='inference_max_seq_length') 
- group.add_argument('--inference-max-batch-size', type=int, default=None, - help='Maximum batch size for inference.', - dest='inference_max_batch_size') group.add_argument('--inference-dynamic-batching', action='store_true', default=False, help='Enable dynamic batching mode.') @@ -1515,15 +1512,10 @@ def _add_inference_args(parser): '1) allocate `memory_buffer` in unified memory. ' 'Eventually, additional levels will be included to ' 'control other tensors within the context.') - group.add_argument('--nccl-all-reduce-for-prefill', - action='store_true', default=False, - help='When using symmeric all reduce kernels this will use regular nccl kernels for prefill. This can be more effecient when prefill is large as the nccl kernels can be more bandwith optimized') # TODO(ksanthanam): Clean this up in future PR - group.add_argument('--enable-chunked-prefill', dest='disable_chunked_prefill', - action='store_false', default=True, + group.add_argument('--enable-chunked-prefill', dest='enable_chunked_prefill', + action='store_true', default=False, help="Enable chunked prefill (disabled by default)") - group.add_argument('--disable-chunked-prefill', dest='disable_chunked_prefill', - action='store_true', help=argparse.SUPPRESS) group.add_argument('--inference-dynamic-batching-cuda-graph-max-tokens', type=int, default=16384, help='Maximum number of tokens to capture in a cuda graph.') @@ -2722,10 +2714,6 @@ def _add_moe_args(parser): group.add_argument('--moe-upcycling-granularity', type=int, default=1, help='This param sepecifics how many times smaller is the expert hidden size compared with the original dense FFN hidden size. ' 'For using granular upcycling strategy, please set this param as a positive integer. If this param is set to 1, it means using the default upcycling strategy.') - group.add_argument('--moe-pad-experts-for-cuda-graph-inference', action='store_true', - help="some MoE routers have a D2H sync that will break cuda graphs. 
If this flag is set the router will switch" \ - " to dropping and padding during decode time which does not have a D2H sync. The capacity factor is set to the" \ - " max that an expert could see during inference so no tokens are actually dropped.") return parser def _add_mla_args(parser): diff --git a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_16b_multiprompt_tokensmatch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_16b_multiprompt_tokensmatch/model_config.yaml index 6d63b0e4228..40b45024cb1 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_16b_multiprompt_tokensmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_16b_multiprompt_tokensmatch/model_config.yaml @@ -76,6 +76,7 @@ MODEL_ARGS: --max-tokens-to-oom: 3600000 --inference-max-seq-length: 4096 --output-path: ${INFERENCE_OUTPUT_PATH} + --use-legacy-static-engine: true --prompt-file: ./tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_16b_multiprompt_tokensmatch/test_prompts.jsonl --incoming-requests-per-sec: -1 # all requests arrive up front. 
--inference-logging-step-interval: 1 diff --git a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_cudagraphs/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_cudagraphs/model_config.yaml index 8f54fff0a2f..9a47281703a 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_cudagraphs/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_cudagraphs/model_config.yaml @@ -48,6 +48,7 @@ MODEL_ARGS: --inference-max-requests: 1 --dist-ckpt-strictness: log_unexpected --output-path: ${INFERENCE_OUTPUT_PATH} + --use-legacy-static-engine: true --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies." --incoming-requests-per-sec: -1 # all requests arrive up front. --inference-logging-step-interval: 1 diff --git a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs/model_config.yaml index 1a1195baa2b..99bcc433ad1 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs/model_config.yaml @@ -53,6 +53,7 @@ MODEL_ARGS: --inference-max-requests: 1 --dist-ckpt-strictness: log_unexpected --output-path: ${INFERENCE_OUTPUT_PATH} + --use-legacy-static-engine: true --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. 
Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies." --incoming-requests-per-sec: -1 # all requests arrive up front. --inference-logging-step-interval: 1 diff --git a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_logitsmatch/model_config.yaml index be00e4b3ce7..1c78b466b1e 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_logitsmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_logitsmatch/model_config.yaml @@ -44,6 +44,7 @@ MODEL_ARGS: --flash-decode: true --dist-ckpt-strictness: log_unexpected --output-path: ${INFERENCE_OUTPUT_PATH} + --use-legacy-static-engine: true --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies." --incoming-requests-per-sec: -1 # all requests arrive up front. 
METRICS: diff --git a/tests/functional_tests/test_cases/moe/gpt_static_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt_static_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml index 549821afc8b..1c22a729f6e 100644 --- a/tests/functional_tests/test_cases/moe/gpt_static_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt_static_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml @@ -79,6 +79,7 @@ MODEL_ARGS: --inference-max-seq-length: 4096 --inference-max-requests: 1 --output-path: ${INFERENCE_OUTPUT_PATH} + --use-legacy-static-engine: true --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies." --incoming-requests-per-sec: -1 METRICS: diff --git a/tests/functional_tests/test_cases/moe/gpt_static_inference_tp1_pp1_ep1_16B_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt_static_inference_tp1_pp1_ep1_16B_logitsmatch/model_config.yaml index 4934fe6c913..03895d97ee9 100644 --- a/tests/functional_tests/test_cases/moe/gpt_static_inference_tp1_pp1_ep1_16B_logitsmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt_static_inference_tp1_pp1_ep1_16B_logitsmatch/model_config.yaml @@ -72,6 +72,7 @@ MODEL_ARGS: --max-tokens-to-oom: 3600000 --inference-max-seq-length: 4096 --output-path: ${INFERENCE_OUTPUT_PATH} + --use-legacy-static-engine: true --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. 
Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies." --incoming-requests-per-sec: -1 # all requests arrive up front. --inference-dynamic-batching-buffer-size-gb: 20 diff --git a/tests/functional_tests/test_cases/moe/gpt_static_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt_static_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml index 69c0db980b0..9259d63c9d1 100644 --- a/tests/functional_tests/test_cases/moe/gpt_static_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt_static_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml @@ -74,6 +74,7 @@ MODEL_ARGS: --max-tokens-to-oom: 3600000 --inference-max-seq-length: 4096 --output-path: ${INFERENCE_OUTPUT_PATH} + --use-legacy-static-engine: true --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies." --incoming-requests-per-sec: -1 # all requests arrive up front. 
METRICS: diff --git a/tests/test_utils/recipes/h100/gpt-static-inference.yaml b/tests/test_utils/recipes/h100/gpt-static-inference.yaml index 806762531fd..87046588b2b 100644 --- a/tests/test_utils/recipes/h100/gpt-static-inference.yaml +++ b/tests/test_utils/recipes/h100/gpt-static-inference.yaml @@ -63,15 +63,15 @@ products: - test_case: [gpt_static_inference_tp1_pp1_583m_cudagraphs] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - test_case: [gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - test_case: [gpt_static_inference_tp1_pp1_16b_multiprompt_tokensmatch] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] diff --git a/tests/test_utils/recipes/h100/moe-static-inference.yaml b/tests/test_utils/recipes/h100/moe-static-inference.yaml index f10d293e953..fdab3ff430b 100644 --- a/tests/test_utils/recipes/h100/moe-static-inference.yaml +++ b/tests/test_utils/recipes/h100/moe-static-inference.yaml @@ -58,15 +58,15 @@ products: - test_case: [gpt_static_inference_tp1_pp1_ep1_16B_logitsmatch] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github-broken] platforms: [dgx_h100] - test_case: [gpt_static_inference_tp4_pp1_ep4_16B_logitsmatch] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github-broken] platforms: [dgx_h100] - test_case: [gpt_static_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch] products: - environment: [dev] - scope: [mr-broken] + scope: [mr, mr-github-broken] platforms: [dgx_h100] diff --git a/tests/unit_tests/inference/contexts/test_dynamic_context.py b/tests/unit_tests/inference/contexts/test_dynamic_context.py index 05e0306bfd8..f3ef0910f58 100644 --- a/tests/unit_tests/inference/contexts/test_dynamic_context.py +++ b/tests/unit_tests/inference/contexts/test_dynamic_context.py @@ -1,14 +1,13 @@ # Copyright (c) 2025, NVIDIA CORPORATION. 
All rights reserved. +import contextlib import math import pytest import torch from megatron.core import parallel_state -from megatron.core.inference.contexts.attention_context.mamba_metadata import ( - MambaInferenceStateConfig, -) +from megatron.core.inference.config import InferenceConfig, MambaInferenceStateConfig from megatron.core.inference.contexts.dynamic_context import ( DynamicInferenceContext, RequestOverflowError, @@ -18,14 +17,21 @@ from megatron.core.inference.sampling_params import SamplingParams from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig from tests.unit_tests.test_utilities import Utils -def set_rounder(value): - """Utility function to set the DynamicInferenceContext rounder.""" - DynamicInferenceContext.ROUNDER = value # For backwards compatibility - DynamicInferenceContext.TOKEN_ROUNDER = value - DynamicInferenceContext.REQUEST_ROUNDER = value +@contextlib.contextmanager +def rounder_override(n): + original_token_rounder = DynamicInferenceContext.TOKEN_ROUNDER + original_request_rounder = DynamicInferenceContext.REQUEST_ROUNDER + try: + DynamicInferenceContext.TOKEN_ROUNDER = n + DynamicInferenceContext.REQUEST_ROUNDER = n + yield + finally: + DynamicInferenceContext.TOKEN_ROUNDER = original_token_rounder + DynamicInferenceContext.REQUEST_ROUNDER = original_request_rounder class TestDynamicContext: @@ -52,11 +58,8 @@ def _get_dynamic_context( max_tokens, is_hybrid_model=False, layer_type_list=None, - rounder=64, paused_buffer_size_gb=None, ): - set_rounder(rounder) - if is_hybrid_model: if layer_type_list is None: layer_type_list = [Symbols.MAMBA, Symbols.MLP, Symbols.ATTENTION, Symbols.MLP] @@ -69,23 +72,27 @@ def _get_dynamic_context( mamba_inference_state_config = None dynamic_context = DynamicInferenceContext( - params_dtype=params_dtype, - num_layers=num_layers // 
self.pp_size, - kv_channels=kv_channels, - num_attention_heads=num_attention_heads, - max_sequence_length=max_sequence_length, - num_cuda_graphs=None, - use_cuda_graphs_for_non_decode_steps=True, - buffer_size_gb=buffer_size_gb, - paused_buffer_size_gb=( - 0.2 * buffer_size_gb if paused_buffer_size_gb is None else paused_buffer_size_gb + model_config=TransformerConfig( + params_dtype=params_dtype, + num_layers=num_layers, + kv_channels=kv_channels, + num_attention_heads=num_attention_heads, + ), + inference_config=InferenceConfig( + max_sequence_length=max_sequence_length, + num_cuda_graphs=None, + use_cuda_graphs_for_non_decode_steps=True, + buffer_size_gb=buffer_size_gb, + paused_buffer_size_gb=( + 0.2 * buffer_size_gb if paused_buffer_size_gb is None else paused_buffer_size_gb + ), + block_size_tokens=block_size_tokens, + max_tokens=max_tokens, + mamba_inference_state_config=mamba_inference_state_config, + use_flashinfer_fused_rope=None, # default to using flash-infer if available + # this is for compatibility with the LTS environment + unified_memory_level=0, # unit tests currently broken with UVM ), - block_size_tokens=block_size_tokens, - max_tokens=max_tokens, - mamba_inference_state_config=mamba_inference_state_config, - use_flashinfer_fused_rope=None, # default to using flash-infer if available - # this is for compatibility with the LTS environment - unified_memory_level=0, # unit tests currently broken with UVM ) return dynamic_context @@ -93,6 +100,7 @@ def teardown_method(self, method): Utils.destroy_model_parallel() @pytest.mark.internal + @rounder_override(64) @pytest.mark.parametrize("is_hybrid_model", [False, True]) def test_initialize_dynamic_context(self, is_hybrid_model: bool): self._setup_model_parallel_group(1, 1) @@ -107,7 +115,6 @@ def test_initialize_dynamic_context(self, is_hybrid_model: bool): block_size_tokens=128, max_tokens=None, is_hybrid_model=is_hybrid_model, - rounder=64, ) if not is_hybrid_model: @@ -145,6 +152,7 @@ def 
test_is_static_batching(self): assert not dynamic_context.is_static_batching() @pytest.mark.internal + @rounder_override(64) @pytest.mark.parametrize("is_hybrid_model", [False, True]) def test_is_memory_available(self, is_hybrid_model): self._setup_model_parallel_group(1, 1) @@ -168,6 +176,7 @@ def test_is_memory_available(self, is_hybrid_model): assert not dynamic_context.block_allocator.is_memory_available(1) @pytest.mark.internal + @rounder_override(1) @pytest.mark.parametrize("is_hybrid_model", [False, True]) def test_request_overflow(self, is_hybrid_model: bool): self._setup_model_parallel_group(1, 1) @@ -181,7 +190,6 @@ def test_request_overflow(self, is_hybrid_model: bool): buffer_size_gb=0.01, block_size_tokens=32, max_tokens=None, - rounder=1, is_hybrid_model=is_hybrid_model, ) dynamic_context.max_requests //= 2 @@ -198,6 +206,7 @@ def test_request_overflow(self, is_hybrid_model: bool): ) # Adding more than allowed requests @pytest.mark.internal + @rounder_override(1) @pytest.mark.parametrize("is_hybrid_model", [False, True]) def test_token_overflow_error(self, is_hybrid_model: bool): self._setup_model_parallel_group(1, 1) @@ -211,7 +220,6 @@ def test_token_overflow_error(self, is_hybrid_model: bool): buffer_size_gb=0.1, block_size_tokens=128, max_tokens=200, # setting low, but >= context.max_requests. 
- rounder=1, is_hybrid_model=is_hybrid_model, ) @@ -227,6 +235,7 @@ def test_token_overflow_error(self, is_hybrid_model: bool): ) # Exceeding max token count @pytest.mark.internal + @rounder_override(64) @pytest.mark.parametrize("is_hybrid_model", [False, True]) def test_reset(self, is_hybrid_model: bool): self._setup_model_parallel_group(1, 1) @@ -301,6 +310,7 @@ def test_reset(self, is_hybrid_model: bool): assert torch.all(dynamic_context.mamba_metadata.request_to_mamba_state_idx == -1) @pytest.mark.internal + @rounder_override(64) @pytest.mark.parametrize("is_hybrid_model", [False, True]) def test_allocate_and_release_memory_blocks(self, is_hybrid_model): self._setup_model_parallel_group(1, 1) @@ -349,6 +359,7 @@ def test_allocate_and_release_memory_blocks(self, is_hybrid_model): ) @pytest.mark.internal + @rounder_override(64) @pytest.mark.parametrize("is_hybrid_model", [False, True]) def test_add_request(self, is_hybrid_model: bool): self._setup_model_parallel_group(1, 1) @@ -422,6 +433,7 @@ def test_add_request(self, is_hybrid_model: bool): ) @pytest.mark.internal + @rounder_override(64) def test_add_dummy_requests_parallel_populates_state(self): self._setup_model_parallel_group(1, 1) @@ -520,6 +532,7 @@ def test_add_dummy_requests_parallel_populates_state(self): ) @pytest.mark.internal + @rounder_override(64) def test_add_dummy_requests_parallel_hybrid_allocates_mamba(self): self._setup_model_parallel_group(1, 1) @@ -550,6 +563,7 @@ def test_add_dummy_requests_parallel_hybrid_allocates_mamba(self): assert torch.all(dynamic_context.mamba_ssm_states[:, mamba_idx] == 0) @pytest.mark.internal + @rounder_override(64) def test_add_dummy_requests_parallel_decode_does_not_count_as_prefill(self): self._setup_model_parallel_group(1, 1) @@ -575,6 +589,7 @@ def test_add_dummy_requests_parallel_decode_does_not_count_as_prefill(self): assert dynamic_context.num_prefill_requests == 0 @pytest.mark.internal + @rounder_override(64) @pytest.mark.parametrize("is_hybrid_model", 
[False, True]) def test_update_request(self, is_hybrid_model: bool): self._setup_model_parallel_group(1, 1) @@ -774,6 +789,7 @@ def test_update_request(self, is_hybrid_model: bool): ) @pytest.mark.internal + @rounder_override(64) @pytest.mark.parametrize("is_hybrid_model", [False, True]) def test_release_memory_blocks_for_finished_requests(self, is_hybrid_model): """Test that memory blocks are correctly released for finished requests.""" @@ -846,6 +862,7 @@ def test_release_memory_blocks_for_finished_requests(self, is_hybrid_model): assert mamba_idx[4] == -1 @pytest.mark.internal + @rounder_override(64) @pytest.mark.parametrize("is_hybrid_model", [False, True]) def test_finished_requests_with_multiple_blocks(self, is_hybrid_model): """Test that all memory blocks are correctly released for finished requests that use multiple blocks.""" @@ -913,6 +930,7 @@ def test_finished_requests_with_multiple_blocks(self, is_hybrid_model): assert dynamic_context.block_allocator.total_avail == initial_available_blocks + 6 @pytest.mark.internal + @rounder_override(64) @pytest.mark.parametrize("is_hybrid_model", [False, True]) def test_mamba_states_cache(self, is_hybrid_model: bool): self._setup_model_parallel_group(1, 1) @@ -988,6 +1006,7 @@ def test_mamba_states_cache(self, is_hybrid_model: bool): assert torch.all(ssm_state_layer3 == 40.0) @pytest.mark.internal + @rounder_override(64) def test_calculate_and_store_log_probs(self): self._setup_model_parallel_group(1, 1) dynamic_context = self._get_dynamic_context( @@ -1205,6 +1224,7 @@ def test_calculate_and_store_log_probs(self): current_global_token_offset += expected_len @pytest.mark.internal + @rounder_override(64) def test_pipeline_parallel_uneven_layers(self): """ Test that DynamicInferenceContext synchronizes the total block count across @@ -1215,23 +1235,39 @@ def test_pipeline_parallel_uneven_layers(self): rank = parallel_state.get_pipeline_model_parallel_rank() + mamba_conv_states_shape = (544, 4) + mamba_ssm_states_shape 
= (8, 64, 16) + if rank == 0: - local_num_layers = 12 + mamba_inference_state_config = MambaInferenceStateConfig( + [Symbols.MAMBA] + [Symbols.ATTENTION] * 4, + mamba_conv_states_shape, + mamba_ssm_states_shape, + ) else: - local_num_layers = 4 + mamba_inference_state_config = MambaInferenceStateConfig( + [Symbols.MAMBA] * 4 + [Symbols.ATTENTION], + mamba_conv_states_shape, + mamba_ssm_states_shape, + ) context = DynamicInferenceContext( - params_dtype=torch.float32, - num_layers=local_num_layers, - kv_channels=64, - num_attention_heads=8, - max_sequence_length=128, - buffer_size_gb=0.1, - block_size_tokens=16, - max_tokens=1024, - pipeline_model_parallel_size=pp_size, - tensor_model_parallel_size=1, - unified_memory_level=0, + model_config=TransformerConfig( + params_dtype=torch.float32, + num_layers=10, + kv_channels=64, + num_attention_heads=8, + pipeline_model_parallel_size=pp_size, + tensor_model_parallel_size=1, + pipeline_dtype=torch.float32, + ), + inference_config=InferenceConfig( + max_sequence_length=128, + buffer_size_gb=0.1, + block_size_tokens=16, + max_tokens=1024, + unified_memory_level=0, + ), ) # Collect the total block counts on each rank diff --git a/tests/unit_tests/inference/engines/test_dynamic_engine.py b/tests/unit_tests/inference/engines/test_dynamic_engine.py index d5803b3638e..2e935cab4bd 100644 --- a/tests/unit_tests/inference/engines/test_dynamic_engine.py +++ b/tests/unit_tests/inference/engines/test_dynamic_engine.py @@ -13,9 +13,7 @@ from transformer_engine.pytorch.fp8 import check_fp8_support from megatron.core import parallel_state -from megatron.core.inference.contexts.attention_context.mamba_metadata import ( - MambaInferenceStateConfig, -) +from megatron.core.inference.config import InferenceConfig, MambaInferenceStateConfig from megatron.core.inference.contexts.dynamic_context import ( ActiveRequestCountOverflowError, BlockOverflowError, @@ -28,9 +26,6 @@ from 
megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, ) -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.text_generation_controllers.text_generation_controller import ( TextGenerationController, @@ -48,11 +43,7 @@ from megatron.core.transformer.cuda_graphs import CudaGraphManager, _CudagraphGlobalRecord from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.transformer_config import TransformerConfig -from megatron.core.utils import ( - get_mamba_inference_state_config_from_model, - is_fa_min_version, - is_te_min_version, -) +from megatron.core.utils import is_fa_min_version, is_te_min_version from tests.unit_tests.test_utilities import Utils @@ -223,26 +214,22 @@ def _build_inference_context( # Inference context. context = DynamicInferenceContext( - params_dtype=transformer_config.params_dtype, - num_layers=transformer_config.num_layers - // transformer_config.pipeline_model_parallel_size, - kv_channels=transformer_config.kv_channels, - num_attention_heads=transformer_config.num_query_groups, - max_sequence_length=test_config.max_sequence_length, - num_cuda_graphs=test_config.num_cuda_graphs, - use_cuda_graphs_for_non_decode_steps=True, - buffer_size_gb=test_config.context_buffer_size_gb, - paused_buffer_size_gb=test_config.context_paused_buffer_size_gb, - block_size_tokens=test_config.context_block_size_tokens, - max_requests=test_config.context_max_requests, - max_tokens=test_config.context_max_tokens, - tensor_model_parallel_size=transformer_config.tensor_model_parallel_size, - pipeline_model_parallel_size=transformer_config.pipeline_model_parallel_size, - mamba_inference_state_config=mamba_inference_state_config, - materialize_only_last_token_logits=test_config.materialize_only_last_token_logits, - 
use_flashinfer_fused_rope=None, # default to using flash-infer if available - # this is for compatibility with the LTS environment - unified_memory_level=0, # unit tests currently broken with UVM + model_config=transformer_config, + inference_config=InferenceConfig( + max_sequence_length=test_config.max_sequence_length, + num_cuda_graphs=test_config.num_cuda_graphs, + use_cuda_graphs_for_non_decode_steps=True, + buffer_size_gb=test_config.context_buffer_size_gb, + paused_buffer_size_gb=test_config.context_paused_buffer_size_gb, + block_size_tokens=test_config.context_block_size_tokens, + max_requests=test_config.context_max_requests, + max_tokens=test_config.context_max_tokens, + mamba_inference_state_config=mamba_inference_state_config, + materialize_only_last_token_logits=test_config.materialize_only_last_token_logits, + use_flashinfer_fused_rope=None, # default to using flash-infer if available + # this is for compatibility with the LTS environment + unified_memory_level=0, # unit tests currently broken with UVM + ), ) return context @@ -382,17 +369,7 @@ def _build_test_env(cls, test_config): model.eval() - mamba_inference_state_config = get_mamba_inference_state_config_from_model(model) - - # Inference config. - inference_config = InferenceWrapperConfig( - hidden_size=transformer_config.hidden_size, - inference_batch_times_seqlen_threshold=400, - fp32_residual_connection=False, - params_dtype=transformer_config.params_dtype, - fp8=transformer_config.fp8, - padded_vocab_size=test_config.vocab_size, - ) + mamba_inference_state_config = MambaInferenceStateConfig.from_model(model) # Inference context. inference_context = cls._build_inference_context( @@ -403,7 +380,7 @@ def _build_test_env(cls, test_config): ) # Inference model wrapper. 
- inference_wrapped_model = GPTInferenceWrapper(model, inference_config, inference_context) + inference_wrapped_model = GPTInferenceWrapper(model, inference_context) # Note: the following is taken from AbstractModelInferenceWrapper.prep_model_for_inference(). inference_wrapped_model.model_is_pipeline_parallel = not ( @@ -424,13 +401,7 @@ def _build_test_env(cls, test_config): CudaGraphManager.global_mempool = None # Inference engine. - engine = DynamicInferenceEngine( - text_generation_controller, - inference_context, - random_seed=test_config.random_seed, - enable_cuda_graph=transformer_config.cuda_graph_impl == "local", - enable_chunked_prefill=test_config.enable_chunked_prefill, - ) + engine = DynamicInferenceEngine(text_generation_controller, inference_context) # Test env. env = DynamicEngineTestEnv(config=test_config, requests=requests, engine=engine) diff --git a/tests/unit_tests/inference/engines/test_static_engine.py b/tests/unit_tests/inference/engines/test_static_engine.py index 03b3712e39a..483a21d13bd 100644 --- a/tests/unit_tests/inference/engines/test_static_engine.py +++ b/tests/unit_tests/inference/engines/test_static_engine.py @@ -20,9 +20,6 @@ from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, ) -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.text_generation_controllers.text_generation_controller import ( TextGenerationController, @@ -85,20 +82,11 @@ def setup_engine( ).cuda() gpt_model.to(inference_config_params_dtype) - inference_wrapper_config = InferenceWrapperConfig( - hidden_size=self.hidden_size, - inference_batch_times_seqlen_threshold=400, - inference_max_requests=self.batch_size, - fp32_residual_connection=False, - params_dtype=inference_config_params_dtype, - padded_vocab_size=self.vocab_size, + inference_context = 
StaticInferenceContext( + max_batch_size=self.batch_size, max_sequence_length=self.sequence_length ) - inference_context = StaticInferenceContext.from_config(inference_wrapper_config) - - inference_wrapped_model = GPTInferenceWrapper( - gpt_model, inference_wrapper_config, inference_context - ) + inference_wrapped_model = GPTInferenceWrapper(gpt_model, inference_context) self.mock_tokenizer = mock.Mock() # Set required tokenizer attributes before engine creation self.mock_tokenizer.vocab_size = self.vocab_size @@ -200,8 +188,6 @@ def test_generate_dynamic(self, batch_size: int, num_trials: int, empty_prompt: assert len(results) == batch_size for result in results: - if isinstance(result, DynamicInferenceRequestRecord): - result = result.merge() assert isinstance(result, InferenceRequest), ( "expected ; found <%s>." % type(result).__name__ ) diff --git a/tests/unit_tests/inference/model_inference_wrappers/gpt/test_gpt_inference_wrapper.py b/tests/unit_tests/inference/model_inference_wrappers/gpt/test_gpt_inference_wrapper.py index 07afebe1067..d7ddaa1e680 100644 --- a/tests/unit_tests/inference/model_inference_wrappers/gpt/test_gpt_inference_wrapper.py +++ b/tests/unit_tests/inference/model_inference_wrappers/gpt/test_gpt_inference_wrapper.py @@ -10,9 +10,6 @@ from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, ) -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) from megatron.core.models.gpt.gpt_layer_specs import ( get_gpt_layer_local_spec, get_gpt_layer_with_transformer_engine_spec, @@ -53,27 +50,15 @@ def setup_model(self, tensor_parallel_size, pipeline_parallel_size): post_process=parallel_state.is_pipeline_last_stage(), ).cuda() - inference_wrapper_config = InferenceWrapperConfig( - hidden_size=hidden_size, - inference_batch_times_seqlen_threshold=20, - inference_max_requests=self.batch_size, - fp32_residual_connection=False, - 
params_dtype=torch.float, - padded_vocab_size=self.vocab_size, - ) + inference_context = StaticInferenceContext(self.batch_size, self.sequence_length) - inference_context = StaticInferenceContext.from_config(inference_wrapper_config) - - self.inference_wrapped_model = GPTInferenceWrapper( - gpt_model, inference_wrapper_config, inference_context - ) + self.inference_wrapped_model = GPTInferenceWrapper(gpt_model, inference_context) def teardown_method(self, method): Utils.destroy_model_parallel() - # This will call the inference_wrapped_model.forward_pass_with_pipeline_parallel_small_input_batch() @pytest.mark.parametrize("materialize_only_last_token_logits", [True, False]) - def test_inference_pipeline_parallel_small_size(self, materialize_only_last_token_logits): + def test_inference_pipeline_parallel(self, materialize_only_last_token_logits): self.setup_model(tensor_parallel_size=2, pipeline_parallel_size=2) batch_prompt_tokens = ( @@ -82,7 +67,7 @@ def test_inference_pipeline_parallel_small_size(self, materialize_only_last_toke .cuda() ) self.inference_wrapped_model.prep_model_for_inference() - self.inference_wrapped_model.inference_context.materialize_only_last_token_logits = ( + self.inference_wrapped_model.inference_context.config.materialize_only_last_token_logits = ( materialize_only_last_token_logits ) @@ -107,42 +92,6 @@ def test_inference_pipeline_parallel_small_size(self, materialize_only_last_toke self.vocab_size, ), f"Shape mismatch . 
Expected {(self.batch_size, logits_seq_len, self.vocab_size)}, but got {logits.shape}" - # This will call the inference_wrapped_model.forward_pass_with_pipeline_parallel_large_input_batch() - @pytest.mark.parametrize("materialize_only_last_token_logits", [True, False]) - def test_inference_pipeline_parallel_large_size(self, materialize_only_last_token_logits): - self.setup_model(tensor_parallel_size=2, pipeline_parallel_size=2) - - batch_prompt_tokens = ( - torch.randint(low=0, high=self.vocab_size, size=(self.batch_size, self.sequence_length)) - .int() - .cuda() - ) - self.inference_wrapped_model.prep_model_for_inference() - self.inference_wrapped_model.inference_context.materialize_only_last_token_logits = ( - materialize_only_last_token_logits - ) - - inference_input = self.inference_wrapped_model.prep_inference_input( - prompts_tokens=batch_prompt_tokens - ) - - inference_input_for_context_window = ( - self.inference_wrapped_model.get_batch_for_context_window(inference_input, 0, 10) - ) - - logits_seq_len = 1 if materialize_only_last_token_logits else 10 - - logits = self.inference_wrapped_model.run_one_forward_step( - inference_input_for_context_window - ) - - if parallel_state.is_pipeline_last_stage(): - assert logits.shape == ( - self.batch_size, - logits_seq_len, - self.vocab_size, - ), f"Shape mismatch . 
Expected {(self.batch_size, logits_seq_len, self.vocab_size)}, but got {logits.shape}" - @pytest.mark.parametrize("materialize_only_last_token_logits", [True, False]) def test_inference_only_tensor_parallel(self, materialize_only_last_token_logits): self.setup_model(tensor_parallel_size=4, pipeline_parallel_size=1) @@ -153,7 +102,7 @@ def test_inference_only_tensor_parallel(self, materialize_only_last_token_logits .cuda() ) self.inference_wrapped_model.prep_model_for_inference() - self.inference_wrapped_model.inference_context.materialize_only_last_token_logits = ( + self.inference_wrapped_model.inference_context.config.materialize_only_last_token_logits = ( materialize_only_last_token_logits ) diff --git a/tests/unit_tests/inference/model_inference_wrappers/t5/test_t5_inference_wrapper.py b/tests/unit_tests/inference/model_inference_wrappers/t5/test_t5_inference_wrapper.py index 36d5187b5eb..eb06f6ed78b 100644 --- a/tests/unit_tests/inference/model_inference_wrappers/t5/test_t5_inference_wrapper.py +++ b/tests/unit_tests/inference/model_inference_wrappers/t5/test_t5_inference_wrapper.py @@ -1,3 +1,5 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ from argparse import Namespace from copy import deepcopy from unittest import mock @@ -7,9 +9,6 @@ from megatron.core import parallel_state from megatron.core.inference.contexts import StaticInferenceContext -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) from megatron.core.inference.model_inference_wrappers.t5.t5_inference_wrapper import ( T5InferenceWrapper, ) @@ -77,19 +76,9 @@ def setup_model(self, tensor_parallel_size, pipeline_parallel_size): add_decoder=True, ).cuda() - inference_wrapper_config = InferenceWrapperConfig( - hidden_size=hidden_size, - inference_batch_times_seqlen_threshold=-1, - fp32_residual_connection=False, - params_dtype=torch.float, - padded_vocab_size=self.vocab_size, - ) - - inference_context = StaticInferenceContext.from_config(inference_wrapper_config) + inference_context = StaticInferenceContext(max_batch_size=8, max_sequence_length=2560) - self.inference_wrapped_model = T5InferenceWrapper( - t5_model, inference_wrapper_config, inference_context - ) + self.inference_wrapped_model = T5InferenceWrapper(t5_model, inference_context) def teardown_method(self, method): Utils.destroy_model_parallel() diff --git a/tests/unit_tests/inference/model_inference_wrappers/test_model_inference_wrapper_config.py b/tests/unit_tests/inference/model_inference_wrappers/test_model_inference_wrapper_config.py deleted file mode 100644 index 794634760d0..00000000000 --- a/tests/unit_tests/inference/model_inference_wrappers/test_model_inference_wrapper_config.py +++ /dev/null @@ -1,21 +0,0 @@ -import torch - -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) - - -class TestModelInferenceWrapperConfig: - - def test_inference_config(self): - inference_config = InferenceWrapperConfig( - hidden_size=10, - inference_batch_times_seqlen_threshold=10, - padded_vocab_size=10, - params_dtype=torch.float, - 
fp32_residual_connection=False, - ) - inference_config.add_attributes({"abc": 45}) - assert ( - inference_config.abc == 45 - ), f"min tokens not set correctly. it is {inference_config.min_tokens}" diff --git a/tests/unit_tests/inference/test_inference_config.py b/tests/unit_tests/inference/test_inference_config.py new file mode 100644 index 00000000000..6d58328dade --- /dev/null +++ b/tests/unit_tests/inference/test_inference_config.py @@ -0,0 +1,17 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +import dataclasses + +from megatron.core.inference.config import InferenceConfig +from megatron.core.transformer.transformer_config import TransformerConfig + + +class TestInferenceConfig: + def test_mutual_exclusivity_with_transformer_config(self): + """ + Ensure mutual exclusivity between fields in `InferenceConfig` and + `TransformerConfig`. + """ + dynamic_inference_config_fields = set(dataclasses.fields(InferenceConfig)) + transformer_config_fields = set(dataclasses.fields(TransformerConfig)) + assert len(dynamic_inference_config_fields.intersection(transformer_config_fields)) == 0 diff --git a/tests/unit_tests/inference/test_wandb_logging.py b/tests/unit_tests/inference/test_wandb_logging.py index cab464af503..1417926f13b 100644 --- a/tests/unit_tests/inference/test_wandb_logging.py +++ b/tests/unit_tests/inference/test_wandb_logging.py @@ -7,6 +7,7 @@ import pytest import torch +from megatron.core.inference.config import InferenceConfig from megatron.core.inference.contexts.dynamic_context import DynamicInferenceContext from megatron.core.inference.engines import DynamicInferenceEngine from megatron.core.inference.inference_request import DynamicInferenceRequest @@ -15,6 +16,7 @@ TextGenerationController, ) from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig from tests.unit_tests.test_utilities import Utils @@ -50,20 +52,26 @@ 
def _get_dynamic_context( max_sequence_length=512, buffer_size_gb=0.03, block_size_tokens=128, + logging_step_interval=0, metrics_writer=None, ): """Helper to create a DynamicInferenceContext.""" return DynamicInferenceContext( - params_dtype=params_dtype, - num_layers=num_layers, - kv_channels=kv_channels, - num_attention_heads=num_attention_heads, - max_sequence_length=max_sequence_length, - num_cuda_graphs=None, - buffer_size_gb=buffer_size_gb, - block_size_tokens=block_size_tokens, - metrics_writer=metrics_writer, - unified_memory_level=0, # unit tests currently broken with UVM + model_config=TransformerConfig( + params_dtype=params_dtype, + num_layers=num_layers, + kv_channels=kv_channels, + num_attention_heads=num_attention_heads, + ), + inference_config=InferenceConfig( + max_sequence_length=max_sequence_length, + num_cuda_graphs=None, + buffer_size_gb=buffer_size_gb, + block_size_tokens=block_size_tokens, + unified_memory_level=0, # unit tests currently broken with UVM + logging_step_interval=logging_step_interval, + metrics_writer=metrics_writer, + ), ) @pytest.mark.internal @@ -195,12 +203,14 @@ def test_kvcache_utilization_stats_types(self): @pytest.mark.internal @patch('megatron.core.inference.engines.dynamic_engine.HAVE_WANDB', True) def test_engine_logging_step_interval_zero(self): - """Test that no logging occurs when inference_logging_step_interval is 0.""" + """Test that no logging occurs when logging_step_interval is 0.""" mock_wandb = Mock() mock_wandb.__name__ = "wandb" mock_wandb.log = Mock() - dynamic_context = self._get_dynamic_context(metrics_writer=mock_wandb) + dynamic_context = self._get_dynamic_context( + logging_step_interval=0, metrics_writer=mock_wandb + ) # Create mock controller with proper spec to pass isinstance checks mock_controller = create_autospec(TextGenerationController, instance=True) @@ -210,12 +220,7 @@ def test_engine_logging_step_interval_zero(self): mock_controller.inference_wrapped_model.model.config = Mock() 
mock_controller.inference_wrapped_model.model.config.cuda_graph_impl = "none" - engine = DynamicInferenceEngine( - controller=mock_controller, - context=dynamic_context, - random_seed=123, - inference_logging_step_interval=0, # Disabled - ) + engine = DynamicInferenceEngine(controller=mock_controller, context=dynamic_context) # Verify log was never called mock_wandb.log.assert_not_called() @@ -225,15 +230,16 @@ def test_paused_requests_in_stats(self): """Test that paused requests are correctly reflected in stats.""" set_rounder(1) dynamic_context = DynamicInferenceContext( - params_dtype=torch.float32, - num_layers=2, - kv_channels=64, - num_attention_heads=8, - max_sequence_length=128, - num_cuda_graphs=None, - buffer_size_gb=0.01, # Small buffer to force pausing - block_size_tokens=32, - unified_memory_level=0, # unit tests currently broken with UVM + model_config=TransformerConfig( + params_dtype=torch.float32, num_layers=2, kv_channels=64, num_attention_heads=8 + ), + inference_config=InferenceConfig( + max_sequence_length=128, + num_cuda_graphs=None, + buffer_size_gb=0.01, # Small buffer to force pausing + block_size_tokens=32, + unified_memory_level=0, # unit tests currently broken with UVM + ), ) # Add multiple requests to potentially trigger pausing @@ -257,7 +263,7 @@ def test_paused_requests_in_stats(self): @pytest.mark.internal def test_metrics_writer_none_handling(self): """Test that engine handles None metrics_writer gracefully.""" - dynamic_context = self._get_dynamic_context(metrics_writer=None) + dynamic_context = self._get_dynamic_context(logging_step_interval=10, metrics_writer=None) # Create mock controller with proper spec to pass isinstance checks mock_controller = create_autospec(TextGenerationController, instance=True) @@ -268,13 +274,8 @@ def test_metrics_writer_none_handling(self): mock_controller.inference_wrapped_model.model.config.cuda_graph_impl = "none" # Should not raise error even with logging interval set - engine = 
DynamicInferenceEngine( - controller=mock_controller, - context=dynamic_context, - random_seed=123, - inference_logging_step_interval=10, - ) + engine = DynamicInferenceEngine(controller=mock_controller, context=dynamic_context) # Verify engine was created successfully - assert engine.inference_logging_step_interval == 10 - assert engine.context.metrics_writer is None + assert engine.logging_step_interval == 10 + assert engine.metrics_writer is None diff --git a/tests/unit_tests/inference/text_generation_controllers/test_encoder_decoder_text_generation_controller.py b/tests/unit_tests/inference/text_generation_controllers/test_encoder_decoder_text_generation_controller.py index 93a208710fc..5bd39ec1324 100644 --- a/tests/unit_tests/inference/text_generation_controllers/test_encoder_decoder_text_generation_controller.py +++ b/tests/unit_tests/inference/text_generation_controllers/test_encoder_decoder_text_generation_controller.py @@ -1,3 +1,5 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ import random import string import time @@ -12,9 +14,6 @@ from megatron.core.inference.contexts import StaticInferenceContext from megatron.core.inference.inference_request import InferenceRequest, Status -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) from megatron.core.inference.model_inference_wrappers.t5.t5_inference_wrapper import ( T5InferenceWrapper, ) @@ -85,19 +84,9 @@ def setup_method(self, method): add_decoder=True, ).cuda() - inference_wrapper_config = InferenceWrapperConfig( - hidden_size=hidden_size, - inference_batch_times_seqlen_threshold=-1, - fp32_residual_connection=False, - params_dtype=torch.float, - padded_vocab_size=self.vocab_size, - ) - - inference_context = StaticInferenceContext.from_config(inference_wrapper_config) + inference_context = StaticInferenceContext(max_batch_size=8, max_sequence_length=2560) - inference_wrapped_model = T5InferenceWrapper( - t5_model, inference_wrapper_config, inference_context - ) + inference_wrapped_model = T5InferenceWrapper(t5_model, inference_context) self.mock_tokenizer = mock.Mock() diff --git a/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py b/tests/unit_tests/inference/text_generation_controllers/test_text_generation_controller.py similarity index 96% rename from tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py rename to tests/unit_tests/inference/text_generation_controllers/test_text_generation_controller.py index 0885401e7a0..bdf95c2d9bf 100644 --- a/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py +++ b/tests/unit_tests/inference/text_generation_controllers/test_text_generation_controller.py @@ -14,6 +14,7 @@ from transformer_engine.pytorch.fp8 import check_fp8_support from megatron.core import parallel_state +from megatron.core.inference.config import InferenceConfig from 
megatron.core.inference.contexts import DynamicInferenceContext, StaticInferenceContext from megatron.core.inference.contexts.dynamic_context import MaxSequenceLengthOverflowError from megatron.core.inference.inference_request import ( @@ -24,9 +25,6 @@ from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, ) -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.text_generation_controllers.text_generation_controller import ( TextGenerationController, @@ -100,37 +98,24 @@ def setup_model( if dtype == torch.bfloat16: gpt_model = Float16Module(gpt_model.config, gpt_model) - inference_wrapper_config = InferenceWrapperConfig( - hidden_size=self.hidden_size, - inference_batch_times_seqlen_threshold=-1, - inference_max_seq_length=2048, - inference_max_requests=16 if fp8 else self.batch_size, - fp32_residual_connection=False, - params_dtype=dtype, - padded_vocab_size=self.vocab_size, - ) - if static: - inference_context = StaticInferenceContext.from_config(inference_wrapper_config) + inference_context = StaticInferenceContext( + max_batch_size=16 if fp8 else self.batch_size, max_sequence_length=2048 + ) else: inference_context = DynamicInferenceContext( - params_dtype=dtype, - num_layers=transformer_config.num_layers // pipeline_model_parallel_size, - kv_channels=transformer_config.kv_channels, - num_attention_heads=transformer_config.num_attention_heads, - tensor_model_parallel_size=transformer_config.tensor_model_parallel_size, - pipeline_model_parallel_size=transformer_config.pipeline_model_parallel_size, - max_sequence_length=2048, - buffer_size_gb=0.2, - materialize_only_last_token_logits=False, - use_flashinfer_fused_rope=None, # default to using flash-infer if available - # this is for compatibility with the LTS environment - unified_memory_level=0, # 
unit tests currently broken with UVM + model_config=transformer_config, + inference_config=InferenceConfig( + max_sequence_length=2048, + buffer_size_gb=0.2, + materialize_only_last_token_logits=False, + use_flashinfer_fused_rope=None, # default to using flash-infer if available + # this is for compatibility with the LTS environment + unified_memory_level=0, # unit tests currently broken with UVM + ), ) - inference_wrapped_model = GPTInferenceWrapper( - gpt_model, inference_wrapper_config, inference_context - ) + inference_wrapped_model = GPTInferenceWrapper(gpt_model, inference_context) inference_wrapped_model.model_is_pipeline_parallel = not ( parallel_state.is_pipeline_first_stage() and parallel_state.is_pipeline_last_stage() diff --git a/tests/unit_tests/inference/text_generation_controllers/test_vlm_text_generation_controller.py b/tests/unit_tests/inference/text_generation_controllers/test_vlm_text_generation_controller.py index 31bf415ba56..50db5cc0afc 100644 --- a/tests/unit_tests/inference/text_generation_controllers/test_vlm_text_generation_controller.py +++ b/tests/unit_tests/inference/text_generation_controllers/test_vlm_text_generation_controller.py @@ -1,3 +1,5 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ import copy import os import random @@ -13,9 +15,6 @@ from megatron.core.inference.contexts import StaticInferenceContext from megatron.core.inference.inference_request import InferenceRequest, Status, VLMInferenceRequest -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) from megatron.core.inference.model_inference_wrappers.multimodal.vlm_inference_wrapper import ( VLMInferenceWrapper, ) @@ -92,19 +91,9 @@ def setup_method(self, method): self.image_token_index = self.model.image_token_index self.model = Float16Module(self.model.config, self.model) - inference_wrapper_config = InferenceWrapperConfig( - hidden_size=self.language_hidden_size, - inference_batch_times_seqlen_threshold=-1, - fp32_residual_connection=False, - params_dtype=torch.float, - padded_vocab_size=self.language_vocab_size, - ) - - inference_context = StaticInferenceContext.from_config(inference_wrapper_config) + inference_context = StaticInferenceContext(max_batch_size=8, max_sequence_length=2560) - inference_wrapped_model = VLMInferenceWrapper( - self.model, inference_wrapper_config, inference_context - ) + inference_wrapped_model = VLMInferenceWrapper(self.model, inference_context) self.mock_tokenizer = mock.Mock() diff --git a/tests/unit_tests/models/test_gpt_model.py b/tests/unit_tests/models/test_gpt_model.py index cf3bd40ee4b..87aba9c6ed9 100644 --- a/tests/unit_tests/models/test_gpt_model.py +++ b/tests/unit_tests/models/test_gpt_model.py @@ -12,6 +12,7 @@ from megatron.core import parallel_state from megatron.core.hyper_comm_grid import HyperCommGrid +from megatron.core.inference.config import InferenceConfig from megatron.core.inference.contexts.dynamic_context import DynamicInferenceContext from megatron.core.inference.inference_request import DynamicInferenceRequest from megatron.core.inference.sampling_params import SamplingParams @@ -392,14 +393,18 @@ def test_dynamic_inference_padding_with_fp8(self): config = 
self.gpt_model.config inference_context = DynamicInferenceContext( - params_dtype=config.params_dtype, - num_layers=config.num_layers, - kv_channels=config.hidden_size // config.num_attention_heads, - num_attention_heads=config.num_attention_heads, - max_sequence_length=self.gpt_model.module.max_sequence_length, - buffer_size_gb=1.0, - block_size_tokens=256, - materialize_only_last_token_logits=False, + model_config=TransformerConfig( + params_dtype=config.params_dtype, + num_layers=config.num_layers, + kv_channels=config.hidden_size // config.num_attention_heads, + num_attention_heads=config.num_attention_heads, + ), + inference_config=InferenceConfig( + max_sequence_length=self.gpt_model.module.max_sequence_length, + buffer_size_gb=1.0, + block_size_tokens=256, + materialize_only_last_token_logits=False, + ), ) # Add a request with 10 tokens. Since 10 is not a multiple of 64, diff --git a/tests/unit_tests/models/test_gpt_model_batch_invariant.py b/tests/unit_tests/models/test_gpt_model_batch_invariant.py index ead9125e5ec..9ab7e445c0d 100644 --- a/tests/unit_tests/models/test_gpt_model_batch_invariant.py +++ b/tests/unit_tests/models/test_gpt_model_batch_invariant.py @@ -5,17 +5,15 @@ import torch import torch.distributed as dist +from megatron.core.inference.config import InferenceConfig from megatron.core.inference.contexts.dynamic_context import DynamicInferenceContext from megatron.core.inference.engines.dynamic_engine import DynamicInferenceEngine from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, ) -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) from megatron.core.inference.sampling_params import SamplingParams -from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import ( - SimpleTextGenerationController, +from megatron.core.inference.text_generation_controllers.text_generation_controller 
import ( + TextGenerationController, ) from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec from megatron.core.models.gpt.gpt_model import GPTModel @@ -91,6 +89,8 @@ def _build_flash_attn_bik_model(seq_len: int, vocab_size: int, hidden_size: int normalization="RMSNorm", params_dtype=torch.bfloat16, attention_backend=AttnBackend.flash, + fp32_residual_connection=False, + nccl_all_reduce_for_prefill=False, ) cfg.fp16 = False cfg.bf16 = True @@ -184,32 +184,21 @@ def test_dynamic_engine_matches_batched_forward_rl(self): inference_model = Float16Module(base_model.config, base_model).cuda().eval() ctx = DynamicInferenceContext( - params_dtype=torch.bfloat16, - num_layers=base_model.config.num_layers, - kv_channels=base_model.config.kv_channels, - num_attention_heads=base_model.config.num_attention_heads, - max_sequence_length=seq_len, - buffer_size_gb=0.125, - block_size_tokens=16, - num_cuda_graphs=None, - materialize_only_last_token_logits=False, - use_cuda_graphs_for_non_decode_steps=False, - unified_memory_level=0, + model_config=base_model.config, + inference_config=InferenceConfig( + max_sequence_length=seq_len, + buffer_size_gb=0.125, + block_size_tokens=16, + num_cuda_graphs=None, + materialize_only_last_token_logits=False, + use_cuda_graphs_for_non_decode_steps=False, + unified_memory_level=0, + ), ) - wrapper_cfg = InferenceWrapperConfig( - hidden_size=base_model.config.hidden_size, - inference_batch_times_seqlen_threshold=-1, - fp32_residual_connection=False, - params_dtype=torch.bfloat16, - padded_vocab_size=vocab_size, - inference_max_seq_length=seq_len, - inference_max_requests=8, - nccl_all_reduce_for_prefill=False, - ) - wrapper = GPTInferenceWrapper(inference_model, wrapper_cfg, ctx) + wrapper = GPTInferenceWrapper(inference_model, ctx) tokenizer = DummyTokenizer(vocab_size=vocab_size, bos=None, eod=vocab_size - 1, pad=0) - controller = SimpleTextGenerationController(wrapper, tokenizer) + controller = 
TextGenerationController(wrapper, tokenizer) engine = DynamicInferenceEngine( controller=controller, context=ctx, enable_cuda_graph=False, random_seed=123 ) @@ -273,32 +262,21 @@ def test_dynamic_engine_is_batch_invariant(self): def _run_engine_with_order(order): ctx = DynamicInferenceContext( - params_dtype=torch.bfloat16, - num_layers=base_model.config.num_layers, - kv_channels=base_model.config.kv_channels, - num_attention_heads=base_model.config.num_attention_heads, - max_sequence_length=seq_len, - buffer_size_gb=0.125, - block_size_tokens=16, - num_cuda_graphs=None, - materialize_only_last_token_logits=False, - use_cuda_graphs_for_non_decode_steps=False, - unified_memory_level=0, + model_config=base_model.config, + inference_config=InferenceConfig( + max_sequence_length=seq_len, + buffer_size_gb=0.125, + block_size_tokens=16, + num_cuda_graphs=None, + materialize_only_last_token_logits=False, + use_cuda_graphs_for_non_decode_steps=False, + unified_memory_level=0, + ), ) - wrapper_cfg = InferenceWrapperConfig( - hidden_size=base_model.config.hidden_size, - inference_batch_times_seqlen_threshold=-1, - fp32_residual_connection=False, - params_dtype=torch.bfloat16, - padded_vocab_size=vocab_size, - inference_max_seq_length=seq_len, - inference_max_requests=8, - nccl_all_reduce_for_prefill=False, - ) - wrapper = GPTInferenceWrapper(inference_model, wrapper_cfg, ctx) + wrapper = GPTInferenceWrapper(inference_model, ctx) tokenizer = DummyTokenizer(vocab_size=vocab_size, bos=None, eod=vocab_size - 1, pad=0) - controller = SimpleTextGenerationController(wrapper, tokenizer) + controller = TextGenerationController(wrapper, tokenizer) engine = DynamicInferenceEngine( controller=controller, context=ctx, enable_cuda_graph=False, random_seed=123 ) diff --git a/tests/unit_tests/models/test_mamba_model.py b/tests/unit_tests/models/test_mamba_model.py index 9eb7b2dea9a..29e3630d7bb 100644 --- a/tests/unit_tests/models/test_mamba_model.py +++
b/tests/unit_tests/models/test_mamba_model.py @@ -10,6 +10,7 @@ from megatron.core import parallel_state from megatron.core.hyper_comm_grid import HyperCommGrid +from megatron.core.inference.config import InferenceConfig, MambaInferenceStateConfig from megatron.core.inference.contexts import BaseInferenceContext, StaticInferenceContext from megatron.core.inference.contexts.dynamic_context import DynamicInferenceContext from megatron.core.inference.inference_request import DynamicInferenceRequest @@ -21,12 +22,7 @@ from megatron.core.transformer import TransformerConfig from megatron.core.transformer.enums import AttnBackend from megatron.core.transformer.module import Float16Module -from megatron.core.utils import ( - divide, - get_mamba_inference_state_config_from_model, - is_fa_min_version, - is_torch_min_version, -) +from megatron.core.utils import divide, is_fa_min_version, is_torch_min_version from tests.unit_tests.test_utilities import Utils @@ -344,20 +340,17 @@ def test_dynamic_inference_padding_with_fp8(self): self.model.eval() config = self.model.config - mamba_inference_state_config = get_mamba_inference_state_config_from_model( - self.model.module - ) + mamba_inference_state_config = MambaInferenceStateConfig.from_model(self.model.module) inference_context = DynamicInferenceContext( - params_dtype=config.params_dtype, - num_layers=config.num_layers, - kv_channels=config.hidden_size // config.num_attention_heads, - num_attention_heads=config.num_attention_heads, - max_sequence_length=self.model.module.max_sequence_length, - buffer_size_gb=1.0, - block_size_tokens=256, - materialize_only_last_token_logits=False, - mamba_inference_state_config=mamba_inference_state_config, + model_config=self.model.config, + inference_config=InferenceConfig( + max_sequence_length=self.model.module.max_sequence_length, + buffer_size_gb=1.0, + block_size_tokens=256, + materialize_only_last_token_logits=False, + mamba_inference_state_config=mamba_inference_state_config, + ), 
) # Add a request with 10 tokens. Since 10 is not a multiple of 64 (TOKEN_ROUNDER), diff --git a/tests/unit_tests/models/test_mamba_moe_model.py b/tests/unit_tests/models/test_mamba_moe_model.py index 3c7ae93a17c..a5590a0ffad 100644 --- a/tests/unit_tests/models/test_mamba_moe_model.py +++ b/tests/unit_tests/models/test_mamba_moe_model.py @@ -166,6 +166,7 @@ "moe_layer_freq": 1, "moe_layer_recompute": False, "moe_pad_expert_input_to_capacity": False, + "moe_pad_experts_for_cuda_graph_inference": False, "moe_per_layer_logging": False, "moe_permute_fusion": False, "moe_router_bias_update_rate": 0.001, @@ -197,6 +198,7 @@ "mtp_num_layers": None, "mtp_standalone": False, "multi_latent_attention": False, + "nccl_all_reduce_for_prefill": False, "no_rope_freq": None, "no_sync_func": None, "normalization": "RMSNorm", diff --git a/tools/run_dynamic_text_generation_server.py b/tools/run_dynamic_text_generation_server.py index a6b1b5c8398..c09c788ca8e 100644 --- a/tools/run_dynamic_text_generation_server.py +++ b/tools/run_dynamic_text_generation_server.py @@ -5,25 +5,19 @@ import torch -from examples.inference.gpt.gpt_dynamic_inference import ( - add_dynamic_inference_args, - get_inference_context, - get_inference_controller, - get_model, -) from megatron.core.inference.engines import DynamicInferenceEngine from megatron.core.inference.text_generation_server.dynamic_text_gen_server import run_flask_server -from megatron.core.tokenizers.text.utils.build_tokenizer import build_tokenizer -from megatron.core.utils import get_mamba_inference_state_config_from_model, trace_async_exceptions +from megatron.core.utils import trace_async_exceptions +from megatron.inference.utils import add_inference_args, get_dynamic_inference_engine from megatron.post_training.arguments import add_modelopt_args -from megatron.training import get_args, get_tokenizer +from megatron.training import get_args from megatron.training.initialize import initialize_megatron def 
add_text_generation_server_args(parser: argparse.ArgumentParser): """Adds the required command line arguments for running the text generation server.""" parser = add_modelopt_args(parser) - parser = add_dynamic_inference_args(parser) + parser = add_inference_args(parser) parser.add_argument("--port", type=int, default=5000, help="Port for Flask server to run on") return parser @@ -74,36 +68,12 @@ async def run_text_generation_server( args_defaults={'no_load_rng': True, 'no_load_optim': True}, ) - args = get_args() - model = get_model() - - if args.legacy_tokenizer: - tokenizer = get_tokenizer() - else: - tokenizer = build_tokenizer(args) - - mamba_inference_state_config = get_mamba_inference_state_config_from_model(model) - # Enable return_log_probs to allow prompt logprobs computation for echo=True requests # This sets materialize_only_last_token_logits=False in the inference context, # which is required for lm-eval compatibility (loglikelihood evaluation tasks) + args = get_args() args.return_log_probs = True - context = get_inference_context( - None, - None, - calculate_max_sequence_length_from_requests=False, - mamba_inference_state_config=mamba_inference_state_config, - ) - - controller = get_inference_controller(model, context) - - engine = DynamicInferenceEngine( - controller, - context, - enable_cuda_graph=args.cuda_graph_impl == "local", - random_seed=args.seed, - enable_chunked_prefill=not args.disable_chunked_prefill, - ) + engine = get_dynamic_inference_engine() asyncio.run(run_text_generation_server(engine, args.inference_coordinator_port, args.port)) diff --git a/tools/run_inference_performance_test.py b/tools/run_inference_performance_test.py index 32d61444530..430bb7ebb9a 100644 --- a/tools/run_inference_performance_test.py +++ b/tools/run_inference_performance_test.py @@ -10,33 +10,31 @@ from gpt_builders import gpt_builder from mamba_builders import mamba_builder -from megatron.core.inference.contexts import DynamicInferenceContext +from 
megatron.core.inference.contexts import StaticInferenceContext from megatron.core.inference.engines import DynamicInferenceEngine, StaticInferenceEngine from megatron.core.inference.engines.abstract_engine import AbstractEngine -from megatron.core.inference.inference_request import InferenceRequest +from megatron.core.inference.inference_request import ( + DynamicInferenceRequestRecord, + InferenceRequest, +) from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, ) -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.text_generation_controllers.text_generation_controller import ( TextGenerationController, ) from megatron.core.transformer.module import MegatronModule -from megatron.core.utils import get_mamba_inference_state_config_from_model +from megatron.inference.utils import add_inference_args, get_dynamic_inference_engine from model_provider import model_provider sys.path.append( os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)) ) -import asyncio from functools import partial -from typing import List, Union +from typing import List -from examples.inference.gpt.utils import add_common_inference_args from megatron.core import mpu from megatron.training import get_args, get_model, get_tokenizer from megatron.training.checkpointing import load_checkpoint @@ -47,7 +45,7 @@ def add_inference_benchmarking_args(parser): """Inference benchmarking arguments.""" - parser = add_common_inference_args(parser) + parser = add_inference_args(parser) group = parser.add_argument_group(title='inference_benchmarking') @@ -60,7 +58,6 @@ def add_inference_benchmarking_args(parser): group.add_argument( "--benchmark-profile", action="store_true", default=False, help="If set, profile" ) - group.add_argument('--stream', 
action="store_true", default=False, help="If set, stream tokens") return parser @@ -74,24 +71,13 @@ def get_inference_engine(args: argparse.Namespace, model: MegatronModule) -> Abs Returns: AbstractBackend: The chosen backend """ - tokenizer = get_tokenizer() - - inference_wrapper_config = InferenceWrapperConfig( - hidden_size=args.hidden_size, - inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold, - fp32_residual_connection=args.fp32_residual_connection, - params_dtype=args.params_dtype, - padded_vocab_size=args.padded_vocab_size, - inference_max_requests=args.inference_max_batch_size, - inference_max_seq_length=args.inference_max_seq_length, - nccl_all_reduce_for_prefill=args.nccl_all_reduce_for_prefill, - moe_pad_experts_for_cuda_graph_inference=args.moe_pad_experts_for_cuda_graph_inference, - ) - - mamba_inference_state_config = get_mamba_inference_state_config_from_model(model) if args.engine_type == "static": - inference_wrapped_model = GPTInferenceWrapper(model, inference_wrapper_config) + tokenizer = get_tokenizer() + context = StaticInferenceContext( + args.inference_max_requests, args.inference_max_sequence_length + ) + inference_wrapped_model = GPTInferenceWrapper(model, context) inference_wrapped_model.model_is_pipeline_parallel = not ( mpu.is_pipeline_first_stage() and mpu.is_pipeline_last_stage() ) @@ -100,98 +86,7 @@ def get_inference_engine(args: argparse.Namespace, model: MegatronModule) -> Abs ) return StaticInferenceEngine(text_generation_controller=text_generation_controller) elif args.engine_type == "dynamic": - context = DynamicInferenceContext( - params_dtype=args.params_dtype, - num_layers=args.num_layers, - kv_channels=args.kv_channels, - num_attention_heads=( - args.num_query_groups if args.group_query_attention else args.num_attention_heads - ), - max_sequence_length=args.inference_max_seq_length, - num_cuda_graphs=( - args.inference_dynamic_batching_num_cuda_graphs - if args.cuda_graph_impl == "local" - 
else None - ), - buffer_size_gb=args.inference_dynamic_batching_buffer_size_gb, - buffer_guaranteed_fraction=args.inference_dynamic_batching_buffer_guaranteed_fraction, - buffer_overflow_factor=args.inference_dynamic_batching_buffer_overflow_factor, - max_requests_override=args.inference_dynamic_batching_max_requests_override, - max_tokens_override=args.inference_dynamic_batching_max_tokens_override, - block_size_tokens=args.inference_dynamic_batching_block_size, - tensor_model_parallel_size=args.tensor_model_parallel_size, - pipeline_model_parallel_size=args.pipeline_model_parallel_size, - materialize_only_last_token_logits=not args.return_log_probs, - mamba_inference_state_config=mamba_inference_state_config, - cache_mla_latent=args.multi_latent_attention and args.cache_mla_latents, - kv_lora_rank=args.kv_lora_rank if args.multi_latent_attention else None, - qk_pos_emb_head_dim=args.qk_pos_emb_head_dim, - use_cuda_graphs_for_non_decode_steps=not args.decode_only_cuda_graphs, - use_flashinfer_fused_rope=args.use_flashinfer_fused_rope, - unified_memory_level=args.inference_dynamic_batching_unified_memory_level, - ) - inference_wrapped_model = GPTInferenceWrapper( - model, inference_wrapper_config, inference_context=context - ) - inference_wrapped_model.model_is_pipeline_parallel = not ( - mpu.is_pipeline_first_stage() and mpu.is_pipeline_last_stage() - ) - text_generation_controller = TextGenerationController( - inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer - ) - return DynamicInferenceEngine( - text_generation_controller, - context, - termination_id=-1, - enable_cuda_graph=args.cuda_graph_impl == "local", - random_seed=args.seed, - ) - - -async def generate( - inference_engine: Union[StaticInferenceEngine, DynamicInferenceEngine], - sampling_params: SamplingParams, - prompts: List[str], - inference_requests: List[InferenceRequest] = None, -) -> List[InferenceRequest]: - async def collect_stream(prompt, request_id, stream_generator): - async 
for output in stream_generator: - pass - - if inference_requests is None: - assert prompts is not None - inference_requests = [None for _ in range(len(prompts))] - elif prompts is None: - assert inference_requests is not None - tokenizer = get_tokenizer() - prompts = [tokenizer.detokenize(request.prompt_tokens) for request in inference_requests] - - request_ids: List[int] = [ - inference_engine.add_request( - prompt=prompt, - inference_request=inference_request, - inference_parameters=sampling_params, - streaming=True, - ) - for prompt, inference_request in zip(prompts, inference_requests) - ] - stream_generators = [ - inference_engine.get_stream_generator(request_id) for request_id in request_ids - ] - - tasks = [ - asyncio.create_task(collect_stream(prompt, request_id, stream_generator)) - for (prompt, request_id, stream_generator) in zip(prompts, request_ids, stream_generators) - ] - - await inference_engine.run_engine_async() - await asyncio.gather(*tasks) - - results: List[InferenceRequest] = [ - inference_engine.scheduler.completed_request_pool[request_id] for request_id in request_ids - ] - - return results + return get_dynamic_inference_engine(model=model) def get_random_prompt_tokens(tokenizer, num_input_tokens) -> List[int]: @@ -232,14 +127,12 @@ def generate_dynamic( request_id = REQUEST_ID REQUEST_ID += 1 prompt_tokens = request.prompt_tokens - inference_engine.add_request( - request_id, prompt_tokens, request.inference_parameters, - ) + inference_engine.add_request(request_id, prompt_tokens, request.inference_parameters) start_time = time.perf_counter() all_finished_requests = [] while inference_engine.has_unfinished_requests(): - result = inference_engine.step(verbose=False) + result = inference_engine.step() finished_requests = result["finished_requests"] for request in finished_requests: req_id = request.request_id @@ -257,8 +150,6 @@ def generate_dynamic( def main(): """Main program.""" - # Note: The default args passed here can be overwritten by 
using appropriate params (check arguments.py file) - # Micro batch size is not needed to be set by user. (It is calculated based on inference-batch-times-seqlen-threshold argument) initialize_megatron( extra_args_provider=add_inference_benchmarking_args, args_defaults={ @@ -298,13 +189,14 @@ def main(): return_log_probs=args.return_log_probs, top_n_logprobs=args.top_n_logprobs, num_tokens_to_generate=args.num_tokens_to_generate, + termination_id=-1, ) sampling_params.add_attributes({"no_early_termination": True}) requests = [] if args.num_input_tokens is not None: assert args.prompts is None - batch_size = args.inference_max_batch_size + batch_size = args.inference_max_requests for i in range(batch_size): prompt_tokens = get_random_prompt_tokens(tokenizer, args.num_input_tokens) requests.append( @@ -327,33 +219,27 @@ def main(): ) ) - if args.cuda_graph_impl == "local": - print(f"Running warmup for CUDA graphs...") - warmup_sampling_params = SamplingParams(num_tokens_to_generate=10) - warmup_sampling_params.add_attributes({"no_early_termination": True}) + # TODO(ksanthanam): Use a command line argument for warmup iterations + for i in range(3): + print(f"Running warmup iteration {i+1}...") + warmup_sampling_params = SamplingParams(num_tokens_to_generate=10, termination_id=-1) inference_engine.generate(prompts=["warmup"], sampling_params=warmup_sampling_params) if args.benchmark_profile: torch.cuda.cudart().cudaProfilerStart() start_time = time.perf_counter() - if args.stream: - if args.engine_type == "dynamic": - raise NotImplementedError("Streaming not supported with DynamicInferenceEngine") - results: List[InferenceRequest] = asyncio.run( - generate( - inference_engine, sampling_params, prompts=args.prompts, inference_requests=requests - ) + if args.engine_type == "static": + results: List[InferenceRequest] = inference_engine.generate( + prompts=args.prompts, inference_requests=requests, sampling_params=sampling_params ) else: - if args.engine_type == "static": - 
results: List[InferenceRequest] = inference_engine.generate( - prompts=args.prompts, inference_requests=requests, sampling_params=sampling_params - ) - elif args.engine_type == "dynamic": - results: List[InferenceRequest] = generate_dynamic( - args, requests, inference_engine, - ) + prompts = [request.prompt_tokens for request in requests] + records: List[DynamicInferenceRequestRecord] = inference_engine.generate( + prompts=prompts, sampling_params=sampling_params + ) + results: List[InferenceRequest] = [record.merge() for record in records] + end_time = time.perf_counter() latency = end_time - start_time @@ -378,6 +264,10 @@ def main(): result_dict['generated_output'] = tokenizer.detokenize(result.generated_tokens) print(result_dict) + total_output_tokens = args.num_tokens_to_generate * args.inference_max_requests + throughput = total_output_tokens / latency + print(f"Throughput: {throughput} output tokens / second") + if __name__ == "__main__": main() diff --git a/tools/run_text_generation_server.py b/tools/run_text_generation_server.py index 350173dc16f..89c1cfa5b86 100644 --- a/tools/run_text_generation_server.py +++ b/tools/run_text_generation_server.py @@ -22,9 +22,6 @@ from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, ) -from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( - InferenceWrapperConfig, -) from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.text_generation_controllers.text_generation_controller import ( TextGenerationController, @@ -63,27 +60,15 @@ def get_inference_engine(args: Namespace, model: MegatronModule) -> AbstractEngi tokenizer = get_tokenizer() - inference_wrapper_config = InferenceWrapperConfig( - hidden_size=args.hidden_size, - inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold, - fp32_residual_connection=args.fp32_residual_connection, - 
params_dtype=args.params_dtype, - padded_vocab_size=args.padded_vocab_size, - inference_max_seq_length=args.inference_max_seq_length, - inference_max_requests=args.inference_max_batch_size, - nccl_all_reduce_for_prefill=args.nccl_all_reduce_for_prefill, - moe_pad_experts_for_cuda_graph_inference = args.moe_pad_experts_for_cuda_graph_inference - ) - inference_context = StaticInferenceContext.from_config(inference_wrapper_config) + inference_context = StaticInferenceContext(args.inference_max_requests, args.inference_max_sequence_length) inference_wrapped_model = GPTInferenceWrapper( - model, inference_wrapper_config, inference_context + model, inference_context ) text_generation_controller = TextGenerationController( inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer ) return StaticInferenceEngine( text_generation_controller=text_generation_controller, - max_batch_size=args.inference_max_batch_size, ) @@ -166,14 +151,6 @@ def main(model_type: str = "gpt"): model = model[0] model.eval() - if args.max_batch_size is not None: - assert args.inference_max_batch_size is not None - args.inference_max_batch_size = max(args.inference_max_batch_size, args.max_batch_size) - warnings.warn( - "`--max-batch-size` has been deprecated in favor of `--inference-max-requests`, " - f"setting maximum batch size to {args.inference_max_batch_size}" - ) - inference_engine = get_inference_engine(args, model) if args.cuda_graph_impl == "local": diff --git a/train_rl.py b/train_rl.py index cfc010b3c04..4b5cec5fcc8 100644 --- a/train_rl.py +++ b/train_rl.py @@ -370,6 +370,8 @@ def __getitem__(self, idx): if __name__ == "__main__": + from megatron.inference.utils import add_inference_args + # Temporary for transition to core datasets train_valid_test_datasets_provider.is_distributed = True @@ -401,4 +403,5 @@ def _model_builder( ModelType.encoder_or_decoder, forward_step, args_defaults={}, + extra_args_provider=add_inference_args, ) From 
adce1479bd83a707d11aea791ffcf4eba4a9f334 Mon Sep 17 00:00:00 2001 From: Maanu Grover <109391026+maanug-nv@users.noreply.github.com> Date: Wed, 4 Feb 2026 07:23:23 -0800 Subject: [PATCH 053/231] Add DistributedInitConfig (#3173) Signed-off-by: Maanu Grover --- megatron/training/arguments.py | 52 ++-------------------- megatron/training/common_config.py | 70 ++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 48 deletions(-) diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 131392d2c3d..9951203f18f 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -2017,9 +2017,6 @@ def _add_training_args(parser): dest='check_for_large_grads') group.add_argument('--result-rejected-tracker-filename', type=str, default=None, help='Optional name of file tracking `result_rejected` events.') - group.add_argument('--disable-gloo-process-groups', action='store_false', - dest='enable_gloo_process_groups', - help='Disables creation and usage of Gloo process groups.') group.add_argument('--tp-comm-overlap-cfg', type=str, default=None, help='Config file when tp_comm_overlap is enabled.') @@ -2092,10 +2089,6 @@ def _add_training_args(parser): '--use-legacy-models to not use core models.') group.add_argument('--use-legacy-models', action='store_true', help='Use the legacy Megatron models, not Megatron-Core models.') - group.add_argument('--high-priority-stream-groups', nargs='*', type=str, default=[], - help='The communicator group names to use high priority streams.') - group.add_argument('--disable-jit-fuser', action='store_true', - help='Disable the JIT fuser.') return parser @@ -2204,7 +2197,10 @@ def _add_mixed_precision_args(parser): def _add_distributed_args(parser): - group = parser.add_argument_group(title='distributed') + from megatron.training.common_config import DistributedInitConfig + + dist_init_factory = ArgumentGroupFactory(DistributedInitConfig) + group = dist_init_factory.build_group(parser, 
"distributed init") group.add_argument('--decoder-first-pipeline-num-layers', type=int, default=None, @@ -2232,20 +2228,8 @@ def _add_distributed_args(parser): group.add_argument('--no-overlap-p2p-communication', action='store_false', help='overlap pipeline parallel communication with forward and backward chunks in 1F1B', dest='overlap_p2p_comm') - group.add_argument('--distributed-backend', default='nccl', - choices=['nccl', 'gloo'], - help='Which backend to use for distributed training.') - group.add_argument('--distributed-timeout-minutes', type=int, default=10, - help='Default timeout minutes for torch.distributed.') - group.add_argument('--distributed-timeout-seconds-after-init', type=int, default=None, - help='Timeout seconds for process groups after initialization.' - 'This timeout is applied to all process groups after initialization.') group.add_argument('--overlap-grad-reduce', action='store_true', default=False, help='If set, overlap DDP grad reduce.') - group.add_argument('--no-align-grad-reduce', action='store_false', - help='If not set, all PP stages will launch gradient reduces simultaneously. 
' - 'Otherwise, each PP stage will independently launch as needed.', - dest='align_grad_reduce') group.add_argument('--ddp-num-buckets', type=int, default=None, help='Number of buckets for data-parallel communication') group.add_argument('--ddp-bucket-size', type=int, default=None, @@ -2272,14 +2256,6 @@ def _add_distributed_args(parser): group.add_argument('--no-scatter-gather-tensors-in-pipeline', action='store_false', help='If not set, use scatter/gather to optimize communication of tensors in pipeline.', dest='scatter_gather_tensors_in_pipeline') - group.add_argument('--local-rank', type=int, default=int(os.getenv('LOCAL_RANK', '0')), - help='local rank passed from distributed launcher.') - group.add_argument('--lazy-mpu-init', type=bool, required=False, - help='If set to True, initialize_megatron() ' - 'skips DDP initialization and returns function to ' - 'complete it instead. Also turns on ' - '--use-cpu-initialization flag. This is for ' - 'external DDP manager.' ) group.add_argument('--use-distributed-optimizer', action='store_true', help='Use distributed optimizer.') group.add_argument('--use-nccl-ub', action='store_true', dest='nccl_ub', @@ -2292,18 +2268,9 @@ def _add_distributed_args(parser): group.add_argument('--fsdp-manual-registration', action='store_true', dest='fsdp_manual_registration', default=False, help='Manually register the FSDP communication buffers to NCCL user buffer.' 'This option is only effective when use-megatron-fsdp and use-nccl-ub is set.') - group.add_argument('--use-sharp', action='store_true', - help='Required to enable SHARP communication.') - group.add_argument('--sharp-enabled-group', type=str, default=None, - choices=['dp', 'dp_replica'], - help='IB SHARP can be enabled from only one communication group. ' - 'By default, it is enabled from dp group. 
' - 'Available options: [dp, dp_replica]') group.add_argument('--create-all-gather-group', action='store_true', help='Create a separate process group for all-gather operations ' 'to overlap reduce-scatter and all-gather operations.') - group.add_argument('--use-megatron-fsdp', action='store_true', - help='Use the Megatron FSDP code path in DDP.') group.add_argument('--data-parallel-sharding-strategy', type=str, default='no_shard', choices=['no_shard', 'optim', 'optim_grads', 'optim_grads_params'], help='Sharding strategy of data parallelism.') @@ -2332,9 +2299,6 @@ def _add_distributed_args(parser): help='If set, enable full sharding in megatron-fsdp Hybrid Sharded Data Parallel (HSDP) mode.') group.add_argument('--num-distributed-optimizer-instances', type=int, default=1, help='Number of Distributed Optimizer copies across Data Parallel domain.') - group.add_argument('--use-torch-fsdp2', action='store_true', - help='Use the torch FSDP2 implementation. FSDP2 has not been tested with pipeline parallelism, ' - 'and may contain bugs.') group.add_argument('--torch-fsdp2-no-reshard-after-forward', action='store_false', dest='torch_fsdp2_reshard_after_forward', help='Whether to reshard weights after forward pass when using PyTorch FSDP2. ' 'Set to enable FSDP ZeRO-2.') @@ -2344,14 +2308,6 @@ def _add_distributed_args(parser): 'all layers will share the same communication type. Users can also ' 'specify separated types for each layer like ' '--cp-comm-type p2p p2p a2a a2a a2a+p2p a2a+p2p') - group.add_argument('--nccl-communicator-config-path', type=str, default=None, - help='Path to the yaml file with NCCL communicator ' - 'configurations. 
The number of min/max thread groups and thread ' - 'group cluster size of each communicator can be configured by ' - 'setting `min_ctas`, `max_ctas`, and `cga_cluster_size`.') - group.add_argument('--use-tp-pp-dp-mapping', action='store_true', default=False, - help='If set, distributed ranks initialize order is changed ' - 'from tp-cp-ep-dp-pp to tp-cp-ep-pp-dp.') group.add_argument('--fake-process-group', action='store_true', default=False, help='If set, initialize with fake distributed process group and all distributed communication operations will be skipped. \ This is quite useful for profiling memory usage of distributed training with just one GPU. \ diff --git a/megatron/training/common_config.py b/megatron/training/common_config.py index d1096e91154..06c84bf7f13 100644 --- a/megatron/training/common_config.py +++ b/megatron/training/common_config.py @@ -1,5 +1,7 @@ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. from dataclasses import dataclass, field +from typing import Literal +import os @dataclass(kw_only=True) class RNGConfig: @@ -54,3 +56,71 @@ class ProfilingConfig: nvtx_ranges: bool = False """Enable NVTX range annotations for profiling. When enabled, inserts NVTX markers to categorize execution in profiler output.""" + + +@dataclass(kw_only=True) +class DistributedInitConfig: + """Configuration settings for distributed training initialization.""" + + distributed_backend: Literal["nccl", "gloo"] = "nccl" + """Which backend to use for distributed training.""" + + distributed_timeout_minutes: int = 10 + """Timeout minutes for torch.distributed.""" + + align_grad_reduce: bool = True + """If not set, all PP stages will launch gradient reduces simultaneously. + Otherwise, each PP stage will independently launch as needed. 
+ """ + + local_rank: int = field(default_factory=lambda: int(os.getenv("LOCAL_RANK", "0"))) + """local rank passed from distributed launcher.""" + + lazy_mpu_init: bool = False + """If set to True, initialize_megatron() skips DDP initialization and returns function to complete it instead. + Also turns on --use-cpu-initialization flag. This is for external DDP manager.""" + + use_megatron_fsdp: bool = False + """Use Megatron's Fully Sharded Data Parallel. Cannot be used together with use_torch_fsdp2.""" + + use_torch_fsdp2: bool = False + """Use the torch FSDP2 implementation. FSDP2 is not currently working with Pipeline Parallel. + It is still not in a stable release stage, and may therefore contain bugs or other + potential issues.""" + + nccl_communicator_config_path: str | None = None + """Path to the yaml file with NCCL communicator configurations. The number of min/max thread + groups and thread group cluster size of each communicator can be configured by setting + `min_ctas`, `max_ctas`, and `cga_cluster_size`.""" + + use_tp_pp_dp_mapping: bool = False + """If set, distributed ranks initialize order is changed from tp-cp-ep-dp-pp to tp-cp-ep-pp-dp. + """ + + enable_gloo_process_groups: bool = field(default=True, metadata={"argparse_meta": {"arg_names": ["--disable-gloo-process-groups"]}}) + """If enabled, create Gloo process groups for communications.""" + + use_sharp: bool = False + """Set the use of SHARP for the collective communications of data-parallel process groups. + When `True`, run barrier within each data-parallel process group, + which specifies the SHARP application target groups. + """ + + sharp_enabled_group: Literal["dp", "dp_replica"] | None = None + """IB SHARP can be enabled from only one communication group. + By default, it is enabled from dp group if not specified and use_sharp=True. 
+ Available options: [dp, dp_replica] + """ + + high_priority_stream_groups: list[str] | None = field(default_factory=list) + """Specify which communicator groups should use high priority streams during creation. + Assigning high priority to communication streams ensures that communication kernels + are scheduled with higher priority, minimizing the exposed communication when it is + overlapped with other computation kernels. + """ + + distributed_timeout_seconds_after_init: int | None = None + """Timeout in seconds for process groups after initialization. This timeout is applied to all process groups after initialization and the first iteration completes.""" + + disable_jit_fuser: bool = False + """Disable the JIT fuser.""" From f3e6cc81e13f234b88dd919f0576526ef73d61d1 Mon Sep 17 00:00:00 2001 From: Yashaswi Karnati <144376261+yashaswikarnati@users.noreply.github.com> Date: Wed, 4 Feb 2026 07:38:04 -0800 Subject: [PATCH 054/231] Fix checkpoint converter missing parallel group initialization (#3217) --- pyproject.toml | 2 +- tools/checkpoint/saver_base.py | 6 ++++ uv.lock | 59 +++++++++++++++++++++++----------- 3 files changed, 47 insertions(+), 20 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 567954ca4a1..468e1b02f90 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,7 +64,7 @@ Download = "https://github.com/NVIDIA/Megatron-LM/releases" Homepage = "https://github.com/NVIDIA/Megatron-LM/megatron/core" [project.optional-dependencies] -mlm = ["flask-restful", "sentencepiece", "tiktoken", "wandb", "transformers"] +mlm = ["flask-restful", "sentencepiece", "tiktoken", "wandb", "transformers", "accelerate"] dev = [ "nvidia-modelopt[torch]; sys_platform != 'darwin'", diff --git a/tools/checkpoint/saver_base.py b/tools/checkpoint/saver_base.py index 4958dc99ed7..7191762b6a9 100644 --- a/tools/checkpoint/saver_base.py +++ b/tools/checkpoint/saver_base.py @@ -170,9 +170,15 @@ def initialize_megatron_env(self): # For backward compatibility during local 
parallel states refactoring fake_tp_group = _ConverterFakeProcessGroup(size=self.args.target_tensor_parallel_size) + fake_pp_group = _ConverterFakeProcessGroup(size=self.args.target_pipeline_parallel_size) fake_ep_group = _ConverterFakeProcessGroup(size=self.args.target_expert_parallel_size) + fake_dp_group = _ConverterFakeProcessGroup(size=1) mpu._TENSOR_MODEL_PARALLEL_GROUP = fake_tp_group + mpu._PIPELINE_MODEL_PARALLEL_GROUP = fake_pp_group mpu._EXPERT_MODEL_PARALLEL_GROUP = fake_ep_group + mpu._DATA_PARALLEL_GROUP = fake_dp_group + mpu._DATA_PARALLEL_GROUP_WITH_CP = fake_dp_group + mpu._INTRA_PARTIAL_DATA_PARALLEL_GROUP_WITH_CP = fake_dp_group fused_kernels.load(self.margs) try: diff --git a/uv.lock b/uv.lock index 1867e8aaddf..4b51612c3a3 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 2 +revision = 3 requires-python = ">=3.10" resolution-markers = [ "python_full_version >= '3.14' and sys_platform == 'linux'", @@ -34,6 +34,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8f/aa/ba0014cc4659328dc818a28827be78e6d97312ab0cb98105a770924dc11e/absl_py-2.3.1-py3-none-any.whl", hash = "sha256:eeecf07f0c2a93ace0772c92e596ace6d3d3996c042b2128459aaae2a76de11d", size = 135811, upload-time = "2025-07-03T09:31:42.253Z" }, ] +[[package]] +name = "accelerate" +version = "1.12.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "packaging" }, + { name = "psutil" }, + { name = "pyyaml" }, + { name = "safetensors" }, + { name = "torch", marker = 
"sys_platform == 'never' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/4a/8e/ac2a9566747a93f8be36ee08532eb0160558b07630a081a6056a9f89bf1d/accelerate-1.12.0.tar.gz", hash = "sha256:70988c352feb481887077d2ab845125024b2a137a5090d6d7a32b57d03a45df6", size = 398399, upload-time = "2025-11-21T11:27:46.973Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9f/d2/c581486aa6c4fbd7394c23c47b83fa1a919d34194e16944241daf9e762dd/accelerate-1.12.0-py3-none-any.whl", hash = "sha256:3e2091cd341423207e2f084a6654b1efcd250dc326f2a37d6dde446e07cabb11", size = 380935, upload-time = "2025-11-21T11:27:44.522Z" }, +] + [[package]] name = "accessible-pygments" version = "0.0.5" @@ -262,10 +281,10 @@ name = "anyio" version = "4.9.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, + { name = "exceptiongroup", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "idna" }, { name = "sniffio" }, - { name = "typing-extensions", marker = "python_full_version < '3.13'" }, + { name = "typing-extensions", marker = "python_full_version < '3.13' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/95/7d/4c1bd541d4dffa1b52bd83fb8527089e097a106fc90b467a7313b105f840/anyio-4.9.0.tar.gz", hash = "sha256:673c0c244e15788651a4ff38710fea9675823028a6f08a5eda409e0c9840a028", size = 190949, upload-time = "2025-03-17T00:02:54.77Z" } wheels = [ @@ -698,7 +717,7 @@ name = "cffi" version = "2.0.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pycparser", marker = "implementation_name != 'PyPy'" }, + { name = "pycparser", marker = "implementation_name != 'PyPy' or (extra == 'extra-13-megatron-core-dev' 
and extra == 'extra-13-megatron-core-lts')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/eb/56/b1ba7935a17738ae8453301356628e8147c79dbb825bcbc73dc7401f9846/cffi-2.0.0.tar.gz", hash = "sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529", size = 523588, upload-time = "2025-09-08T23:24:04.541Z" } wheels = [ @@ -869,7 +888,7 @@ name = "click" version = "8.3.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "colorama", marker = "sys_platform == 'win32' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/3d/fa/656b739db8587d7b5dfa22e22ed02566950fbfbcdc20311993483657a5c0/click-8.3.1.tar.gz", hash = "sha256:12ff4785d337a1bb490bb7e9c2b1ee5da3112e94a8622f26a6c77f5d2fc6842a", size = 295065, upload-time = "2025-11-15T20:45:42.706Z" } wheels = [ @@ -1252,7 +1271,7 @@ version = "0.1.0" source = { git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git?rev=v0.1.0#d5363b4a418128cd8111983b191c4b8869a9766b" } dependencies = [ { name = "absl-py" }, - { name = "torch", marker = "sys_platform == 'never'" }, + { name = "torch", marker = "sys_platform == 'never' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "typing-extensions" }, ] @@ -1261,7 +1280,7 @@ name = "exceptiongroup" version = "1.3.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "typing-extensions", marker = "python_full_version < '3.13'" }, + { name = "typing-extensions", marker = "python_full_version < '3.13' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/50/79/66800aadf48771f6b62f7eb014e352e5d06856655206165d775e675a02c9/exceptiongroup-1.3.1.tar.gz", hash = 
"sha256:8b412432c6055b0b7d14c310000ae93352ed6754f70fa8f7c34141f91c4e3219", size = 30371, upload-time = "2025-11-21T23:01:54.787Z" } wheels = [ @@ -1779,7 +1798,7 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock" }, { name = "fsspec" }, - { name = "hf-xet", marker = "platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" }, + { name = "hf-xet", marker = "platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "packaging" }, { name = "pyyaml" }, { name = "requests" }, @@ -2272,6 +2291,7 @@ lts = [ { name = "wget" }, ] mlm = [ + { name = "accelerate" }, { name = "flask-restful" }, { name = "sentencepiece" }, { name = "tiktoken" }, @@ -2333,6 +2353,7 @@ test = [ [package.metadata] requires-dist = [ + { name = "accelerate", marker = "extra == 'mlm'" }, { name = "av", marker = "extra == 'dev'" }, { name = "av", marker = "extra == 'lts'" }, { name = "causal-conv1d", marker = "extra == 'dev'", specifier = "~=1.5" }, @@ -4403,7 +4424,7 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "attrs" }, { name = "rpds-py" }, - { name = "typing-extensions", marker = "python_full_version < '3.13'" }, + { name = "typing-extensions", marker = "python_full_version < '3.13' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/22/f5/df4e9027acead3ecc63e50fe1e36aca1523e1719559c499951bb4b53188f/referencing-0.37.0.tar.gz", hash = "sha256:44aefc3142c5b842538163acb373e24cce6632bd54bdb01b21ad5863489f50d8", size = 78036, upload-time = "2025-10-13T15:30:48.871Z" } wheels = [ @@ -5289,7 +5310,7 @@ version = "0.50.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, - { 
name = "typing-extensions", marker = "python_full_version < '3.13'" }, + { name = "typing-extensions", marker = "python_full_version < '3.13' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/ba/b8/73a0e6a6e079a9d9cfa64113d771e421640b6f679a52eeb9b32f72d871a1/starlette-0.50.0.tar.gz", hash = "sha256:a2a17b22203254bcbc2e1f926d2d55f3f9497f769416b3190768befe598fa3ca", size = 2646985, upload-time = "2025-11-01T15:25:27.516Z" } wheels = [ @@ -5310,7 +5331,7 @@ name = "sympy" version = "1.14.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "mpmath", marker = "sys_platform != 'linux'" }, + { name = "mpmath", marker = "sys_platform != 'linux' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/83/d3/803453b36afefb7c2bb238361cd4ae6125a569b4db67cd9e79846ba2d68c/sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517", size = 7793921, upload-time = "2025-04-27T18:05:01.611Z" } wheels = [ @@ -5607,15 +5628,15 @@ name = "torch" version = "2.9.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "filelock", marker = "sys_platform != 'linux'" }, - { name = "fsspec", marker = "sys_platform != 'linux'" }, - { name = "jinja2", marker = "sys_platform != 'linux'" }, + { name = "filelock", marker = "sys_platform != 'linux' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "fsspec", marker = "sys_platform != 'linux' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "jinja2", marker = "sys_platform != 'linux' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = 
"(python_full_version < '3.11' and sys_platform != 'linux') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "networkx", version = "3.6.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and sys_platform != 'linux') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "setuptools", marker = "python_full_version >= '3.12' and sys_platform != 'linux'" }, - { name = "sympy", marker = "sys_platform != 'linux'" }, - { name = "triton", marker = "sys_platform == 'never'" }, - { name = "typing-extensions", marker = "sys_platform != 'linux'" }, + { name = "setuptools", marker = "(python_full_version >= '3.12' and sys_platform != 'linux') or (python_full_version < '3.12' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "sympy", marker = "sys_platform != 'linux' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "triton", marker = "sys_platform == 'never' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "typing-extensions", marker = "sys_platform != 'linux' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/5f/56/9577683b23072075ed2e40d725c52c2019d71a972fab8e083763da8e707e/torch-2.9.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = 
"sha256:1cc208435f6c379f9b8fdfd5ceb5be1e3b72a6bdf1cb46c0d2812aa73472db9e", size = 104207681, upload-time = "2025-11-12T15:19:56.48Z" }, @@ -5729,7 +5750,7 @@ name = "tqdm" version = "4.67.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "colorama", marker = "sys_platform == 'win32' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737, upload-time = "2024-11-24T20:12:22.481Z" } wheels = [ From d558b5fa7e298d5f258efb3050a07082cbaf7325 Mon Sep 17 00:00:00 2001 From: Li Tao Date: Thu, 5 Feb 2026 02:20:38 +0800 Subject: [PATCH 055/231] Skip empty sequences and chunks in MTP tensor roll (#3035) Co-authored-by: Rabeeh Karimi Mahabadi --- .../transformer/multi_token_prediction.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/megatron/core/transformer/multi_token_prediction.py b/megatron/core/transformer/multi_token_prediction.py index 2edb652bfc6..afd7c0516e1 100755 --- a/megatron/core/transformer/multi_token_prediction.py +++ b/megatron/core/transformer/multi_token_prediction.py @@ -265,6 +265,13 @@ def _roll_tensor_packed_seq(tensor, shifts, dims, packed_seq_params, cp_group=No # the idx has been multiplied by cp_size, need to divide it by cp_size to get the local idx local_start_idx = start_idx // cp_size local_end_idx = end_idx // cp_size + + # Skip empty sequences - this can happen when a sequence is very short and + # after dividing by cp_size, the local slice has zero length + local_seq_len = local_end_idx - local_start_idx + if local_seq_len == 0: + continue + tensor_slice = rolled_tensor[..., local_start_idx:local_end_idx].clone() # The following code is very similar as the 
code in roll_tensor function @@ -274,6 +281,15 @@ def _roll_tensor_packed_seq(tensor, shifts, dims, packed_seq_params, cp_group=No tensor_send_list = [] tensor_recv_list = [] for chunk in rolled_chunks: + # Skip empty chunks that can occur when the sequence slice is very small + if chunk.size(dims) == 0: + tensor_send_list.append( + torch.empty(chunk.shape[:-1], dtype=chunk.dtype, device=chunk.device) + ) + tensor_recv_list.append( + torch.empty(chunk.shape[:-1], dtype=chunk.dtype, device=chunk.device) + ) + continue boundary = chunk.select(dims, shifts).contiguous().clone() tensor_send_list.append(boundary) tensor_recv_list.append(torch.empty_like(boundary)) @@ -297,6 +313,9 @@ def _roll_tensor_packed_seq(tensor, shifts, dims, packed_seq_params, cp_group=No index = [slice(None)] * rolled_chunks[0].dim() index[dims] = shifts for chunk, recv in zip(rolled_chunks, tensor_recv_list): + # Skip empty chunks + if chunk.size(dims) == 0: + continue chunk[tuple(index)] = recv seq_result = torch.cat(rolled_chunks, dim=dims) From f708b5da8d22f978b26fe139365cf9cec15cca6e Mon Sep 17 00:00:00 2001 From: Nick Schank Date: Wed, 4 Feb 2026 15:09:25 -0500 Subject: [PATCH 056/231] Implement get_parameters for ChainedOptimizer (#3201) Co-authored-by: Jeffrey Chen Co-authored-by: Xin Yao --- megatron/core/optimizer/optimizer.py | 9 +++++++ tests/unit_tests/test_optimizer.py | 39 +++++++++++++++++++++++++++- 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/megatron/core/optimizer/optimizer.py b/megatron/core/optimizer/optimizer.py index 2c33d7e701d..df8ec8ef613 100644 --- a/megatron/core/optimizer/optimizer.py +++ b/megatron/core/optimizer/optimizer.py @@ -12,6 +12,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch +from typing_extensions import override try: from transformer_engine.pytorch.optimizers import multi_tensor_applier, multi_tensor_scale @@ -1122,6 +1123,14 @@ def param_groups(self) -> List[dict]: param_groups += 
optimizer.param_groups return param_groups + @override + def get_parameters(self) -> List[torch.nn.Parameter]: + """Get list of parameters wrapped in all chained optimizers.""" + params = [] + for optimizer in self.chained_optimizers: + params.extend(optimizer.get_parameters()) + return params + @property def state(self) -> ProxyDict: """ diff --git a/tests/unit_tests/test_optimizer.py b/tests/unit_tests/test_optimizer.py index 6b1da8c4e3f..22b15f41e90 100644 --- a/tests/unit_tests/test_optimizer.py +++ b/tests/unit_tests/test_optimizer.py @@ -28,7 +28,7 @@ from megatron.core.transformer import TransformerConfig from megatron.core.utils import is_te_min_version, is_torch_min_version from tests.unit_tests.test_utilities import Utils -from tests.unit_tests.test_utils import _deinit_distributed, _init_distributed +from tests.unit_tests.test_utils import _init_distributed try: # Check if FP8 block scaling is available. @@ -326,6 +326,43 @@ def to_cuda(d): assert list(optimizer_2.state.values())[0]["momentum_buffer"].is_cuda +def test_chained_optimizer_get_parameters(): + """Test ChainedOptimizer.get_parameters() aggregates params from all sub-optimizers. + + Regression test: without the get_parameters() override, ChainedOptimizer would + access self.optimizer which asserts only one optimizer exists, failing with VPP/MoE. 
+ """ + + class MockOptimizer: + """Mock that mimics MegatronOptimizer's get_parameters() interface.""" + + def __init__(self, params): + self.params = list(params) + self.param_groups = [{"params": self.params}] + + def get_parameters(self): + return self.params + + net = Net() + all_params = list(net.parameters()) + + # Test empty + assert ChainedOptimizer([]).get_parameters() == [] + + # Test single optimizer + opt1 = MockOptimizer(all_params[:3]) + assert ChainedOptimizer([opt1]).get_parameters() == opt1.params + + # Test multiple optimizers (the case that previously failed) + opt2 = MockOptimizer(all_params[3:6]) + opt3 = MockOptimizer(all_params[6:]) + chained = ChainedOptimizer([opt1, opt2, opt3]) + result = chained.get_parameters() + + assert len(result) == len(all_params) + assert result == opt1.params + opt2.params + opt3.params + + def test_precision_aware_fused_adam(): try: from transformer_engine.pytorch.optimizers import FusedAdam From 66c432a2c9ca6037f813c36f7cf5c59cb3ffbf38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 4 Feb 2026 21:54:42 +0100 Subject: [PATCH 057/231] ci(fix): Create main/dev image tags (#3252) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .gitlab/stages/01.build.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.gitlab/stages/01.build.yml b/.gitlab/stages/01.build.yml index 20252e7d045..f8020ca6e12 100644 --- a/.gitlab/stages/01.build.yml +++ b/.gitlab/stages/01.build.yml @@ -132,6 +132,12 @@ test:build_image: ${IMAGE}:${CI_PIPELINE_ID}-arm64 docker manifest push ${IMAGE}:${CI_PIPELINE_ID} + + if [[ "$CI_COMMIT_BRANCH" == "ci-rebuild-mcore-nemo-image" || "$CI_COMMIT_BRANCH" == "main" || "$CI_COMMIT_BRANCH" == "dev" ]]; then + docker tag ${IMAGE}:${CI_PIPELINE_ID} ${IMAGE}:${CI_COMMIT_BRANCH} + docker push ${IMAGE}:${CI_COMMIT_BRANCH} + fi + - echo "MCORE_MR_COMMIT=$CI_COMMIT_SHA" | tee -a build.env - echo 
"MCORE_BACKWARDS_COMMIT=$MCORE_BACKWARDS_COMMIT" | tee -a build.env - cat build.env From e24767f13ad9b2ad1d69c183295df33f558a627c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 4 Feb 2026 22:23:08 +0000 Subject: [PATCH 058/231] ci(hotfix): Skopeo copy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .gitlab/stages/01.build.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.gitlab/stages/01.build.yml b/.gitlab/stages/01.build.yml index f8020ca6e12..27ede878bd3 100644 --- a/.gitlab/stages/01.build.yml +++ b/.gitlab/stages/01.build.yml @@ -134,8 +134,7 @@ test:build_image: docker manifest push ${IMAGE}:${CI_PIPELINE_ID} if [[ "$CI_COMMIT_BRANCH" == "ci-rebuild-mcore-nemo-image" || "$CI_COMMIT_BRANCH" == "main" || "$CI_COMMIT_BRANCH" == "dev" ]]; then - docker tag ${IMAGE}:${CI_PIPELINE_ID} ${IMAGE}:${CI_COMMIT_BRANCH} - docker push ${IMAGE}:${CI_COMMIT_BRANCH} + skopeo copy --all docker://${IMAGE}:${CI_PIPELINE_ID} docker://${IMAGE}:${CI_COMMIT_BRANCH} fi - echo "MCORE_MR_COMMIT=$CI_COMMIT_SHA" | tee -a build.env From d959620907533e1b24a237a239aaa83bcd2548e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 4 Feb 2026 22:31:29 +0000 Subject: [PATCH 059/231] ci(hotfix): Add skopeo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .gitlab/stages/01.build.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitlab/stages/01.build.yml b/.gitlab/stages/01.build.yml index 27ede878bd3..61521295a93 100644 --- a/.gitlab/stages/01.build.yml +++ b/.gitlab/stages/01.build.yml @@ -121,6 +121,7 @@ test:build_image: KUBERNETES_SERVICE_MEMORY_LIMIT: 90Gi SHARED_PATH: /builds/$CI_PROJECT_PATH/shared script: + - apk add skopeo - | set -x From 9d71cb1cd29192a5d23f349268f1f8515c12c017 Mon Sep 17 00:00:00 2001 From: Sanjeev Satheesh Date: Wed, 4 Feb 2026 14:40:13 -0800 
Subject: [PATCH 060/231] Reapply "Add MTP support for hybrid models (#2363)" (#3207) --- mamba_builders.py | 8 +- .../common/language_module/language_module.py | 30 +- .../common/model_chunk_schedule_plan.py | 2 +- .../core/models/gpt/fine_grained_callables.py | 4 +- megatron/core/models/gpt/gpt_layer_specs.py | 2 +- megatron/core/models/gpt/gpt_model.py | 112 ++----- .../core/models/mamba/mamba_layer_specs.py | 33 ++ megatron/core/models/mamba/mamba_model.py | 91 +++++- megatron/core/pipeline_parallel/schedules.py | 5 +- megatron/core/ssm/mamba_block.py | 28 +- .../core/ssm/mamba_hybrid_layer_allocation.py | 149 ++++++++- megatron/core/transformer/cuda_graphs.py | 4 +- megatron/core/transformer/moe/moe_layer.py | 12 +- megatron/core/transformer/moe/router.py | 51 ++- .../transformer/multi_token_prediction.py | 305 +++++++++++++++--- .../core/transformer/transformer_config.py | 9 + .../core/transformer/transformer_layer.py | 18 +- megatron/training/arguments.py | 73 +++++ megatron/training/checkpointing.py | 6 + megatron/training/training.py | 26 +- pretrain_mamba.py | 1 + .../unit_tests/models/test_mamba_moe_model.py | 2 + .../ssm/test_mamba_hybrid_layer_allocation.py | 139 +++++++- .../test_multi_token_prediction.py | 263 ++++++++++++++- 24 files changed, 1166 insertions(+), 207 deletions(-) diff --git a/mamba_builders.py b/mamba_builders.py index 6a792ba6ea5..5d31af60475 100644 --- a/mamba_builders.py +++ b/mamba_builders.py @@ -8,6 +8,7 @@ from megatron.training.arguments import core_transformer_config_from_args from megatron.core.models.mamba.mamba_layer_specs import mamba_inference_stack_spec + def mamba_builder(args, pre_process, post_process, vp_stage=None, config=None, pg_collection=None): print_rank_0('building MAMBA model ...') if config is None: @@ -15,8 +16,10 @@ def mamba_builder(args, pre_process, post_process, vp_stage=None, config=None, p assert args.use_legacy_models is False, "Mamba only supported in Mcore!" 
if config.transformer_impl == "inference_optimized": - mamba_stack_spec = mamba_inference_stack_spec - assert not config.inference_fuse_tp_communication, "inference_fuse_tp_communication is not supported for Mamba" + mamba_stack_spec = mamba_inference_stack_spec + assert ( + not config.inference_fuse_tp_communication + ), "inference_fuse_tp_communication is not supported for Mamba" elif args.spec is not None: mamba_stack_spec = import_module(args.spec) else: @@ -39,6 +42,7 @@ def mamba_builder(args, pre_process, post_process, vp_stage=None, config=None, p rotary_percent=args.rotary_percent, rotary_base=args.rotary_base, pg_collection=pg_collection, + vp_stage=vp_stage, ) for l in range(model.decoder.num_layers_per_pipeline_rank): diff --git a/megatron/core/models/common/language_module/language_module.py b/megatron/core/models/common/language_module/language_module.py index b0fa6126b63..57975b2958b 100644 --- a/megatron/core/models/common/language_module/language_module.py +++ b/megatron/core/models/common/language_module/language_module.py @@ -23,6 +23,7 @@ from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.transformer.enums import AttnBackend, CudaGraphScope from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.multi_token_prediction import tie_word_embeddings_state_dict from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.utils import ensure_metadata_has_dp_cp_group from megatron.core.utils import ( @@ -255,12 +256,20 @@ def setup_embeddings_and_output_layer(self) -> None: LanguageModule.embedding_warning_printed = True def shared_embedding_or_output_weight(self) -> Tensor: - """Gets the emedding weight or output logit weights when share embedding and output weights set to True. + """Gets the embedding weight or output logit weights when share embedding and output weights set to True + or when use Multi-Token Prediction (MTP). 
Returns: - Tensor: During pre processing it returns the input embeddings weight while during post processing it returns the final output layers weight + Tensor: During pre processing or MTP process it returns the input embeddings weight while during post processing it returns the final output layers weight """ - if self.pre_process: + if self.pre_process or getattr(self, 'mtp_process', False): + # Multi-Token Prediction (MTP) need both embedding layer and output layer. + # So there will be both embedding layer and output layer in the mtp process stage. + # When share_embeddings_and_output_weights is True, the embedding weight is the + # canonical shared weight and is passed to the output layer during forward. + assert hasattr( + self, 'embedding' + ), f"embedding is needed in this pipeline stage, but it is not initialized." return self.embedding.word_embeddings.weight elif self.post_process: return self.output_layer.weight @@ -293,6 +302,21 @@ def sharded_state_dict( output_layer_weight_key = f'{prefix}output_layer.weight' output_layer_bias_key = f'{prefix}output_layer.bias' + # Multi-Token Prediction (MTP) needs embedding layer in mtp process stage. + # If MTP is not placed in the pre processing stage, we need to maintain a copy of + # embedding layer in the mtp process stage and tie it to the embedding in the pre + # processing stage. + # Note: MTP loss is computed at post_process stage, so the output_layer on mtp_process + # rank doesn't need special tying - it's not used for loss computation. 
+ if getattr(self, 'mtp_process', False) and not self.pre_process: + emb_weight = self.embedding.word_embeddings.weight + tie_word_embeddings_state_dict( + sharded_state_dict, + emb_weight, + first_stage_word_emb_key, + tp_group=self.tp_group, + dp_cp_group=metadata['dp_cp_group'], + ) if self.share_embeddings_and_output_weights: self.tie_embeddings_and_output_weights_state_dict( sharded_state_dict, output_layer_weight_key, first_stage_word_emb_key, metadata diff --git a/megatron/core/models/common/model_chunk_schedule_plan.py b/megatron/core/models/common/model_chunk_schedule_plan.py index 033e8e808f9..3b0e3a13b76 100644 --- a/megatron/core/models/common/model_chunk_schedule_plan.py +++ b/megatron/core/models/common/model_chunk_schedule_plan.py @@ -123,7 +123,7 @@ def _build_callable_nodes(self, event, comp_stream, comm_stream, extra_args): # get flags for latter use is_mtp = isinstance(self.layer, MultiTokenPredictionLayer) is_moe = ( - isinstance(self.layer.transformer_layer.mlp, MoELayer) + isinstance(self.layer.mtp_model_layer.mlp, MoELayer) if is_mtp else isinstance(self.layer.mlp, MoELayer) ) diff --git a/megatron/core/models/gpt/fine_grained_callables.py b/megatron/core/models/gpt/fine_grained_callables.py index 7cee9d2973c..e17ed0a5d40 100644 --- a/megatron/core/models/gpt/fine_grained_callables.py +++ b/megatron/core/models/gpt/fine_grained_callables.py @@ -613,9 +613,9 @@ def build_mtp_layer_callables(layer): multi-token prediction layer nodes (attention, MLP, etc.) """ - forward_funcs, backward_dw = build_transformer_layer_callables(layer.transformer_layer) + forward_funcs, backward_dw = build_transformer_layer_callables(layer.mtp_model_layer) attn_forward, dispatch_forward, mlp_forward, combine_forward, _ = forward_funcs - is_moe = isinstance(layer.transformer_layer.mlp, MoELayer) + is_moe = isinstance(layer.mtp_model_layer.mlp, MoELayer) assert is_moe, "MTP layer in a2a overlap only supports MoE layer for now." 
def submodule_mtp_attn_forward(node, hidden_states): diff --git a/megatron/core/models/gpt/gpt_layer_specs.py b/megatron/core/models/gpt/gpt_layer_specs.py index 49501ee54eb..bebb4350d27 100755 --- a/megatron/core/models/gpt/gpt_layer_specs.py +++ b/megatron/core/models/gpt/gpt_layer_specs.py @@ -704,7 +704,7 @@ def get_gpt_mtp_block_spec_for_backend( raise ValueError(f"Invalid spec: {spec}") mtp_layer_spec = get_mtp_layer_spec_for_backend( - transformer_layer_spec=transformer_layer_spec, backend=backend + mtp_model_layer_spec=transformer_layer_spec, backend=backend ) mtp_num_layers = config.mtp_num_layers if config.mtp_num_layers else 0 mtp_layer_specs = [mtp_layer_spec] * mtp_num_layers diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index f44aed613e7..cbd1985002d 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -6,7 +6,7 @@ import torch from torch import Tensor -from megatron.core import parallel_state, tensor_parallel +from megatron.core import tensor_parallel from megatron.core.config_logger import has_config_logger_enabled, log_config_to_disk from megatron.core.dist_checkpointing.mapping import ShardedStateDict from megatron.core.inference.contexts import BaseInferenceContext @@ -26,11 +26,9 @@ from megatron.core.tensor_parallel import gather_from_sequence_parallel_region from megatron.core.transformer.enums import CudaGraphScope, ModelType from megatron.core.transformer.multi_token_prediction import ( - MTPLossAutoScaler, - MTPLossLoggingHelper, MultiTokenPredictionBlock, - roll_tensor, - tie_word_embeddings_state_dict, + mtp_on_this_rank, + process_mtp_loss, ) from megatron.core.transformer.spec_utils import ModuleSpec from megatron.core.transformer.transformer_block import TransformerBlock @@ -144,7 +142,9 @@ def __init__( self.rotary_base = rotary_base self.rotary_scaling = rope_scaling self.mtp_block_spec = mtp_block_spec - self.mtp_process = mtp_block_spec is not 
None + self.mtp_process = mtp_block_spec is not None and mtp_on_this_rank( + self.config, ignore_virtual=False, vp_stage=vp_stage + ) if self.pre_process or self.mtp_process: self.embedding = LanguageModelEmbedding( @@ -609,56 +609,19 @@ def _postprocess( return hidden_states if self.config.mtp_num_layers is not None: - mtp_labels = labels.clone() - hidden_states_list = torch.chunk(hidden_states, 1 + self.config.mtp_num_layers, dim=0) - hidden_states = hidden_states_list[0] - if loss_mask is None: - # if loss_mask is not provided, use all ones as loss_mask - loss_mask = torch.ones_like(mtp_labels) - for mtp_layer_number in range(self.config.mtp_num_layers): - # output - mtp_logits, _ = self.output_layer( - hidden_states_list[mtp_layer_number + 1], - weight=output_weight, - runtime_gather_output=runtime_gather_output, - ) - # Calc loss for the current Multi-Token Prediction (MTP) layers. - mtp_labels, _ = roll_tensor( - mtp_labels, - shifts=-1, - dims=-1, - cp_group=self.cp_group, - packed_seq_params=packed_seq_params, - ) - loss_mask, num_tokens = roll_tensor( - loss_mask, - shifts=-1, - dims=-1, - cp_group=self.cp_group, - packed_seq_params=packed_seq_params, - ) - mtp_loss = self.compute_language_model_loss(mtp_labels, mtp_logits) - mtp_loss = loss_mask * mtp_loss - if self.training: - # TODO(shifangx): remove the use of parallel_state here - # after moving loss logging to loss_func in pretrain_gpt.py - MTPLossLoggingHelper.save_loss_to_tracker( - torch.sum(mtp_loss) / num_tokens, - mtp_layer_number, - self.config.mtp_num_layers, - avg_group=parallel_state.get_data_parallel_group( - with_context_parallel=True - ), - ) - mtp_loss_scale = self.config.mtp_loss_scaling_factor / self.config.mtp_num_layers - if self.config.calculate_per_token_loss: - hidden_states = MTPLossAutoScaler.apply( - hidden_states, mtp_loss_scale * mtp_loss - ) - else: - hidden_states = MTPLossAutoScaler.apply( - hidden_states, mtp_loss_scale * mtp_loss / num_tokens - ) + hidden_states = 
process_mtp_loss( + hidden_states=hidden_states, + labels=labels, + loss_mask=loss_mask, + output_layer=self.output_layer, + output_weight=output_weight, + runtime_gather_output=runtime_gather_output, + is_training=self.training, + compute_language_model_loss=self.compute_language_model_loss, + config=self.config, + cp_group=self.pg_collection.cp, + packed_seq_params=packed_seq_params, + ) sequence_parallel_override = False if in_inference_mode and inference_context.config.materialize_only_last_token_logits: @@ -715,27 +678,6 @@ def _postprocess( return loss - def shared_embedding_or_output_weight(self) -> Tensor: - """Gets the embedding weight or output logit weights when share input embedding and - output weights set to True or when use Multi-Token Prediction (MTP) feature. - - Returns: - Tensor: During pre processing or MTP process it returns the input embeddings weight. - Otherwise, during post processing it returns the final output layers weight. - """ - if self.pre_process or self.mtp_process: - # Multi-Token Prediction (MTP) need both embedding layer and output layer. - # So there will be both embedding layer and output layer in the mtp process stage. - # In this case, if share_embeddings_and_output_weights is True, the shared weights - # will be stored in embedding layer, and output layer will not have any weight. - assert hasattr( - self, 'embedding' - ), f"embedding is needed in this pipeline stage, but it is not initialized." - return self.embedding.word_embeddings.weight - elif self.post_process: - return self.output_layer.weight - return None - def build_schedule_plan( self, input_ids: Tensor, @@ -826,20 +768,4 @@ def sharded_state_dict( output_extra_state and output_extra_state.data ), f'Expected output layer extra state to be empty, got: {output_extra_state}' - # Multi-Token Prediction (MTP) need embedding layer in mtp process stage. 
- # If MTP is not placed in the pre processing stage, we need to maintain a copy of - # embedding layer in the mtp process stage and tie it to the embedding in the pre - # processing stage. - # Now MTP loss is computed in post processing stage, so the output_layer is not needed. - if self.mtp_process and not self.pre_process: - emb_weight_key = f'{prefix}embedding.word_embeddings.weight' - emb_weight = self.embedding.word_embeddings.weight - tie_word_embeddings_state_dict( - sharded_state_dict, - emb_weight, - emb_weight_key, - tp_group=self.tp_group, - dp_cp_group=metadata['dp_cp_group'], - ) - return sharded_state_dict diff --git a/megatron/core/models/mamba/mamba_layer_specs.py b/megatron/core/models/mamba/mamba_layer_specs.py index b87124bab1d..6ca628475be 100755 --- a/megatron/core/models/mamba/mamba_layer_specs.py +++ b/megatron/core/models/mamba/mamba_layer_specs.py @@ -1,6 +1,7 @@ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. from megatron.core.extensions.transformer_engine import ( + TEColumnParallelLinear, TEDotProductAttention, TELayerNormColumnParallelLinear, TENorm, @@ -19,6 +20,12 @@ from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.mlp import MLP, MLPSubmodules +from megatron.core.transformer.multi_token_prediction import ( + MultiTokenPredictionBlock, + MultiTokenPredictionBlockSubmodules, + MultiTokenPredictionLayer, + MultiTokenPredictionLayerSubmodules, +) from megatron.core.transformer.spec_utils import ModuleSpec from megatron.core.transformer.transformer_layer import ( MoETransformerLayer, @@ -26,6 +33,7 @@ TransformerLayerSubmodules, ) +# This should be private and should not be used outside of this file. moe = get_moe_module_spec( use_te=True, num_experts=8, # Can be any positive integer (must not be None). 
@@ -33,6 +41,28 @@ moe_use_legacy_grouped_gemm=False, ) + +# MTP block spec for Mamba - provides norms and projection only. +# Inner layers are built by MultiTokenPredictionLayer using nested MambaStack +_mamba_mtp_block_spec = ModuleSpec( + module=MultiTokenPredictionBlock, + submodules=MultiTokenPredictionBlockSubmodules( + layer_specs=[ + ModuleSpec( + module=MultiTokenPredictionLayer, + submodules=MultiTokenPredictionLayerSubmodules( + enorm=TENorm, + hnorm=TENorm, + eh_proj=TEColumnParallelLinear, + mtp_model_layer=None, # Built via pattern + mamba_submodules + layer_norm=TENorm, + ), + ) + ] + ), +) + + mamba_stack_spec = ModuleSpec( module=MambaStack, submodules=MambaStackSubmodules( @@ -87,9 +117,11 @@ pre_mlp_layernorm=TENorm, mlp=moe, mlp_bda=get_bias_dropout_add ), ), + mtp_block_spec=_mamba_mtp_block_spec, ), ) + mamba_inference_stack_spec = ModuleSpec( module=MambaStack, submodules=MambaStackSubmodules( @@ -147,5 +179,6 @@ pre_mlp_layernorm=TENorm, mlp=moe, mlp_bda=get_bias_dropout_add ), ), + mtp_block_spec=_mamba_mtp_block_spec, ), ) diff --git a/megatron/core/models/mamba/mamba_model.py b/megatron/core/models/mamba/mamba_model.py index 6d43f5583df..8dd614fdaaa 100644 --- a/megatron/core/models/mamba/mamba_model.py +++ b/megatron/core/models/mamba/mamba_model.py @@ -16,6 +16,11 @@ from megatron.core.tensor_parallel import gather_from_sequence_parallel_region from megatron.core.transformer import TransformerConfig from megatron.core.transformer.enums import ModelType +from megatron.core.transformer.multi_token_prediction import ( + MultiTokenPredictionBlock, + mtp_on_this_rank, + process_mtp_loss, +) from megatron.core.transformer.spec_utils import ModuleSpec, build_module from megatron.core.utils import ( WrappedTensor, @@ -38,7 +43,11 @@ class MambaModel(LanguageModule): hybrid_attention_ratio (float, optional): The target ratio of attention layers to total layers hybrid_mlp_ratio (float, optional): The target ratio of mlp layers to total layers - 
hybrid_override_pattern (str, optional): The hybrid layer pattern to override with + hybrid_override_pattern (str, optional): Unified hybrid layer pattern with optional MTP. + Format: "///..." + Examples: + - "M*M*" -> main decoder only, no MTP + - "M*M*/MM/MM" -> main="M*M*", mtp="MM", 2 depths post_process (bool, optional): Include an output layer (used with pipeline parallelism). Defaults to True. fp16_lm_cross_entropy (bool, optional): Defaults to False. @@ -79,6 +88,7 @@ def __init__( scatter_embedding_sequence_parallel: bool = True, seq_len_interpolation_factor: Optional[float] = None, pg_collection: Optional[ProcessGroupCollection] = None, + vp_stage: Optional[int] = None, ) -> None: super().__init__(config=config, pg_collection=pg_collection) @@ -97,12 +107,27 @@ def __init__( self.parallel_output = parallel_output self.share_embeddings_and_output_weights = share_embeddings_and_output_weights self.position_embedding_type = position_embedding_type + self.vp_stage = vp_stage + + # Parse unified pattern to extract main and MTP components + from megatron.core.ssm.mamba_hybrid_layer_allocation import parse_hybrid_pattern + + parsed = parse_hybrid_pattern(hybrid_override_pattern) + self.mtp_pattern = parsed.mtp_pattern + self.mtp_num_depths = parsed.mtp_num_depths + + # Determine if MTP is needed (based on pattern parsing) + self.mtp_process = ( + self.mtp_pattern is not None + and self.mtp_num_depths > 0 + and mtp_on_this_rank(self.config, vp_stage=self.vp_stage) + ) # megatron core pipelining currently depends on model type # TODO: remove this dependency ? 
self.model_type = ModelType.encoder_or_decoder - if self.pre_process: + if self.pre_process or self.mtp_process: self.embedding = LanguageModelEmbedding( config=self.config, vocab_size=self.vocab_size, @@ -128,14 +153,33 @@ def __init__( pre_process=self.pre_process, hybrid_attention_ratio=self.hybrid_attention_ratio, hybrid_mlp_ratio=self.hybrid_mlp_ratio, - hybrid_override_pattern=self.hybrid_override_pattern, + hybrid_override_pattern=parsed.main_pattern, post_process=self.post_process, dtype=config.params_dtype, pg_collection=self.pg_collection, ) + # MTP block - uses mtp_block_spec from mamba_stack_spec.submodules + if self.mtp_process: + mamba_submodules = mamba_stack_spec.submodules + mtp_block_spec = mamba_submodules.mtp_block_spec + assert mtp_block_spec is not None, ( + "MTP pattern specified but mtp_block_spec is None in mamba_stack_spec.submodules. " + "Ensure mamba_stack_spec includes mtp_block_spec for MTP support." + ) + + self.mtp = MultiTokenPredictionBlock( + config=self.config, + spec=mtp_block_spec, + pg_collection=self.pg_collection, + vp_stage=self.vp_stage, + mtp_layer_pattern=self.mtp_pattern, + mtp_num_depths=self.mtp_num_depths, + mamba_submodules=mamba_submodules, + ) + # Output - if post_process: + if post_process or self.mtp_process: self.output_layer = tensor_parallel.ColumnParallelLinear( config.hidden_size, self.vocab_size, @@ -149,7 +193,7 @@ def __init__( tp_group=self.pg_collection.tp, ) - if self.pre_process or self.post_process: + if self.pre_process or self.post_process or self.mtp_process: self.setup_embeddings_and_output_layer() for name, module in self.named_modules(): @@ -184,6 +228,7 @@ def forward( runtime_gather_output: Optional[bool] = None, *, inference_params: Optional[BaseInferenceContext] = None, + loss_mask: Optional[Tensor] = None, packed_seq_params: Optional[PackedSeqParams] = None, padding_mask: Optional[Tensor] = None, ) -> Tensor: @@ -258,14 +303,40 @@ def forward( padding_mask=padding_mask, ) - if not 
self.post_process: - return hidden_states - - # logits and loss output_weight = None if self.share_embeddings_and_output_weights: output_weight = self.shared_embedding_or_output_weight() + if self.mtp_process: + hidden_states = self.mtp( + input_ids=input_ids, + position_ids=position_ids, + hidden_states=hidden_states, + attention_mask=attention_mask, + inference_params=inference_params, + rotary_pos_emb=rotary_pos_emb, + packed_seq_params=packed_seq_params, + embedding=self.embedding, + ) + + if not self.post_process: + return hidden_states + + if self.config.mtp_num_layers is not None: + hidden_states = process_mtp_loss( + hidden_states=hidden_states, + labels=labels, + loss_mask=loss_mask, + output_layer=self.output_layer, + output_weight=output_weight, + runtime_gather_output=runtime_gather_output, + is_training=self.training, + compute_language_model_loss=self.compute_language_model_loss, + config=self.config, + cp_group=self.pg_collection.cp, + packed_seq_params=packed_seq_params, + ) + sequence_parallel_override = False if in_inference_mode and inference_context.config.materialize_only_last_token_logits: if inference_context.is_static_batching(): @@ -281,7 +352,7 @@ def forward( self.output_layer.sequence_parallel = False sequence_parallel_override = True - # Reshape [B, 1, H] to [1, B, H] → extract each sample’s true last‐token hidden + # Reshape [B, 1, H] to [1, B, H] → extract each sample's true last‐token hidden # state ([B, H]) → unsqueeze back to [B, 1, H] # (so that the output layer, which expects S×B×H, receives only the final token) hidden_states = inference_context.last_token_logits( diff --git a/megatron/core/pipeline_parallel/schedules.py b/megatron/core/pipeline_parallel/schedules.py index edca62be375..15c5adfc7a2 100644 --- a/megatron/core/pipeline_parallel/schedules.py +++ b/megatron/core/pipeline_parallel/schedules.py @@ -212,7 +212,10 @@ def set_current_microbatch(model, microbatch_id): layer.current_microbatch = microbatch_id if 
hasattr(model_with_decoder, 'mtp'): for layer in model_with_decoder.mtp.layers: - layer.transformer_layer.current_microbatch = microbatch_id + assert hasattr( + layer, 'mtp_model_layer' + ), f"MTP layer {layer} must have 'mtp_model_layer' attribute" + layer.mtp_model_layer.current_microbatch = microbatch_id def forward_step_calc_loss( diff --git a/megatron/core/ssm/mamba_block.py b/megatron/core/ssm/mamba_block.py index 3d684b82dce..48bdba004df 100644 --- a/megatron/core/ssm/mamba_block.py +++ b/megatron/core/ssm/mamba_block.py @@ -42,6 +42,7 @@ class MambaStackSubmodules: attention_layer: Union[ModuleSpec, type] = IdentityOp mlp_layer: Union[ModuleSpec, type] = IdentityOp moe_layer: Union[ModuleSpec, type] = IdentityOp + mtp_block_spec: Optional[ModuleSpec] = None class MambaStack(MegatronModule): @@ -85,12 +86,14 @@ def __init__( device=None, dtype=None, pg_collection: ProcessGroupCollection = None, + is_mtp_layer: bool = False, ) -> None: super().__init__(config=config) self.residual_in_fp32 = residual_in_fp32 self.pre_process = pre_process self.post_layer_norm = post_layer_norm self.post_process = post_process + self.is_mtp_layer = is_mtp_layer assert pg_collection is not None, "pg_collection must be provided for MambaStack" @@ -103,20 +106,32 @@ def __init__( self.hybrid_attention_ratio = hybrid_attention_ratio self.hybrid_mlp_ratio = hybrid_mlp_ratio self.hybrid_override_pattern = hybrid_override_pattern + self.pg_collection = pg_collection + + # For MTP layers, always use pattern length (config.num_layers is for main decoder) + if self.is_mtp_layer: + num_layers_for_allocation = len(self.hybrid_override_pattern) + else: + num_layers_for_allocation = ( + self.config.num_layers + if self.config.num_layers is not None + else len(self.hybrid_override_pattern) + ) self.layer_type_list = allocate_layers( - self.config.num_layers, + num_layers_for_allocation, self.hybrid_attention_ratio, self.hybrid_mlp_ratio, self.hybrid_override_pattern, + 
silent=self.is_mtp_layer, ) pp_layer_offset = 0 - if self.pp_group.size() > 1: + if self.pp_group.size() > 1 and not self.is_mtp_layer: pp_layer_offset, self.layer_type_list = self._select_layers_for_pipeline_parallel( self.layer_type_list ) - + # Build main decoder layers using shared layer builder self.layers = nn.ModuleList() for i, layer_type in enumerate(self.layer_type_list): fp8_init_context = get_fp8_context(self.config, i + pp_layer_offset, is_init=True) @@ -137,9 +152,10 @@ def __init__( config=self.config, layer_number=i + 1, pg_collection=pg_collection, + is_mtp_layer=is_mtp_layer, ) elif layer_type == LayerSymbols.MLP: - # Transformer layers apply their own pp_layer_offset + # MLP layers apply their own pp_layer_offset layer = build_module( submodules.mlp_layer, config=self.config, @@ -147,7 +163,7 @@ def __init__( pg_collection=pg_collection, ) elif layer_type == LayerSymbols.MOE: - # Transformer layers apply their own pp_layer_offset + # MoE layers apply their own pp_layer_offset layer = build_module( submodules.moe_layer, config=self.config, @@ -347,7 +363,7 @@ def forward( # Ensure that the tensor passed between pipeline parallel stages is # viewless. See related notes in TransformerBlock and TransformerLayer - output = make_viewless_tensor( + hidden_states = make_viewless_tensor( inp=hidden_states, requires_grad=hidden_states.requires_grad, keep_graph=True ) diff --git a/megatron/core/ssm/mamba_hybrid_layer_allocation.py b/megatron/core/ssm/mamba_hybrid_layer_allocation.py index fe997e2249a..d7002b2915d 100644 --- a/megatron/core/ssm/mamba_hybrid_layer_allocation.py +++ b/megatron/core/ssm/mamba_hybrid_layer_allocation.py @@ -1,7 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
import logging -from typing import Dict, List, Tuple +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple if __name__ != "__main__": from megatron.core.utils import log_single_rank @@ -29,9 +30,129 @@ class Symbols: ATTENTION = "*" MLP = "-" MOE = 'E' + MTP_SEPARATOR = "/" VALID = {MAMBA, ATTENTION, MLP, MOE} +@dataclass +class ParsedHybridPattern: + """Result of parsing a unified hybrid pattern string. + + A unified pattern encodes both the main decoder pattern and the MTP pattern + in a single string using "/" as a separator. + + Format: "<main_pattern>/<mtp_pattern>/<mtp_pattern>/..." + + Examples: + - "M*M*" -> main="M*M*", mtp=None, depths=0 (no MTP) + - "M*M*/MM/MM" -> main="M*M*", mtp="MM", depths=2 + - "MMMM/*M/*M/*M" -> main="MMMM", mtp="*M", depths=3 + + The "/" symbol introduces MTP patterns. Each repeated pattern after the main + decoder represents one MTP prediction depth. + + Attributes: + main_pattern: The main decoder layer pattern (e.g., "M*M*") + mtp_pattern: The MTP layer pattern per depth (e.g., "MM"), or None if no MTP + mtp_num_depths: Number of MTP prediction depths (0 if no MTP) + """ + + main_pattern: Optional[str] + mtp_pattern: Optional[str] + mtp_num_depths: int + + +def parse_hybrid_pattern(pattern: Optional[str]) -> ParsedHybridPattern: + """Parse a unified hybrid pattern string into main and MTP components. + + The pattern uses "/" as a separator between the main decoder pattern and + MTP patterns. Each MTP pattern after the separator represents one prediction + depth. + + Format: "<main_pattern>/<mtp_pattern>/<mtp_pattern>/..."
+ + Args: + pattern: Unified pattern string, e.g., "M*M*/MM/MM" or just "M*M*" + + Returns: + ParsedHybridPattern with main_pattern, mtp_pattern, and mtp_num_depths + + Raises: + ValueError: If MTP patterns are inconsistent (all must be identical) + ValueError: If pattern contains invalid layer symbols + + Examples: + >>> parse_hybrid_pattern("M*M*") + ParsedHybridPattern(main_pattern="M*M*", mtp_pattern=None, mtp_num_depths=0) + + >>> parse_hybrid_pattern("M*M*/MM/MM") + ParsedHybridPattern(main_pattern="M*M*", mtp_pattern="MM", mtp_num_depths=2) + + >>> parse_hybrid_pattern("MMMM/*M/*M/*M") + ParsedHybridPattern(main_pattern="MMMM", mtp_pattern="*M", mtp_num_depths=3) + """ + if pattern is None: + return ParsedHybridPattern(main_pattern=None, mtp_pattern=None, mtp_num_depths=0) + + parts = pattern.split(Symbols.MTP_SEPARATOR) + + if len(parts) == 1: + # No MTP separator found - pattern is main decoder only + main_pattern = parts[0] + _validate_pattern(main_pattern, "main") + return ParsedHybridPattern(main_pattern=main_pattern, mtp_pattern=None, mtp_num_depths=0) + + # First part is main decoder pattern + main_pattern = parts[0] + if main_pattern: + _validate_pattern(main_pattern, "main") + + # Remaining parts are MTP patterns (one per depth) + mtp_parts = parts[1:] + + if not mtp_parts or all(p == "" for p in mtp_parts): + # No MTP patterns after separator + return ParsedHybridPattern( + main_pattern=main_pattern if main_pattern else None, mtp_pattern=None, mtp_num_depths=0 + ) + + # Validate all MTP patterns are identical + mtp_pattern = mtp_parts[0] + for i, part in enumerate(mtp_parts[1:], start=2): + if part != mtp_pattern: + raise ValueError( + f"All MTP patterns must be identical. " + f"Pattern 1 is '{mtp_pattern}', but pattern {i} is '{part}'. 
" + f"Full pattern: '{pattern}'" + ) + + _validate_pattern(mtp_pattern, "MTP") + + return ParsedHybridPattern( + main_pattern=main_pattern if main_pattern else None, + mtp_pattern=mtp_pattern, + mtp_num_depths=len(mtp_parts), + ) + + +def _validate_pattern(pattern: str, pattern_name: str) -> None: + """Validate that a pattern contains only valid layer symbols. + + Args: + pattern: Layer pattern string to validate + pattern_name: Name of pattern for error messages (e.g., "main" or "MTP") + + Raises: + ValueError: If pattern contains invalid symbols + """ + for char in pattern: + if char not in Symbols.VALID: + raise ValueError( + f"In {pattern_name} pattern, '{char}' is not a valid layer symbol. " + f"Valid symbols are: {Symbols.VALID}" + ) + + def _allocate_auto( total_layers_count: int, target_attention_ratio: float, target_mlp_ratio: float ) -> list: @@ -97,19 +218,21 @@ def allocate_layers( target_attention_ratio: float, target_mlp_ratio: float, override_pattern: str = None, + silent: bool = False, ) -> list: """Allocates layers according to the requested distribution of layer types.""" assert total_layers_count > 0 assert target_attention_ratio >= 0.0 and target_attention_ratio <= 1.0 assert target_mlp_ratio >= 0.0 and target_mlp_ratio <= 1.0 assert target_attention_ratio + target_mlp_ratio <= 1.0 + maybe_log_single_rank = (lambda *args, **kwargs: None) if silent else log_single_rank # Note: target_mamba_ratio = 1.0 - target_attention_ratio - target_mlp_ratio layer_type_list = _allocate_auto(total_layers_count, target_attention_ratio, target_mlp_ratio) if override_pattern is not None: layer_type_list_override = _allocate_override(total_layers_count, override_pattern) - log_single_rank(logger, logging.INFO, "Using hybrid override pattern") + maybe_log_single_rank(logger, logging.INFO, "Using hybrid override pattern") if (target_attention_ratio > 0.0 or target_mlp_ratio > 0.0) and not _layer_counts_match( layer_type_list_override, layer_type_list ): @@ -119,13 
+242,15 @@ def allocate_layers( "pattern." ) if layer_type_list_override == layer_type_list: - log_single_rank( + maybe_log_single_rank( logger, logging.INFO, "The override pattern matches the overridden pattern" ) else: - log_single_rank(logger, logging.INFO, "Warning: overriding pattern A with pattern B") - log_single_rank(logger, logging.INFO, f"A: {''.join(layer_type_list)}") - log_single_rank(logger, logging.INFO, f"B: {''.join(layer_type_list_override)}") + maybe_log_single_rank( + logger, logging.INFO, "Warning: overriding pattern A with pattern B" + ) + maybe_log_single_rank(logger, logging.INFO, f"A: {''.join(layer_type_list)}") + maybe_log_single_rank(logger, logging.INFO, f"B: {''.join(layer_type_list_override)}") layer_type_list = layer_type_list_override if target_attention_ratio > 0.0 or target_mlp_ratio > 0.0 or override_pattern is not None: @@ -134,32 +259,32 @@ def allocate_layers( actual_mlp_layers_count = layer_type_list.count(Symbols.MLP) actual_mlp_ratio = actual_mlp_layers_count / total_layers_count allocation_string = "".join(layer_type_list) - log_single_rank( + maybe_log_single_rank( logger, logging.INFO, f"Hybrid allocation ({Symbols.MAMBA} is mamba, " f"{Symbols.ATTENTION} is attention, " f"{Symbols.MLP} is mlp):", ) - log_single_rank(logger, logging.INFO, allocation_string) - log_single_rank( + maybe_log_single_rank(logger, logging.INFO, allocation_string) + maybe_log_single_rank( logger, logging.INFO, f"{actual_attention_layers_count} attention layers in " f"{total_layers_count} total layers.", ) - log_single_rank( + maybe_log_single_rank( logger, logging.INFO, f"Target attention ratio: {target_attention_ratio:.2f}. " f"Actual attention ratio: {actual_attention_ratio:.2f}.", ) - log_single_rank( + maybe_log_single_rank( logger, logging.INFO, f"{actual_mlp_layers_count} mlp layers in " f"{total_layers_count} total layers.", ) - log_single_rank( + maybe_log_single_rank( logger, logging.INFO, f"Target mlp ratio: {target_mlp_ratio:.2f}. 
" diff --git a/megatron/core/transformer/cuda_graphs.py b/megatron/core/transformer/cuda_graphs.py index 3643c42c3ce..dd0dad8eba5 100644 --- a/megatron/core/transformer/cuda_graphs.py +++ b/megatron/core/transformer/cuda_graphs.py @@ -1738,7 +1738,7 @@ def __init__(self, model, config, seq_length, micro_batch_size, optimizers=[]): callables.append(layer) callables_is_mtp.append(False) for layer_number in range(num_mtp_layers): - layer = chunk_with_decoder.mtp.layers[layer_number].transformer_layer + layer = chunk_with_decoder.mtp.layers[layer_number].mtp_model_layer if _layer_is_graphable(layer, config): num_graphable_layers += 1 callables.append(layer) @@ -1855,7 +1855,7 @@ def _get_layer_static_inputs(layer, chunk_of_the_layer): Get the static inputs for a layer. """ assert layer in chunk_of_the_layer.decoder.layers or any( - layer is mtp_layer.transformer_layer for mtp_layer in chunk_of_the_layer.mtp.layers + layer is mtp_layer.mtp_model_layer for mtp_layer in chunk_of_the_layer.mtp.layers ), "Layer is not in the chunk" def get_rotary_pos_emb(transformer_module, transformer_input): diff --git a/megatron/core/transformer/moe/moe_layer.py b/megatron/core/transformer/moe/moe_layer.py index aa5f9658df4..3d9d0b092aa 100644 --- a/megatron/core/transformer/moe/moe_layer.py +++ b/megatron/core/transformer/moe/moe_layer.py @@ -87,10 +87,12 @@ def __init__( config: TransformerConfig, layer_number: Optional[int] = None, pg_collection: Optional[ProcessGroupCollection] = None, + is_mtp_layer: bool = False, ): super(BaseMoELayer, self).__init__(config) self.config = config self.layer_number = layer_number + self.is_mtp_layer = is_mtp_layer self.ep_group = pg_collection.ep # use pg_collection.expt_tp_group as tensor parallel group in this module. 
self.attn_tp_group = pg_collection.tp @@ -140,6 +142,7 @@ def __init__( submodules: Optional[MoESubmodules] = None, layer_number: Optional[int] = None, pg_collection: Optional[ProcessGroupCollection] = None, + is_mtp_layer: bool = False, ): self.submodules = submodules # TODO(Hepteract): delete the usage of the global parallel_state. @@ -147,7 +150,10 @@ def __init__( if pg_collection is None: pg_collection = get_default_pg_collection() super(MoELayer, self).__init__( - config=config, layer_number=layer_number, pg_collection=pg_collection + config=config, + layer_number=layer_number, + pg_collection=pg_collection, + is_mtp_layer=is_mtp_layer, ) # If using mcore cudagraphs, recompute is handled by transformer_layer.MoETransformerLayer self.moe_layer_recompute = ( @@ -163,7 +169,9 @@ def __init__( self.tp_group = pg_collection.tp # Initialize router. - self.router = submodules.router(config=self.config, pg_collection=pg_collection) + self.router = submodules.router( + config=self.config, pg_collection=pg_collection, is_mtp_layer=is_mtp_layer + ) self.tp_group = pg_collection.tp # Initialize latent projections. diff --git a/megatron/core/transformer/moe/router.py b/megatron/core/transformer/moe/router.py index 4be97401748..e42fd1ca8aa 100644 --- a/megatron/core/transformer/moe/router.py +++ b/megatron/core/transformer/moe/router.py @@ -29,7 +29,10 @@ class Router(ABC, MegatronModule): """Base Router class""" def __init__( - self, config: TransformerConfig, pg_collection: Optional[ProcessGroupCollection] = None + self, + config: TransformerConfig, + pg_collection: Optional[ProcessGroupCollection] = None, + is_mtp_layer: bool = False, ) -> None: """ Initialize the Router module. @@ -37,12 +40,14 @@ def __init__( Args: config (TransformerConfig): Configuration object for the Transformer model. pg_collection (ProcessGroupCollection, optional): Process groups for MoE operations. + is_mtp_layer (bool): Flag indicating if this router is part of an MTP layer. 
""" super().__init__(config) self.config = config self.num_experts = self.config.num_moe_experts self.moe_aux_loss_func = None self.layer_number = None + self.is_mtp_layer = is_mtp_layer self.tp_group = pg_collection.tp self.cp_group = pg_collection.cp self.tp_cp_group = pg_collection.tp_cp @@ -145,15 +150,19 @@ class TopKRouter(Router): """ def __init__( - self, config: TransformerConfig, pg_collection: Optional[ProcessGroupCollection] = None + self, + config: TransformerConfig, + pg_collection: Optional[ProcessGroupCollection] = None, + is_mtp_layer: bool = False, ) -> None: """Initialize the zero token dropping router. Args: config (TransformerConfig): The configuration for the transformer model. pg_collection (ProcessGroupCollection, optional): Process groups for MoE operations. + is_mtp_layer (bool): Flag indicating if this router is part of an MTP layer. """ - super().__init__(config=config, pg_collection=pg_collection) + super().__init__(config=config, pg_collection=pg_collection, is_mtp_layer=is_mtp_layer) self.topk = self.config.moe_router_topk self.routing_type = self.config.moe_router_load_balancing_type self.score_function = self.config.moe_router_score_function @@ -438,6 +447,16 @@ def attach_and_log_load_balancing_loss( padding tokens. Can be a Python int or a torch.Tensor (typically 0-d tensor). If None, uses activation.shape[0]. Defaults to None. """ + # When using repeated MTP layers, the loss is counted "mtp_num_layers" times. + # To avoid accumulating the load balancing loss multiple times, we scale it by + # 1/mtp_num_layers so the total loss is correct. + if ( + self.is_mtp_layer + and self.config.mtp_use_repeated_layer + and self.config.mtp_num_layers is not None + ): + aux_loss = aux_loss / self.config.mtp_num_layers + # TODO (zijiey): fix the per_layer_logging for MTP, currently it will incorrectly # add the aux loss logging value to other layer's since it is difficult to get the # correct layer_number for MTP. 
It does not affect the correctness of the calculation @@ -445,10 +464,16 @@ def attach_and_log_load_balancing_loss( num_layers = self.config.num_layers if self.config.mtp_num_layers is not None: num_layers += self.config.mtp_num_layers + + if self.is_mtp_layer: + layer_number = self.layer_number + self.config.num_layers + else: + layer_number = self.layer_number + save_to_aux_losses_tracker( aux_loss_name, aux_loss / aux_loss_coeff, - self.layer_number, + layer_number, num_layers, reduce_group=reduce_group, reduce_group_has_dp=reduce_group_has_dp, @@ -499,11 +524,27 @@ def apply_z_loss(self, logits, padding_mask: Optional[torch.Tensor] = None): else: logits = MoEAuxLossAutoScaler.apply(logits, z_loss) + # When using repeated MTP layers, the same MTP layer is called mtp_num_layers times. + # To avoid accumulating the z_loss multiple times, we scale it by 1/mtp_num_layers + # so the total loss is correct. + if ( + self.is_mtp_layer + and self.config.mtp_use_repeated_layer + and self.config.mtp_num_layers is not None + ): + z_loss = z_loss / self.config.mtp_num_layers + num_layers = self.config.num_layers if self.config.mtp_num_layers is not None: num_layers += self.config.mtp_num_layers + + if self.is_mtp_layer: + layer_number = self.layer_number + self.config.num_layers + else: + layer_number = self.layer_number + save_to_aux_losses_tracker( - "z_loss", z_loss / moe_z_loss_coeff, self.layer_number, num_layers + "z_loss", z_loss / moe_z_loss_coeff, layer_number, num_layers ) return logits diff --git a/megatron/core/transformer/multi_token_prediction.py b/megatron/core/transformer/multi_token_prediction.py index afd7c0516e1..1c431491ca2 100755 --- a/megatron/core/transformer/multi_token_prediction.py +++ b/megatron/core/transformer/multi_token_prediction.py @@ -14,6 +14,7 @@ from megatron.core.fp8_utils import get_fp8_context from megatron.core.models.backends import BackendSpecProvider, LocalSpecProvider from megatron.core.packed_seq_params import PackedSeqParams 
+from megatron.core.pipeline_parallel.utils import is_vp_last_stage from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.tensor_parallel import ( gather_from_tensor_model_parallel_region, @@ -24,7 +25,6 @@ from megatron.core.transformer.spec_utils import ModuleSpec, build_module from megatron.core.transformer.transformer_block import TransformerBlockSubmodules from megatron.core.transformer.transformer_config import TransformerConfig -from megatron.core.transformer.transformer_layer import get_transformer_layer_offset from megatron.core.utils import ( get_pg_rank, is_torch_min_version, @@ -388,7 +388,7 @@ def track_mtp_metrics(loss_scale, iteration, writer, wandb_writer=None, total_lo mtp_losses = tracker["values"] * loss_scale mtp_num_layers = mtp_losses.shape[0] for i in range(mtp_num_layers): - name = f"mtp_{i+1} loss" + name = f"mtp_{i + 1} loss" loss = mtp_losses[i] if total_loss_dict is not None: if name in total_loss_dict: @@ -415,19 +415,19 @@ class MultiTokenPredictionLayerSubmodules: embedding normalization to be applied. eh_proj (Union[ModuleSpec, type]): Specification or instance of the linear projection to be applied. - transformer_layer (Union[ModuleSpec, type]): Specification - or instance of the transformer block to be applied. + mtp_model_layer (Union[ModuleSpec, type]): Specification + or instance of the transformer or mamba block to be applied. """ enorm: Union[ModuleSpec, type] = None hnorm: Union[ModuleSpec, type] = None eh_proj: Union[ModuleSpec, type] = None - transformer_layer: Union[ModuleSpec, type] = None + mtp_model_layer: Union[ModuleSpec, type] = None layer_norm: Union[ModuleSpec, type] = None def get_mtp_layer_spec( - transformer_layer_spec: ModuleSpec, use_transformer_engine: bool + mtp_model_layer_spec: ModuleSpec, use_transformer_engine: bool ) -> ModuleSpec: """Get the MTP layer spec. 
@@ -435,13 +435,13 @@ def get_mtp_layer_spec( ModuleSpec: Module specification with TE modules """ return get_mtp_layer_spec_for_backend( - transformer_layer_spec, + mtp_model_layer_spec, backend=TESpecProvider() if use_transformer_engine else LocalSpecProvider(), ) def get_mtp_layer_spec_for_backend( - transformer_layer_spec: ModuleSpec, backend: BackendSpecProvider + mtp_model_layer_spec: ModuleSpec, backend: BackendSpecProvider ) -> ModuleSpec: """Get the MTP layer spec. @@ -456,7 +456,7 @@ def get_mtp_layer_spec_for_backend( enorm=layer_norm_impl, hnorm=layer_norm_impl, eh_proj=column_parallel_linear_impl, - transformer_layer=transformer_layer_spec, + mtp_model_layer=mtp_model_layer_spec, layer_norm=layer_norm_impl, ), ) @@ -605,6 +605,79 @@ def set_loss_scale(scale: torch.Tensor): MTPLossAutoScaler.main_loss_backward_scale = scale +def process_mtp_loss( + hidden_states: Tensor, + labels: Tensor, + loss_mask: Optional[Tensor], + output_layer: Callable, + output_weight: Optional[Tensor], + runtime_gather_output: Optional[bool], + is_training: bool, + compute_language_model_loss: Callable, + config: TransformerConfig, + cp_group: Optional[torch.distributed.ProcessGroup] = None, + packed_seq_params: Optional[PackedSeqParams] = None, +) -> Tensor: + """Process Multi-Token Prediction (MTP) loss computation. + + This is a standalone function that handles MTP loss computation. It's used on the + post_process rank to split concatenated hidden states and compute MTP losses. + + Args: + hidden_states (Tensor): Hidden states tensor (concatenated with MTP outputs). + labels (Tensor): Ground truth labels. + loss_mask (Optional[Tensor]): Mask for loss computation. If None, uses all ones. + output_layer (Callable): Output layer method to compute logits. + output_weight (Optional[Tensor]): Optional output weight for shared embeddings. + runtime_gather_output (Optional[bool]): Whether to gather output at runtime. + is_training (bool): Whether the model is in training mode. 
+ compute_language_model_loss (Callable): Method to compute language model loss. + config (TransformerConfig): Model configuration containing mtp_num_layers etc. + cp_group (Optional[ProcessGroup]): Context parallelism process group. + packed_seq_params (Optional[PackedSeqParams]): Packed sequence parameters. + + Returns: + Tensor: Updated hidden states after MTP loss processing (first chunk only). + """ + mtp_labels = labels.clone() + hidden_states_list = torch.chunk(hidden_states, 1 + config.mtp_num_layers, dim=0) + hidden_states = hidden_states_list[0] + + if loss_mask is None: + loss_mask = torch.ones_like(mtp_labels) + + for mtp_layer_number in range(config.mtp_num_layers): + mtp_logits, _ = output_layer( + hidden_states_list[mtp_layer_number + 1], + weight=output_weight, + runtime_gather_output=runtime_gather_output, + ) + mtp_labels, _ = roll_tensor( + mtp_labels, shifts=-1, dims=-1, cp_group=cp_group, packed_seq_params=packed_seq_params + ) + loss_mask, num_tokens = roll_tensor( + loss_mask, shifts=-1, dims=-1, cp_group=cp_group, packed_seq_params=packed_seq_params + ) + mtp_loss = compute_language_model_loss(mtp_labels, mtp_logits) + mtp_loss = loss_mask * mtp_loss + if is_training: + MTPLossLoggingHelper.save_loss_to_tracker( + torch.sum(mtp_loss) / num_tokens, + mtp_layer_number, + config.mtp_num_layers, + avg_group=parallel_state.get_data_parallel_group(with_context_parallel=True), + ) + mtp_loss_scale = config.mtp_loss_scaling_factor / config.mtp_num_layers + if config.calculate_per_token_loss: + hidden_states = MTPLossAutoScaler.apply(hidden_states, mtp_loss_scale * mtp_loss) + else: + hidden_states = MTPLossAutoScaler.apply( + hidden_states, mtp_loss_scale * mtp_loss / num_tokens + ) + + return hidden_states + + class MultiTokenPredictionLayer(MegatronModule): """The implementation for Multi-Token Prediction (MTP) which extends the prediction scope to multiple future tokens at each position. 
@@ -632,6 +705,9 @@ def __init__( layer_number: int = 1, vp_stage: Optional[int] = None, pg_collection: Optional[ProcessGroupCollection] = None, + # For Mamba path - pattern and submodules to build inner layers directly + mtp_layer_pattern: Optional[str] = None, + mamba_submodules: Optional["MambaStackSubmodules"] = None, ): super().__init__(config=config) self.sequence_parallel = config.sequence_parallel @@ -639,14 +715,31 @@ def __init__( self.layer_number = layer_number + get_mtp_layer_offset(self.config, vp_stage) self.vp_stage = vp_stage self.cp_group = pg_collection.cp + self.mtp_layer_pattern = mtp_layer_pattern - self_attention_spec = self.submodules.transformer_layer.submodules.self_attention - attn_mask_type = self_attention_spec.params.get('attn_mask_type', '') - assert attn_mask_type in SUPPORTED_ATTN_MASK, ( - f"Multi-Token Prediction (MTP) is not jet supported with " - + f"{attn_mask_type} attention mask type." - + f"The supported attention mask types are {SUPPORTED_ATTN_MASK}." - ) + # Validate attention mask type if using transformer-based inner layers + if self.submodules.mtp_model_layer is not None and hasattr( + self.submodules.mtp_model_layer, 'submodules' + ): + if hasattr(self.submodules.mtp_model_layer.submodules, 'attention_layer'): + self_attention_spec = self.submodules.mtp_model_layer.submodules.attention_layer + if self_attention_spec.submodules.self_attention is not None: + self_attention_spec = self_attention_spec.submodules.self_attention + attn_mask_type = self_attention_spec.params.get('attn_mask_type', '') + assert attn_mask_type in SUPPORTED_ATTN_MASK, ( + f"Multi-Token Prediction (MTP) is not yet supported with " + f"{attn_mask_type} attention mask type. " + f"The supported attention mask types are {SUPPORTED_ATTN_MASK}." 
+ ) + elif hasattr(self.submodules.mtp_model_layer.submodules, 'self_attention'): + self_attention_spec = self.submodules.mtp_model_layer.submodules.self_attention + if self_attention_spec is not None: + attn_mask_type = self_attention_spec.params.get('attn_mask_type', '') + assert attn_mask_type in SUPPORTED_ATTN_MASK, ( + f"Multi-Token Prediction (MTP) is not yet supported with " + f"{attn_mask_type} attention mask type. " + f"The supported attention mask types are {SUPPORTED_ATTN_MASK}." + ) self.enorm = build_module( self.submodules.enorm, @@ -677,17 +770,37 @@ def __init__( bias=False, skip_bias_add=False, is_expert=False, + tp_comm_buffer_name="mtp_eh_proj", ) - diff_transformer_layer_offset = self.config.num_layers - get_transformer_layer_offset( - self.config, vp_stage - ) - self.transformer_layer = build_module( - self.submodules.transformer_layer, - config=self.config, - vp_stage=vp_stage, - layer_number=self.layer_number + diff_transformer_layer_offset, - ) + # Build inner layers: two possible paths + # 1. Mamba path: use MambaStack for hybrid pattern support + # 2. GPT path: single TransformerLayer + if mtp_layer_pattern is not None and mamba_submodules is not None: + from megatron.core.ssm.mamba_block import MambaStack + + self.mtp_model_layer = MambaStack( + config=self.config, + submodules=mamba_submodules, + hybrid_override_pattern=mtp_layer_pattern, + pre_process=True, # Always receives input from eh_proj + post_layer_norm=False, # MTP has its own final_layernorm + post_process=True, # MTP layer is self-contained + pg_collection=pg_collection, + is_mtp_layer=True, + ) + elif self.config.mtp_num_layers is not None: + # GPT path: Uses the transformer block spec for MTP layer + # MTP inner layers use their own layer numbering (self.layer_number = 1, 2, etc.) + # rather than continuing from decoder layer numbers. This is consistent with the + # Mamba path and ensures proper aux loss tracking in router.py. 
+ self.mtp_model_layer = build_module( + self.submodules.mtp_model_layer, + config=self.config, + vp_stage=self.vp_stage, + layer_number=self.layer_number, + is_mtp_layer=True, + ) self.final_layernorm = build_module( self.submodules.layer_norm, @@ -798,7 +911,6 @@ def _proj_and_transformer_layer( transformer_layer_fp8_context = nullcontext() # TODO: currently ignoring FP4 in MTP layers because we need more numerical validation - with rng_context: with fp8_context: hidden_states = self._concat_embeddings(hidden_states, decoder_input) @@ -807,19 +919,29 @@ def _proj_and_transformer_layer( # transformer layer is cudagraphed, the FP8GlobalStateManager.is_first_fp8_module() is # True so that the fp8 weight caching can be triggered correctly. with transformer_layer_fp8_context: - hidden_states, _ = self.transformer_layer( - hidden_states=hidden_states, - attention_mask=attention_mask, - context=context, - context_mask=context_mask, - rotary_pos_emb=rotary_pos_emb, - rotary_pos_cos=rotary_pos_cos, - rotary_pos_sin=rotary_pos_sin, - attention_bias=attention_bias, - inference_params=inference_params, - packed_seq_params=packed_seq_params, - sequence_len_offset=sequence_len_offset, - ) + if self.mtp_layer_pattern is not None: + hidden_states = self.mtp_model_layer( + hidden_states=hidden_states, + attention_mask=attention_mask, + rotary_pos_emb=rotary_pos_emb, + inference_context=inference_params, + packed_seq_params=packed_seq_params, + ) + else: + # GPT path: single TransformerLayer + hidden_states, _ = self.mtp_model_layer( + hidden_states=hidden_states, + attention_mask=attention_mask, + context=context, + context_mask=context_mask, + rotary_pos_emb=rotary_pos_emb, + rotary_pos_cos=rotary_pos_cos, + rotary_pos_sin=rotary_pos_sin, + attention_bias=attention_bias, + inference_params=inference_params, + packed_seq_params=packed_seq_params, + sequence_len_offset=sequence_len_offset, + ) hidden_states = self._postprocess(hidden_states) @@ -916,8 +1038,7 @@ def forward( 
Union[Tensor, Tuple[Tensor, Tensor]]: The output hidden states tensor of shape [s, b, h], and optionally the updated context tensor if cross-attention is used. """ - assert context is None, f"multi token prediction + cross attention is not yet supported." - + assert context is None, "multi token prediction + cross attention is not yet supported." input_ids, position_ids, decoder_input, hidden_states = self._get_embeddings( input_ids=input_ids, position_ids=position_ids, @@ -1041,6 +1162,9 @@ class MultiTokenPredictionBlock(MegatronModule): the linear projection. The combined serves as the input of the Transformer block at the k-th depth to produce the output representation. + When `mtp_use_repeated_layer=True` in config, instead of creating N separate MTP layers, + only 1 layer is created and applied mtp_num_layers times. + for more information, please refer to DeepSeek-V3 Technical Report https://github.com/deepseek-ai/DeepSeek-V3/blob/main/DeepSeek_V3.pdf """ @@ -1051,11 +1175,26 @@ def __init__( spec: Union[TransformerBlockSubmodules, ModuleSpec], vp_stage: Optional[int] = None, pg_collection: Optional[ProcessGroupCollection] = None, + # New: For Mamba path with unified pattern syntax + mtp_layer_pattern: Optional[str] = None, + mtp_num_depths: int = 0, + mamba_submodules: Optional["MambaStackSubmodules"] = None, ): super().__init__(config=config) self.submodules = _get_mtp_block_submodules(config, spec) self.mtp_loss_scaling_factor = config.mtp_loss_scaling_factor self.vp_stage = vp_stage + self.mtp_layer_pattern = mtp_layer_pattern + self.mtp_num_depths = mtp_num_depths + self.mamba_submodules = mamba_submodules + self.mtp_use_repeated_layer = self.config.mtp_use_repeated_layer + + vp_size = config.virtual_pipeline_model_parallel_size + assert is_vp_last_stage(vp_stage=vp_stage, vp_size=vp_size), ( + f"MTP layers must be placed on the last virtual pipeline stage. " + f"Got vp_stage={vp_stage} with vp_size={vp_size}. 
" + f"Placing MTP layers on different VPP stages is not currently supported." + ) # Initialize Context Parallelism (CP) support for MTP # This enables MTP to work with CP > 1 by providing the CP process group @@ -1074,7 +1213,14 @@ def __init__( self.cp_group = pg_collection.cp def _build_layers(self, pg_collection): - def build_layer(layer_spec, layer_number): + # Determine number of depths to build + if self.mtp_num_depths > 0: + num_depths = self.mtp_num_depths + else: + num_depths = self.config.mtp_num_layers or len(self.submodules.layer_specs) + + def build_layer_legacy(layer_spec, layer_number): + """Build layer using legacy spec-based approach.""" fp8_init_context = get_fp8_context(self.config, is_init=True) with fp8_init_context: module = build_module( @@ -1083,15 +1229,71 @@ def build_layer(layer_spec, layer_number): layer_number=layer_number, vp_stage=self.vp_stage, pg_collection=pg_collection, + mtp_layer_pattern=self.mtp_layer_pattern, ) return module - self.layers = torch.nn.ModuleList( - [ - build_layer(layer_spec, i + 1) - for i, layer_spec in enumerate(self.submodules.layer_specs) - ] - ) + def build_layer_with_pattern(layer_spec, layer_number, mtp_layer_pattern, mamba_submodules): + """Build layer using pattern-based approach (new Mamba path).""" + fp8_init_context = get_fp8_context(self.config, is_init=True) + with fp8_init_context: + module = build_module( + layer_spec, + config=self.config, + layer_number=layer_number, + vp_stage=self.vp_stage, + pg_collection=pg_collection, + mtp_layer_pattern=mtp_layer_pattern, + mamba_submodules=mamba_submodules, + ) + return module + + # New Mamba path: use mtp_layer_pattern and mamba_submodules + if self.mtp_layer_pattern is not None and self.mamba_submodules is not None: + if self.mtp_use_repeated_layer: + # Shared/repeated layer: build one layer, use it for all depths + layer_spec = self.submodules.layer_specs[0] + shared_layer = build_layer_with_pattern( + layer_spec, + layer_number=1, + 
mtp_layer_pattern=self.mtp_layer_pattern, + mamba_submodules=self.mamba_submodules, + ) + self.layers = torch.nn.ModuleList([shared_layer]) + else: + # Non-shared: each depth gets its own layers + self.layers = torch.nn.ModuleList( + [ + build_layer_with_pattern( + self.submodules.layer_specs[ + min(i, len(self.submodules.layer_specs) - 1) + ], + layer_number=i + 1, + mtp_layer_pattern=self.mtp_layer_pattern, + mamba_submodules=self.mamba_submodules, + ) + for i in range(num_depths) + ] + ) + elif self.mtp_use_repeated_layer: + # Legacy repeated layer mode + if len(self.submodules.layer_specs) != 1: + warnings.warn( + "Repeated MTP mode expects exactly 1 layer spec, got " + f"{len(self.submodules.layer_specs)} instead. " + f"The first layer will be applied {self.config.mtp_num_layers} times." + ) + self.layers = torch.nn.ModuleList( + [build_layer_legacy(self.submodules.layer_specs[0], layer_number=1)] + ) + else: + # Legacy mode: build from layer_specs + self.layers = torch.nn.ModuleList( + [ + build_layer_legacy(layer_spec, i + 1) + for i, layer_spec in enumerate(self.submodules.layer_specs) + ] + ) def forward( self, @@ -1127,8 +1329,9 @@ def forward( offset = get_mtp_layer_offset(self.config, self.vp_stage) hidden_states_list = list(torch.chunk(hidden_states, 1 + offset, dim=0)) hidden_states = hidden_states_list[offset] - for layer_number in range(len(self.layers)): - (hidden_states, input_ids, position_ids) = self.layers[layer_number]( + for iteration in range(self.config.mtp_num_layers): + layer_idx = 0 if self.mtp_use_repeated_layer else iteration + (hidden_states, input_ids, position_ids) = self.layers[layer_idx]( input_ids=input_ids, position_ids=position_ids, hidden_states=hidden_states, @@ -1170,7 +1373,7 @@ def sharded_state_dict( layer_prefix = f'{prefix}layers.' for layer in self.layers: offset = get_mtp_layer_offset(self.config, self.vp_stage) - sharded_prefix = f'{layer_prefix}{layer.layer_number - 1 }.' 
+ sharded_prefix = f'{layer_prefix}{layer.layer_number - 1}.' state_dict_prefix = f'{layer_prefix}{layer.layer_number - 1 - offset}.' sharded_pp_offset = [] diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index 48b04c35134..5648657d466 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -59,6 +59,15 @@ class TransformerConfig(ModelParallelConfig): which serves as an additional training objective. """ + mtp_use_repeated_layer: bool = False + """Use a single MTP layer repeatedly instead of multiple separate layers.""" + + mtp_hybrid_override_pattern: Optional[str] = None + """DEPRECATED: Use unified hybrid_override_pattern instead. + Legacy argument for loading old checkpoints. + Force a specific hybrid layer pattern for MTP layers. + """ + num_layers_in_first_pipeline_stage: Optional[int] = None """Number of transformer layers on first pipeline stage. None implies equal layer division across PP ranks.""" diff --git a/megatron/core/transformer/transformer_layer.py b/megatron/core/transformer/transformer_layer.py index 9a3b69e8a77..855f8fe48ae 100644 --- a/megatron/core/transformer/transformer_layer.py +++ b/megatron/core/transformer/transformer_layer.py @@ -268,6 +268,7 @@ def __init__( hidden_dropout: Optional[float] = None, pg_collection: Optional[ProcessGroupCollection] = None, vp_stage: Optional[int] = None, + is_mtp_layer: bool = False, ): self.submodules_config = submodules super().__init__(config=config, vp_stage=vp_stage) @@ -277,10 +278,18 @@ def __init__( self.pg_collection = pg_collection self.tp_group = pg_collection.tp - self.layer_number = layer_number + get_transformer_layer_offset( - self.config, vp_stage, get_pg_rank(pg_collection.pp) - ) + # MTP inner layers use their own layer numbering (starting from 1 within each MTP depth), + # so they should NOT add the decoder layer offset. 
The router.py handles MTP layer + # numbering separately by adding config.num_layers to distinguish MTP layers from decoder + # layers in the aux loss tracker. + if is_mtp_layer: + self.layer_number = layer_number + else: + self.layer_number = layer_number + get_transformer_layer_offset( + self.config, vp_stage, get_pg_rank(pg_collection.pp) + ) self.hidden_dropout = config.hidden_dropout if hidden_dropout is None else hidden_dropout + self.is_mtp_layer = is_mtp_layer # [Module 1: Input Layernorm] Optional Layernorm on the input data # TODO: add pytorch only layernorm @@ -351,6 +360,9 @@ def __init__( if isinstance(submodules.mlp, ModuleSpec): if submodules.mlp.module in (MoELayer, GroupedMLP, TEGroupedMLP, SequentialMLP): additional_mlp_kwargs["pg_collection"] = pg_collection + # Pass is_mtp_layer flag to MoELayer to distinguish MTP MoE layers. + if submodules.mlp.module == MoELayer: + additional_mlp_kwargs["is_mtp_layer"] = self.is_mtp_layer elif submodules.mlp.module == MLP: assert hasattr( pg_collection, 'tp' diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 9951203f18f..e4ac9c3f15c 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -504,6 +504,79 @@ def validate_args(args, defaults={}): print_rank_0('setting global batch size to {}'.format(args.global_batch_size)) assert args.global_batch_size > 0 + # === MTP validation === + # Deprecation warnings for legacy MTP arguments + if args.mtp_hybrid_override_pattern is not None: + warn_rank_0( + "--mtp-hybrid-override-pattern is deprecated. " + "For new hybrid models with MTP models, use unified --hybrid-override-pattern instead. " + "Example: 'M*M*/MM/MM' means main='M*M*', MTP pattern='MM' with 2 depths. 
" + "This argument is kept only for loading old checkpoints.", + args.rank, + ) + + # Backward compatibility: convert legacy mtp_hybrid_override_pattern to unified format + from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols, parse_hybrid_pattern + sep = Symbols.MTP_SEPARATOR + if ( + getattr(args, 'mtp_hybrid_override_pattern', None) is not None + and args.mtp_num_layers is not None + and args.mtp_num_layers > 0 + and (args.hybrid_override_pattern is None or sep not in args.hybrid_override_pattern) + ): + main_pattern = args.hybrid_override_pattern or '' + mtp_pattern = args.mtp_hybrid_override_pattern + args.hybrid_override_pattern = main_pattern + sep + sep.join([mtp_pattern] * args.mtp_num_layers) + args.mtp_hybrid_override_pattern = None + print_rank_0(f"Converted legacy MTP pattern to unified: {args.hybrid_override_pattern}") + + # Infer mtp_num_layers from unified pattern + if args.hybrid_override_pattern and sep in args.hybrid_override_pattern: + parsed = parse_hybrid_pattern(args.hybrid_override_pattern) + if parsed.mtp_pattern and parsed.mtp_num_depths > 0: + inferred_mtp_num_layers = parsed.mtp_num_depths + if args.mtp_num_layers is None: + args.mtp_num_layers = inferred_mtp_num_layers + elif args.mtp_num_layers != inferred_mtp_num_layers: + warn_rank_0( + f"--mtp-num-layers ({args.mtp_num_layers}) conflicts with " + f"MTP depth count ({inferred_mtp_num_layers}) in pattern '{args.hybrid_override_pattern}'. " + f"Using the inferred value ({inferred_mtp_num_layers}).", + args.rank + ) + args.mtp_num_layers = inferred_mtp_num_layers + + # MTP validation + if args.mtp_num_layers: + assert not args.use_legacy_models, "The legacy Megatron models does not support Multi-Token Prediction (MTP)." + assert args.position_embedding_type == "rope" or args.position_embedding_type == "none", ( + f"Multi-Token Prediction (MTP) is not supported with {args.position_embedding_type} position embedding type." 
+ + f"The supported position embedding types are rope and none." + ) + + # Validate MTP args for hybrid vs non-hybrid models + if args.is_hybrid_model: + # Mamba/hybrid model MTP validation + if args.mtp_num_layers and not (args.hybrid_override_pattern and sep in args.hybrid_override_pattern): + # Hybrid model wants MTP but no unified pattern - check for legacy args + if args.mtp_hybrid_override_pattern is None: + warn_rank_0( + "Hybrid model with --mtp-num-layers but no MTP pattern. " + "Use unified --hybrid-override-pattern with '/' separator (e.g., 'M*M*/MM/MM') " + "or legacy --mtp-hybrid-override-pattern for old checkpoints.", + args.rank + ) + else: + # Non-hybrid (GPT) model MTP validation + if args.mtp_hybrid_override_pattern is not None: + warn_rank_0( + "--mtp-hybrid-override-pattern is for Mamba/hybrid models only. " + "For GPT models, MTP replicates the main transformer layer structure. " + "This argument will be ignored.", + args.rank + ) + # === End of MTP validation === + # Uneven virtual pipeline parallelism assert ( int(args.num_layers_per_virtual_pipeline_stage is not None) diff --git a/megatron/training/checkpointing.py b/megatron/training/checkpointing.py index a3d307f1e30..f964b8dd32e 100644 --- a/megatron/training/checkpointing.py +++ b/megatron/training/checkpointing.py @@ -1418,6 +1418,12 @@ def _set_arg(arg_name, old_arg_name=None, force=False): _set_arg('hidden_dropout', force=True) _set_arg('hybrid_override_pattern', force=True) + + # Legacy MTP pattern for old checkpoints + _set_arg('mtp_hybrid_override_pattern', force=True) + _set_arg('mtp_num_layers', force=True) + _set_arg('mtp_use_repeated_layer', force=True) + _set_arg('spec', force=True) _set_arg('hybrid_attention_ratio', force=True) _set_arg('hybrid_mlp_ratio', force=True) diff --git a/megatron/training/training.py b/megatron/training/training.py index 563c228367f..02599d99ea6 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -227,10 +227,20 @@ def 
num_floating_point_operations(args, batch_size): def calculate_layer_counts(): """Calculate the number of attention, Mamba, and MLP layers.""" if args.hybrid_override_pattern: - counts = {'M': 0, '*': 0, '-': 0, 'E':0} - for layer_type in args.hybrid_override_pattern: - if layer_type in counts: - counts[layer_type] += 1 + from megatron.core.ssm.mamba_hybrid_layer_allocation import parse_hybrid_pattern + # Parse unified pattern to separate main and MTP components + parsed = parse_hybrid_pattern(args.hybrid_override_pattern) + counts = {'M': 0, '*': 0, '-': 0, 'E': 0} + # Count main decoder layers + if parsed.main_pattern: + for layer_type in parsed.main_pattern: + if layer_type in counts: + counts[layer_type] += 1 + # Count MTP layers (pattern repeated mtp_num_depths times) + if parsed.mtp_pattern and parsed.mtp_num_depths > 0: + for layer_type in parsed.mtp_pattern: + if layer_type in counts: + counts[layer_type] += parsed.mtp_num_depths return counts['*'], counts['M'], counts['-'], counts['E'] else: num_attn_layers = round(args.num_layers * args.hybrid_attention_ratio) @@ -307,7 +317,7 @@ def hybrid_flops(batch_size, seq_len, hidden_size, mlp_expansion=4.0, swiglu=False, moe_latent_size=None, moe_ffn_hidden_size=2048, shared_expert_ffn_hidden_size=2048, num_experts_routed_to=1, - vocab_size=256000): + vocab_size=256000, mtp_num_layers=0): """Calculate total FLOPs for the hybrid model.""" flops_fwd = ( num_attn_layers * attn_layer_flops(batch_size, seq_len, hidden_size, @@ -320,7 +330,7 @@ def hybrid_flops(batch_size, seq_len, hidden_size, num_moe_layers * moe_layer_flops(batch_size, seq_len, hidden_size, moe_ffn_hidden_size, shared_expert_ffn_hidden_size, num_experts_routed_to, moe_latent_size, swiglu) + - (2 * batch_size * seq_len * hidden_size * vocab_size) # logits computation + (2 * batch_size * seq_len * hidden_size * vocab_size * (1 + mtp_num_layers)) # logits computation ) return flops_fwd * 3 @@ -599,6 +609,9 @@ def transformer_flops(): # Calculate the 
number of each type of layer. num_attn_layers, num_mamba_layers, num_mlp_layers, num_moe_layers = calculate_layer_counts() + mtp_num_layers = args.mtp_num_layers + if mtp_num_layers is None: + mtp_num_layers = 0 # Compute hybrid model FLOPs. return hybrid_flops( batch_size=batch_size, @@ -625,6 +638,7 @@ def transformer_flops(): else args.moe_shared_expert_intermediate_size), num_experts_routed_to=args.moe_router_topk, vocab_size=args.padded_vocab_size, + mtp_num_layers=mtp_num_layers, ) else: # Compute standard Transformer model FLOPs. diff --git a/pretrain_mamba.py b/pretrain_mamba.py index e1379be63e9..c41c485c866 100644 --- a/pretrain_mamba.py +++ b/pretrain_mamba.py @@ -257,6 +257,7 @@ def forward_step(data_iterator, model: MambaModel): attention_mask, labels=labels, packed_seq_params=packed_seq_params, + loss_mask=loss_mask ) # [ModelOpt]: model is needed to access ModelOpt distillation losses diff --git a/tests/unit_tests/models/test_mamba_moe_model.py b/tests/unit_tests/models/test_mamba_moe_model.py index a5590a0ffad..f933d811779 100644 --- a/tests/unit_tests/models/test_mamba_moe_model.py +++ b/tests/unit_tests/models/test_mamba_moe_model.py @@ -194,9 +194,11 @@ "moe_z_loss_coeff": None, "moe_enable_routing_replay": False, "mrope_section": None, + "mtp_hybrid_override_pattern": None, "mtp_loss_scaling_factor": 0.1, "mtp_num_layers": None, "mtp_standalone": False, + "mtp_use_repeated_layer": False, "multi_latent_attention": False, "nccl_all_reduce_for_prefill": False, "no_rope_freq": None, diff --git a/tests/unit_tests/ssm/test_mamba_hybrid_layer_allocation.py b/tests/unit_tests/ssm/test_mamba_hybrid_layer_allocation.py index 77d02c69607..77c106c3bee 100644 --- a/tests/unit_tests/ssm/test_mamba_hybrid_layer_allocation.py +++ b/tests/unit_tests/ssm/test_mamba_hybrid_layer_allocation.py @@ -6,7 +6,12 @@ import pytest import torch -from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols, allocate_layers +from 
megatron.core.ssm.mamba_hybrid_layer_allocation import ( + ParsedHybridPattern, + Symbols, + allocate_layers, + parse_hybrid_pattern, +) @pytest.mark.internal @@ -75,3 +80,135 @@ def test_wrong_length_override_pattern(self): def test_wrong_number_of_layer_types_in_override_pattern(self): # This override_pattern has too many mlps and not enough attention layer_types = allocate_layers(8, 0.5, 0.25, "M*--M**-") + + +@pytest.mark.internal +class TestParseHybridPattern: + """Tests for parse_hybrid_pattern with unified pattern syntax.""" + + def test_none_pattern(self): + """Test that None pattern returns all None values.""" + result = parse_hybrid_pattern(None) + assert result.main_pattern is None + assert result.mtp_pattern is None + assert result.mtp_num_depths == 0 + + def test_main_pattern_only(self): + """Test patterns without MTP (no / separator).""" + test_cases = [ + ("M*M*", "M*M*"), + ("MMMM", "MMMM"), + ("*M*M", "*M*M"), + ("MM-*", "MM-*"), + ("E", "E"), + ] + for pattern, expected_main in test_cases: + result = parse_hybrid_pattern(pattern) + assert result.main_pattern == expected_main, f"Failed for pattern: {pattern}" + assert result.mtp_pattern is None + assert result.mtp_num_depths == 0 + + def test_main_with_single_mtp_depth(self): + """Test patterns with 1 MTP depth.""" + test_cases = [ + ("M*M*/MM", "M*M*", "MM", 1), + ("MMMM/*M", "MMMM", "*M", 1), + ("M/M", "M", "M", 1), + ] + for pattern, expected_main, expected_mtp, expected_depths in test_cases: + result = parse_hybrid_pattern(pattern) + assert result.main_pattern == expected_main, f"Failed for pattern: {pattern}" + assert result.mtp_pattern == expected_mtp, f"Failed for pattern: {pattern}" + assert result.mtp_num_depths == expected_depths, f"Failed for pattern: {pattern}" + + def test_main_with_multiple_mtp_depths(self): + """Test patterns with multiple MTP depths.""" + test_cases = [ + ("M*M*/MM/MM", "M*M*", "MM", 2), + ("M*M*/MM/MM/MM", "M*M*", "MM", 3), + ("MMMM/*M/*M/*M", "MMMM", "*M", 3), + 
("M*/*/*/*", "M*", "*", 3), + ("M/M/M/M/M", "M", "M", 4), + ] + for pattern, expected_main, expected_mtp, expected_depths in test_cases: + result = parse_hybrid_pattern(pattern) + assert result.main_pattern == expected_main, f"Failed for pattern: {pattern}" + assert result.mtp_pattern == expected_mtp, f"Failed for pattern: {pattern}" + assert result.mtp_num_depths == expected_depths, f"Failed for pattern: {pattern}" + + def test_mtp_patterns_must_be_identical(self): + """Test that mismatched MTP patterns raise ValueError.""" + invalid_patterns = [ + "M*M*/MM/M*", # MM != M* + "M*M*/MM/MM/M", # MM != M + "MMMM/*M/M*", # *M != M* + ] + for pattern in invalid_patterns: + with pytest.raises(ValueError, match="All MTP patterns must be identical"): + parse_hybrid_pattern(pattern) + + def test_invalid_symbols_in_main_pattern(self): + """Test that invalid symbols in main pattern raise ValueError.""" + invalid_patterns = [ + "M*X*", # X is not valid + "MaMM", # a is not valid + "M*M*1", # 1 is not valid + ] + for pattern in invalid_patterns: + with pytest.raises(ValueError, match="not a valid layer symbol"): + parse_hybrid_pattern(pattern) + + def test_invalid_symbols_in_mtp_pattern(self): + """Test that invalid symbols in MTP pattern raise ValueError.""" + # Single MTP depth with invalid symbol - should raise "not a valid layer symbol" + with pytest.raises(ValueError, match="not a valid layer symbol"): + parse_hybrid_pattern("M*M*/MX") # X is not valid + + # Multiple MTP depths with invalid symbol and matching patterns + with pytest.raises(ValueError, match="not a valid layer symbol"): + parse_hybrid_pattern("M*M*/Ma/Ma") # a is not valid + + # Multiple MTP depths with invalid symbol but mismatched patterns + # This raises "All MTP patterns must be identical" before checking symbols + with pytest.raises(ValueError, match="All MTP patterns must be identical"): + parse_hybrid_pattern("M*M*/MM/Ma") + + def test_empty_main_pattern_with_mtp(self): + """Test pattern that starts 
with / (empty main pattern).""" + result = parse_hybrid_pattern("/MM/MM") + assert result.main_pattern is None + assert result.mtp_pattern == "MM" + assert result.mtp_num_depths == 2 + + def test_trailing_separator(self): + """Test patterns with trailing separator.""" + # "M*M*/" means main="M*M*", one empty MTP pattern + result = parse_hybrid_pattern("M*M*/") + assert result.main_pattern == "M*M*" + # Empty string after separator means no valid MTP pattern + assert result.mtp_pattern is None + assert result.mtp_num_depths == 0 + + def test_complex_patterns(self): + """Test more complex realistic patterns.""" + test_cases = [ + # Main decoder with attention, MTP with mamba only + ("M*M*M*M*/MMM/MMM", "M*M*M*M*", "MMM", 2), + # Main decoder with MLP, MTP with attention+mamba + ("MM-MM-/*M/*M", "MM-MM-", "*M", 2), + # All attention main, mamba MTP + ("*****/M/M/M/M", "*****", "M", 4), + # MoE in main pattern + ("MEME/MM/MM", "MEME", "MM", 2), + ] + for pattern, expected_main, expected_mtp, expected_depths in test_cases: + result = parse_hybrid_pattern(pattern) + assert result.main_pattern == expected_main, f"Failed for pattern: {pattern}" + assert result.mtp_pattern == expected_mtp, f"Failed for pattern: {pattern}" + assert result.mtp_num_depths == expected_depths, f"Failed for pattern: {pattern}" + + def test_dataclass_equality(self): + """Test that ParsedHybridPattern supports equality comparison.""" + p1 = parse_hybrid_pattern("M*M*/MM/MM") + p2 = ParsedHybridPattern(main_pattern="M*M*", mtp_pattern="MM", mtp_num_depths=2) + assert p1 == p2 diff --git a/tests/unit_tests/transformer/test_multi_token_prediction.py b/tests/unit_tests/transformer/test_multi_token_prediction.py index 05fb2c4fe63..ec72d713eb1 100644 --- a/tests/unit_tests/transformer/test_multi_token_prediction.py +++ b/tests/unit_tests/transformer/test_multi_token_prediction.py @@ -13,6 +13,8 @@ get_gpt_mtp_block_spec, ) from megatron.core.models.gpt.gpt_model import GPTModel +from 
megatron.core.models.mamba.mamba_layer_specs import mamba_stack_spec +from megatron.core.models.mamba.mamba_model import MambaModel from megatron.core.num_microbatches_calculator import destroy_num_microbatches_calculator from megatron.core.packed_seq_params import PackedSeqParams from megatron.core.parallel_state import get_context_parallel_group @@ -94,7 +96,7 @@ def test_constructor_local(self, tp): assert mtp.layers[i].hnorm.weight.shape[0] == config.hidden_size assert mtp.layers[i].eh_proj.weight.shape[0] == config.hidden_size / tp assert mtp.layers[i].eh_proj.weight.shape[1] == config.hidden_size * 2 - assert mtp.layers[i].transformer_layer is not None + assert mtp.layers[i].mtp_model_layer is not None num_weights = sum([p.numel() for p in mtp.parameters()]) if tp == 1: assert num_weights == 58560 * config.mtp_num_layers @@ -120,7 +122,7 @@ def test_constructor_ues_te(self, tp, cp): assert mtp.layers[i].hnorm.weight.shape[0] == config.hidden_size assert mtp.layers[i].eh_proj.weight.shape[0] == config.hidden_size / tp assert mtp.layers[i].eh_proj.weight.shape[1] == config.hidden_size * 2 - assert mtp.layers[i].transformer_layer is not None + assert mtp.layers[i].mtp_model_layer is not None num_weights = sum([p.numel() for p in mtp.parameters()]) if tp == 1: assert num_weights == 58560 * config.mtp_num_layers @@ -162,7 +164,7 @@ def model_provider( config=config, transformer_layer_spec=transformer_layer_spec, mtp_block_spec=mtp_block_spec, - vocab_size=args.vocal_size, + vocab_size=args.vocab_size, max_sequence_length=args.max_position_embeddings, pre_process=pre_process, post_process=post_process, @@ -186,7 +188,7 @@ def create_test_args( args.num_layers = 2 args.mtp_num_layers = 2 args.mtp_loss_scaling_factor = 0.1 - args.vocal_size = 128800 + args.vocab_size = 128800 args.hidden_size = 128 args.num_attention_heads = 8 args.max_position_embeddings = 256 @@ -677,10 +679,259 @@ def log(self, metrics, iteration): # Verify total_loss_dict is populated for i in 
range(num_layers): - assert f"mtp_{i+1} loss" in total_loss_dict - assert total_loss_dict[f"mtp_{i+1} loss"] == loss * loss_scale + assert f"mtp_{i + 1} loss" in total_loss_dict + assert total_loss_dict[f"mtp_{i + 1} loss"] == loss * loss_scale # Verify tracker is cleaned assert torch.all(MTPLossLoggingHelper.tracker["values"] == 0) assert MTPLossLoggingHelper.tracker["reduce_group"] is None assert MTPLossLoggingHelper.tracker["avg_group"] is None + + +class TestMultiTokenPredictionMamba: + """Test Multi-Token Prediction with Mamba hybrid models.""" + + def setup_method(self, method): + self.seq_length = 32 + self.micro_batch_size = 2 + os.environ['CUDA_DEVICE_MAX_CONNECTIONS'] = '1' + + def teardown_method(self, method): + Utils.destroy_model_parallel() + destroy_global_vars() + destroy_num_microbatches_calculator() + MTPLossLoggingHelper.tracker = {} + + def model_provider(self, pre_process=True, post_process=True, **config_kwargs): + """Model provider for Mamba hybrid models with MTP. + + Uses the unified pattern syntax where MTP is configured via hybrid_override_pattern: + Format: "<main_pattern>/<mtp_pattern>/<mtp_pattern>/..."
+ Example: "M*M*/M*/M*" = main decoder "M*M*", MTP pattern "M*" with 2 depths + """ + model_parallel_cuda_manual_seed(_SEED) + args = get_args() + config = core_transformer_config_from_args(args) + + # MTP is configured via unified pattern in hybrid_override_pattern + # MambaModel creates the MTP block internally based on the parsed pattern + model = MambaModel( + config=config, + mamba_stack_spec=mamba_stack_spec, + vocab_size=args.vocab_size, + max_sequence_length=args.max_position_embeddings, + pre_process=pre_process, + post_process=post_process, + hybrid_attention_ratio=args.hybrid_attention_ratio, + hybrid_mlp_ratio=args.hybrid_mlp_ratio, + hybrid_override_pattern=args.hybrid_override_pattern, + fp16_lm_cross_entropy=args.fp16_lm_cross_entropy, + parallel_output=True, + share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights, + position_embedding_type=args.position_embedding_type, + rotary_percent=args.rotary_percent, + ) + return model + + def create_test_args( + self, tp, cp, sequence_length, micro_batch_size, fp8=None, full_recompute=False + ): + destroy_global_vars() + destroy_num_microbatches_calculator() + + sys.argv = ['test_multi_token_prediction_mamba.py'] + args = parse_args() + args.num_layers = 4 + args.mtp_num_layers = 2 + args.mtp_loss_scaling_factor = 0.1 + args.vocab_size = 128800 + args.hidden_size = 128 + args.num_attention_heads = 8 + args.num_query_groups = 8 + args.mamba_num_groups = 4 + args.max_position_embeddings = 256 + args.micro_batch_size = micro_batch_size + args.create_attention_mask_in_dataloader = True + args.seq_length = sequence_length + args.tensor_model_parallel_size = tp + args.sequence_parallel = True if tp > 1 else False + args.context_parallel_size = cp + args.position_embedding_type = 'rope' + args.train_iters = 1 + args.ckpt_format = 'torch_dist' + args.lr = 3e-5 + args.attention_dropout = 0.0 + args.hidden_dropout = 0.0 + args.async_tensor_model_parallel_allreduce = False + args.no_save_optim 
= True + args.no_load_optim = True + args.no_load_rng = True + args.bf16 = True + args.hybrid_attention_ratio = 0.5 + args.hybrid_mlp_ratio = 0.0 + # Unified pattern: "main/mtp/mtp" - main decoder "M*M*", MTP pattern "M*" with 2 depths + args.hybrid_override_pattern = "M*M*/M*/M*" + args.spec = "megatron.core.models.mamba.mamba_layer_specs.mamba_stack_spec" + + if fp8 is not None: + args.fp8 = 'e4m3' + if full_recompute: + args.recompute_granularity = 'full' + args.recompute_method = 'uniform' + args.recompute_num_layers = 1 + else: + args.recompute_granularity = None + args.add_bias_linear = False + args.swiglu = True + + validate_args(args) + set_global_variables(args, False) + return args + + def get_batch(self, seq_length, micro_batch_size): + data = list(range(seq_length)) + input_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() + labels = 1 + torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() + position_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() + attention_mask = torch.ones( + (micro_batch_size, 1, seq_length, seq_length), dtype=bool + ).cuda() + loss_mask = torch.ones(seq_length).repeat((micro_batch_size, 1)).cuda() + batch = { + 'tokens': input_ids, + 'labels': labels, + 'loss_mask': loss_mask, + 'attention_mask': attention_mask, + 'position_ids': position_ids, + } + return batch + + @pytest.mark.skipif(not HAVE_TE, reason="transformer_engine not available") + @pytest.mark.parametrize(("tp", "cp"), [(1, 1), (2, 1)]) + def test_sharded_state_dict_mamba(self, tp, cp): + """Test MTP with Mamba hybrid model - sharded state dict.""" + args = self.create_test_args(tp, cp, self.seq_length, self.micro_batch_size) + set_args(args) + torch.manual_seed(_SEED) + Utils.initialize_model_parallel(tensor_model_parallel_size=tp, context_parallel_size=cp) + mamba_model = get_model(self.model_provider, ModelType.encoder_or_decoder) + mamba_model = unwrap_model(mamba_model) + 
sharded_state_dict = mamba_model[0].sharded_state_dict() + + # Verify MTP layers are in the state dict + for i in range(args.mtp_num_layers): + assert f"mtp.layers.{i}.enorm.weight" in sharded_state_dict.keys() + assert f"mtp.layers.{i}.hnorm.weight" in sharded_state_dict.keys() + assert f"mtp.layers.{i}.eh_proj.weight" in sharded_state_dict.keys() + + @pytest.mark.skipif(not HAVE_TE, reason="transformer_engine not available") + @pytest.mark.parametrize(("tp", "cp"), [(1, 1), (2, 1)]) + def test_forward_backward_mamba(self, tmp_path_dist_ckpt, tp, cp): + """Test MTP forward and backward with Mamba hybrid model.""" + tp_ref = 1 + cp_ref = 1 + args = self.create_test_args(tp_ref, cp_ref, self.seq_length, self.micro_batch_size) + set_args(args) + torch.manual_seed(_SEED) + Utils.initialize_model_parallel( + tensor_model_parallel_size=tp_ref, context_parallel_size=cp_ref + ) + batch = self.get_batch(self.seq_length, self.micro_batch_size) + tokens, labels, loss_mask, attention_mask, position_ids = batch.values() + + mamba_model_ref, optimizer, opt_param_scheduler = setup_model_and_optimizer( + self.model_provider, ModelType.encoder_or_decoder + ) + + output_ref = mamba_model_ref[0].forward( + input_ids=tokens, + position_ids=position_ids, + attention_mask=attention_mask, + labels=labels, + loss_mask=loss_mask, + ) + tracker = MTPLossLoggingHelper.tracker + mtp_loss_ref = None + assert "values" in tracker + mtp_loss_ref = tracker['values'].clone() + MTPLossLoggingHelper.clean_loss_in_tracker() + + iteration = 123 + num_floating_point_operations_so_far = 456 + + def set_ckpt_path(ckpt_path): + args.save = ckpt_path + args.load = ckpt_path + + with TempNamedDir(tmp_path_dist_ckpt / 'test_mtp_mamba_model_reconfiguration') as ckpt_dir: + set_ckpt_path(ckpt_dir) + save_checkpoint( + iteration, + mamba_model_ref, + optimizer, + opt_param_scheduler, + num_floating_point_operations_so_far, + ) + + expected_ckpt_path = args.save / "iter_0000123" / ".metadata" + assert 
os.path.exists(expected_ckpt_path) + + Utils.destroy_model_parallel() + args = self.create_test_args(tp, cp, self.seq_length, self.micro_batch_size) + set_args(args) + set_ckpt_path(ckpt_dir) + torch.manual_seed(_SEED) + Utils.initialize_model_parallel(tensor_model_parallel_size=tp, context_parallel_size=cp) + mamba_model, optimizer, opt_param_scheduler = setup_model_and_optimizer( + self.model_provider, ModelType.encoder_or_decoder + ) + load_checkpoint(mamba_model, optimizer, opt_param_scheduler, strict=False) + + batch["output_ref"] = output_ref + batch = get_batch_on_this_cp_rank(batch) + tokens, labels, loss_mask, attention_mask, position_ids, output_ref = batch.values() + output = mamba_model[0].forward( + input_ids=tokens, + position_ids=position_ids, + attention_mask=attention_mask, + labels=labels, + loss_mask=loss_mask, + ) + tracker = MTPLossLoggingHelper.tracker + assert "values" in tracker + mtp_loss = tracker['values'].clone() + pg_collection = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['cp']) + torch.distributed.all_reduce( + mtp_loss, group=pg_collection.cp, op=torch.distributed.ReduceOp.AVG + ) + MTPLossLoggingHelper.clean_loss_in_tracker() + assert torch.allclose(output_ref, output, rtol=1e-03, atol=1e-03) + assert torch.allclose(mtp_loss, mtp_loss_ref, rtol=1e-02, atol=1e-02) + + assert output.shape[0] == self.micro_batch_size + assert output.shape[1] == self.seq_length / cp + + loss = output.mean() + loss.backward() + for name, param in mamba_model[0].named_parameters(): + assert param.main_grad is not None + + @pytest.mark.skipif(not HAVE_TE, reason="transformer_engine not available") + def test_attention_mask_validation_mamba(self): + """Test that attention mask type validation works for Mamba hybrid models.""" + tp = 1 + cp = 1 + args = self.create_test_args(tp, cp, self.seq_length, self.micro_batch_size) + set_args(args) + torch.manual_seed(_SEED) + Utils.initialize_model_parallel(tensor_model_parallel_size=tp, 
context_parallel_size=cp) + try: + mamba_model = get_model(self.model_provider, ModelType.encoder_or_decoder) + mamba_model = unwrap_model(mamba_model) + assert isinstance(mamba_model[0], MambaModel) + assert mamba_model[0].mtp is not None + except AssertionError as e: + if "Multi-Token Prediction (MTP) is not yet supported" in str(e): + pytest.fail(f"Attention mask validation failed for Mamba hybrid model: {e}") + else: + raise From b043863b54fa55d5d4687ec5530e2489affeca61 Mon Sep 17 00:00:00 2001 From: Philip Petrakian Date: Wed, 4 Feb 2026 16:58:57 -0700 Subject: [PATCH 061/231] Fix uv install for GH actions (#3259) --- .github/actions/action.yml | 4 ++-- .github/workflows/oncall-rotation.yml | 5 ++++- .github/workflows/sync-team-usergroups.yml | 5 ++++- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/.github/actions/action.yml b/.github/actions/action.yml index 895b6863bef..088877304a7 100644 --- a/.github/actions/action.yml +++ b/.github/actions/action.yml @@ -117,7 +117,7 @@ runs: export PYTHONPATH=$(pwd) export NEMORUN_HOME=$(pwd) export NCCL_DEBUG=INFO - pip install --no-cache-dir "uv!=0.9.29" + pip install --no-cache-dir "uv<0.9.29" uv venv .venv uv cache clean uv sync --no-cache --only-group test @@ -199,7 +199,7 @@ runs: export PYTHONPATH=$(pwd) export NEMORUN_HOME=$(pwd) - pip install --no-cache-dir "uv!=0.9.29" + pip install --no-cache-dir "uv<0.9.29" uv venv .venv uv cache clean uv sync --no-cache --only-group test diff --git a/.github/workflows/oncall-rotation.yml b/.github/workflows/oncall-rotation.yml index 46a45810ad1..71ae094e6c8 100644 --- a/.github/workflows/oncall-rotation.yml +++ b/.github/workflows/oncall-rotation.yml @@ -45,7 +45,10 @@ jobs: # Slack token for updating the Slack usergroup SLACK_TOKEN: ${{ secrets.ONCALL_SLACK_TOKEN }} run: | - pip install --no-cache-dir uv + pip install --no-cache-dir "uv<0.9.29" + uv venv .venv + uv cache clean + uv sync --no-cache uv run --with slack-sdk python 
.github/scripts/oncall_manager.py rotate - name: Commit and Push changes diff --git a/.github/workflows/sync-team-usergroups.yml b/.github/workflows/sync-team-usergroups.yml index 8b08182dceb..1c6cecaeb7a 100644 --- a/.github/workflows/sync-team-usergroups.yml +++ b/.github/workflows/sync-team-usergroups.yml @@ -35,5 +35,8 @@ jobs: GH_TOKEN: ${{ secrets.NVIDIA_MCORE_ONCALL_TOKEN || secrets.PAT || secrets.GITHUB_TOKEN }} SLACK_TOKEN: ${{ secrets.ONCALL_SLACK_TOKEN }} run: | - pip install --no-cache-dir uv + pip install --no-cache-dir "uv<0.9.29" + uv venv .venv + uv cache clean + uv sync --no-cache uv run --with slack-sdk python .github/scripts/sync_team_usergroups.py From dd7d14178028eb93954812ca44b1faa44161fb65 Mon Sep 17 00:00:00 2001 From: janEbert Date: Thu, 5 Feb 2026 01:06:56 +0000 Subject: [PATCH 062/231] Update the project structure in README (#3251) Co-authored-by: Xin Yao --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 3551e74762c..f9bb1412c31 100644 --- a/README.md +++ b/README.md @@ -77,12 +77,12 @@ Megatron-LM/ │ │ ├── distributed/ # Distributed training (FSDP, DDP) │ │ ├── optimizer/ # Optimizers │ │ ├── datasets/ # Dataset loaders -│ │ ├── inference/ # Inference engines +│ │ ├── inference/ # Inference engines and server │ │ └── export/ # Model export (e.g. TensorRT-LLM) │ ├── training/ # Training scripts -│ ├── inference/ # Inference server │ ├── legacy/ # Legacy components -│ └── post_training/ # Post-training (RLHF, etc.) +│ ├── post_training/ # Post-training (quantization, distillation, pruning, etc.) +│ └── rl/ # Reinforcement learning (RLHF, etc.) 
├── examples/ # Ready-to-use training examples ├── tools/ # Utility tools ├── tests/ # Comprehensive test suite From 1f6d8c22403d105c90b40d1f4240dee76848b2f7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 5 Feb 2026 01:39:07 +0000 Subject: [PATCH 063/231] chore: rotate oncall schedule --- .github/oncall_schedule.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/oncall_schedule.json b/.github/oncall_schedule.json index 5fa49e966bc..3284549a296 100644 --- a/.github/oncall_schedule.json +++ b/.github/oncall_schedule.json @@ -1,8 +1,4 @@ [ - { - "user": "dimapihtar", - "date": "2026-01-28" - }, { "user": "gautham-kollu", "date": "2026-02-04" @@ -46,5 +42,9 @@ { "user": "BoxiangW", "date": "2026-04-15" + }, + { + "user": "Phlip79", + "date": "2026-04-22" } ] From 1b110768dc0d890a61cc8416b1f8f02e42930111 Mon Sep 17 00:00:00 2001 From: Li Tao Date: Thu, 5 Feb 2026 12:24:59 +0800 Subject: [PATCH 064/231] Cherry-pick: Fix mtp_num_layers and clip_qk issues (#2581, #2776) (#3075) Co-authored-by: rj42 Co-authored-by: Xin Yao Co-authored-by: Juntao Wang --- megatron/core/models/gpt/gpt_model.py | 2 +- megatron/core/optimizer/qk_clip.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index cbd1985002d..b0d3f085240 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -608,7 +608,7 @@ def _postprocess( if not self.post_process: return hidden_states - if self.config.mtp_num_layers is not None: + if self.config.mtp_num_layers: hidden_states = process_mtp_loss( hidden_states=hidden_states, labels=labels, diff --git a/megatron/core/optimizer/qk_clip.py b/megatron/core/optimizer/qk_clip.py index 72127f94712..26b5787cd50 100644 --- a/megatron/core/optimizer/qk_clip.py +++ b/megatron/core/optimizer/qk_clip.py @@ -22,6 +22,11 @@ def clip_qk(model, log_max_only=False) -> float: for model_chunk 
in model: for transformer_layer in model_chunk.module.module.decoder.layers: if hasattr(transformer_layer.self_attention, 'clip_qk'): + if ( + transformer_layer.self_attention.core_attention.current_max_attn_logits + is None + ): + continue torch.distributed.all_reduce( transformer_layer.self_attention.core_attention.current_max_attn_logits, op=torch.distributed.ReduceOp.MAX, From 111a2a060418dbf94ed00c13fab4df123d15f85a Mon Sep 17 00:00:00 2001 From: mathemakitten Date: Thu, 5 Feb 2026 09:26:47 -0500 Subject: [PATCH 065/231] RL: training cudagraphs functional test (#3235) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: oliver könig --- .../env_config.yaml | 5 + .../golden_values_dev_dgx_h100.json | 83 ++++++++++++++ .../model_config.yaml | 103 ++++++++++++++++++ tests/test_utils/recipes/h100/gpt-grpo.yaml | 5 + 4 files changed, 196 insertions(+) create mode 100644 tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_cudagraphs_throughput/env_config.yaml create mode 100644 tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_cudagraphs_throughput/golden_values_dev_dgx_h100.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_cudagraphs_throughput/model_config.yaml diff --git a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_cudagraphs_throughput/env_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_cudagraphs_throughput/env_config.yaml new file mode 100644 index 00000000000..329246987bf --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_cudagraphs_throughput/env_config.yaml @@ -0,0 +1,5 @@ +- agent_type: examples.rl.environments.countdown.countdown_agent.CountdownAgent + agent_args: + dataset_file: "/mnt/artifacts/rl_environments/Jiayi-Pan___countdown-tasks-3to4" + split: "train" + weight: 1.0 diff --git 
a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_cudagraphs_throughput/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_cudagraphs_throughput/golden_values_dev_dgx_h100.json new file mode 100644 index 00000000000..42c13292446 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_cudagraphs_throughput/golden_values_dev_dgx_h100.json @@ -0,0 +1,83 @@ +{ + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 20, + "step_interval": 1, + "values": { + "1": 60922068992.0, + "2": 60922068992.0, + "3": 60922331136.0, + "4": 60922073088.0, + "5": 60922331136.0, + "6": 60922064896.0, + "7": 60922331136.0, + "8": 60922064896.0, + "9": 60922322944.0, + "10": 60922052608.0, + "11": 60922056704.0, + "12": 60922318848.0, + "13": 60922056704.0, + "14": 60922318848.0, + "15": 60922056704.0, + "16": 60922310656.0, + "17": 60922052608.0, + "18": 60922052608.0, + "19": 60922048512.0, + "20": 60922044416.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 20, + "step_interval": 1, + "values": { + "1": 60922073088.0, + "2": 64156037120.0, + "3": 64156037120.0, + "4": 64156041216.0, + "5": 64156041216.0, + "6": 64156041216.0, + "7": 64156041216.0, + "8": 64156041216.0, + "9": 64156041216.0, + "10": 64156041216.0, + "11": 64156041216.0, + "12": 64156041216.0, + "13": 64156041216.0, + "14": 64156041216.0, + "15": 64156041216.0, + "16": 64156041216.0, + "17": 64156041216.0, + "18": 64156041216.0, + "19": 64156041216.0, + "20": 64156041216.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 20, + "step_interval": 1, + "values": { + "1": "nan", + "2": 60.37194, + "3": 13.25967, + "4": 13.01461, + "5": 14.04256, + "6": 13.53259, + "7": 13.3335, + "8": 12.72344, + "9": 13.64787, + "10": 12.66485, + "11": 13.15779, + "12": 13.01275, + "13": 12.72481, + "14": 12.67697, + "15": 12.7286, + "16": 12.65032, + "17": 12.86279, + "18": 12.71745, + "19": 13.4137, + "20": 
12.75566 + } + } +} diff --git a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_cudagraphs_throughput/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_cudagraphs_throughput/model_config.yaml new file mode 100644 index 00000000000..46b0474056f --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_cudagraphs_throughput/model_config.yaml @@ -0,0 +1,103 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + THROUGHPUT_START_STEP: 10 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Ring + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEAT: 1 +TEST_TYPE: frozen-start +MODE: rl +MODEL_ARGS: + --tensor-model-parallel-size: 4 + --inference-dynamic-batching-num-cuda-graphs: 1 + --inference-dynamic-batching-unified-memory-level: 1 + --inference-dynamic-batching-buffer-size-gb: 20 + --ckpt-format: torch_dist + --seq-length: 1024 + --inference-max-seq-length: 1024 + --load: ${CHECKPOINT_LOAD_PATH}/model/qwen3-8b-dist + --untie-embeddings-and-output-weights: true + --num-layers: 36 + --hidden-size: 4096 + --ffn-hidden-size: 12288 + --num-attention-heads: 32 + --kv-channels: 128 + --max-position-embeddings: 1024 + --group-query-attention: true + --num-query-groups: 8 + --normalization: RMSNorm + --norm-epsilon: 0.000001 + --qk-layernorm: true + --position-embedding-type: rope + --rotary-percent: 1.0 + --rotary-base: 1000000 + --use-rotary-position-embeddings: true + --swiglu: true + --disable-bias-linear: true + --attention-dropout: 0.0 + --hidden-dropout: 0.0 + --no-masked-softmax-fusion: true + --attention-softmax-in-fp32: true + --tokenizer-type: HuggingFaceTokenizer + --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/qwen3-8b-dist/tokenizer + --langrl-inference-server-type: inplace_megatron_chat + --langrl-inference-server-conversation-template: ${CHECKPOINT_LOAD_PATH}/model/qwen3-8b-dist/tokenizer + --vocab-size: 151936 + --make-vocab-size-divisible-by: 128 + --optimizer: adam + --adam-beta1: 0.9 + --adam-beta2: 
0.999 + --adam-eps: 0.00000001 + --lr: 0.000001 + --min-lr: 0.0000001 + --lr-warmup-samples: 0 + --clip-grad: 1.0 + --weight-decay: 0.01 + --deterministic-mode: true + --use-mcore-models: true + --bf16: true + --log-memory-to-tensorboard: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --timing-log-option: minmax + --log-throughput: true + --no-create-attention-mask-in-dataloader: true + --straggler-minmax-count: 16 + --tensorboard-log-interval: 1 + --log-interval: 1 + --log-progress: true + --empty-unused-memory-level: 2 + --seed: 42 + --calculate-per-token-loss: true + --rl-use-sequence-packing: true + --rl-sequence-packing-algo: fifo + --rl-offload-optimizer-during-inference: true + --timing-log-level: 1 + --cuda-graph-impl: local + --micro-batch-size: 1 + --global-batch-size: 4 + --grpo-group-size: 2 + --grpo-prompts-per-step: 2 + --grpo-iterations: 1 + --grpo-clamp-eps-lower: 0.2 + --grpo-clamp-eps-upper: 0.2 + --grpo-kl-beta: 0.0 + --grpo-entropy-term-weight: 0.0 + --langrl-env-config: tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput/env_config.yaml + --rl-partial-rollouts: true + --use-checkpoint-args: true + --dist-ckpt-strictness: log_unexpected + --perform-rl-step: true + --train-samples: 48828125 + --exit-interval: 20 + --tensorboard-dir: ${TENSORBOARD_PATH} + --save-interval: 1000000 + --eval-interval: 1000000 + --finetune: true + --inference-logging-step-interval: 1 + --rl-training-cuda-graphs: true +METRICS: + - "mem-allocated-bytes" + - "mem-max-allocated-bytes" + - "iteration-time" diff --git a/tests/test_utils/recipes/h100/gpt-grpo.yaml b/tests/test_utils/recipes/h100/gpt-grpo.yaml index faaccee73dd..500c19b4f6e 100644 --- a/tests/test_utils/recipes/h100/gpt-grpo.yaml +++ b/tests/test_utils/recipes/h100/gpt-grpo.yaml @@ -59,6 +59,11 @@ products: - environment: [dev] scope: [mr] platforms: [dgx_h100] + - test_case: 
[gpt_grpo_tp4_pp1_dp2_8b_cudagraphs_throughput] + products: + - environment: [dev] + scope: [mr] + platforms: [dgx_h100] - test_case: [gpt_grpo_tp4_pp1_dp2_8b_throughput_github] products: - environment: [dev] From 1934391a9578f7108be7bda63f73887dfb6f0dfa Mon Sep 17 00:00:00 2001 From: Pingtian Li <158665726+Wohox@users.noreply.github.com> Date: Thu, 5 Feb 2026 23:06:50 +0800 Subject: [PATCH 066/231] [Main] fix cg missing wgrad hook (#3074) Co-authored-by: Philip Petrakian --- megatron/training/arguments.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index e4ac9c3f15c..c057e55597d 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -1291,6 +1291,25 @@ def validate_args(args, defaults={}): assert is_te_min_version("2.8.0"), ( "overlap_grad_reduce is only supported with TE >= 2.8.0 when enabling delay_wgrad_compute" ) + wgrad_in_graph_scope = CudaGraphScope.attn in args.cuda_graph_scope or ( + CudaGraphScope.moe_router in args.cuda_graph_scope + and args.moe_shared_expert_intermediate_size is not None + and not args.moe_shared_expert_overlap + ) + if wgrad_in_graph_scope: + assert is_te_min_version( + "2.12.0" + ), "CUDA graph with delay_wgrad_compute requires TE version >= 2.12.0." + assert args.gradient_accumulation_fusion, ( + 'CUDA graph with delay_wgrad_compute requires gradient_accumulation_fusion ' + 'to be enabled. This is because the default gradient accumulation does not ' + 'use static memory addresses, which breaks CUDA graph requirements.' + ) + if CudaGraphScope.attn in args.cuda_graph_scope: + assert ( + not args.add_bias_linear and not args.add_qkv_bias + ), "CUDA graph with delay_wgrad_compute doesn't support attn bias for now." 
+ if not args.gradient_accumulation_fusion: assert is_te_min_version("2.7.0"), ( "disabling gradient_accumulation_fusion is only supported with TE >= 2.7.0 " From 801f12f7d6d69c6efa6259e99c7a1c7143c4b7c6 Mon Sep 17 00:00:00 2001 From: Nick Schank Date: Thu, 5 Feb 2026 16:47:15 -0500 Subject: [PATCH 067/231] Avoid .cuda call on meta device in LanguageModel (#3202) Co-authored-by: Chris Grimm Co-authored-by: Xin Yao --- megatron/core/models/common/language_module/language_module.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/core/models/common/language_module/language_module.py b/megatron/core/models/common/language_module/language_module.py index 57975b2958b..b5afd06d6c8 100644 --- a/megatron/core/models/common/language_module/language_module.py +++ b/megatron/core/models/common/language_module/language_module.py @@ -240,7 +240,7 @@ def setup_embeddings_and_output_layer(self) -> None: # Ensure that first and last stages have the same initial parameter # values. 
if torch.distributed.is_initialized(): - if self._is_in_embd_group(): + if self._is_in_embd_group() and not self.config.init_model_with_meta_device: weight = self.shared_embedding_or_output_weight() weight.data = weight.data.cuda() torch.distributed.all_reduce(weight.data, group=self.embd_group) From 3c0a4f3410a2b94368fcfa340553a2f683a9db57 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Fri, 6 Feb 2026 00:13:27 +0000 Subject: [PATCH 068/231] Update copy-pr-bot.yaml [skip ci] --- .github/copy-pr-bot.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml index 8998eabe3a2..0f6915cc542 100644 --- a/.github/copy-pr-bot.yaml +++ b/.github/copy-pr-bot.yaml @@ -1,4 +1,4 @@ enabled: true auto_sync_draft: false auto_sync_ready: true -trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "Phlip79", "QiZhangNV", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "ahmadki", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cspades", "cuichenx", "deepakn94", "dimapihtar", "dingqingy-nv", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "frsun-nvda", "gautham-kollu", "gdengk", "guyueh1", "hxbai", "ilml", "jalbericiola", "janEbert", "jaredcasper", "jenchen13", "jiemingz", "jingqiny-99", "jkamalu", "jon-barker", "jstjohn", "kanz-nv", "kevalmorabia97", "ko3n1g", "kunlunl", "kvareddy", "kwyss-nvidia", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mehraakash", "mkhona-nvidia", "parthmannan", "prajwal1210", "pthombre", "rogerwaleffe", "sanandaraj5597", "sancha", "santhnm2", "sbak5", "shanmugamr1992", "shengf-nv", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "sudhakarsingh27", "tdene", "theothermike", "thomasdhc", "trintamaki", "tylerpoon", "wdykas", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", 
"youngeunkwon0405", "yuzhongw-nvidia", "zhongbozhu"] +trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "Phlip79", "QiZhangNV", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "ahmadki", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cspades", "cuichenx", "deepakn94", "dimapihtar", "dingqingy-nv", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "frsun-nvda", "gautham-kollu", "gdengk", "guyueh1", "hxbai", "ilml", "jalbericiola", "janEbert", "jaredcasper", "jenchen13", "jiemingz", "jingqiny-99", "jkamalu", "jon-barker", "jstjohn", "kanz-nv", "kevalmorabia97", "ko3n1g", "kunlunl", "kvareddy", "kwyss-nvidia", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mehraakash", "mkhona-nvidia", "parthmannan", "prajwal1210", "pthombre", "rogerwaleffe", "sanandaraj5597", "sancha", "santhnm2", "sbak5", "shanmugamr1992", "sharathts", "shengf-nv", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "sudhakarsingh27", "tdene", "theothermike", "thomasdhc", "trintamaki", "tylerpoon", "wdykas", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yuzhongw-nvidia", "zhongbozhu"] From 0434f87427407d33292a3721983b83485c9f5ff9 Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Fri, 6 Feb 2026 00:10:48 +0200 Subject: [PATCH 069/231] fix checkpointing error message (#3203) Signed-off-by: dimapihtar --- .../core/dist_checkpointing/validation.py | 35 +++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/megatron/core/dist_checkpointing/validation.py b/megatron/core/dist_checkpointing/validation.py index 48f2bda8737..0e5d6a011a8 100644 --- a/megatron/core/dist_checkpointing/validation.py +++ b/megatron/core/dist_checkpointing/validation.py @@ -433,19 +433,25 @@ def 
validate_sharding_integrity( for rank, rank_shardings in enumerate(global_metadata): for sharding in rank_shardings: key_shardings[sharding.key].append((rank, sharding)) + errors = [] for key, shardings in key_shardings.items(): if isinstance(shardings[0][1], ShardedObject): - _validate_objects_for_key(shardings) + errors.extend(_validate_objects_for_key(shardings)) else: - _validate_sharding_for_key(shardings) + errors.extend(_validate_sharding_for_key(shardings)) + if errors: + errors = '\n'.join(str(e) for e in errors) + raise CheckpointingException(f'Invalid sharding pattern validation. Errors: {errors}') -def _validate_sharding_for_key(rank_sharding: List[Tuple[int, ShardedTensor]]): + +def _validate_sharding_for_key( + rank_sharding: List[Tuple[int, ShardedTensor]] +) -> List[CheckpointingException]: some_rank_shard = rank_sharding[0][1] global_shape = some_rank_shard.global_shape local_shape = some_rank_shard.local_shape dtype = some_rank_shard.dtype - has_flattened_range = some_rank_shard.flattened_range is not None has_regular_sharding_grid = some_rank_shard.has_regular_grid for rank, sharding in rank_sharding: assert sharding.dtype == dtype, (sharding.dtype, dtype, some_rank_shard) @@ -465,16 +471,21 @@ def _validate_sharding_for_key(rank_sharding: List[Tuple[int, ShardedTensor]]): some_rank_shard, ) + errors = [] if not has_regular_sharding_grid: # In case of uneven sharding we defer the validation to DCP - return + return errors shard_access_cnt = _compute_shards_access(rank_sharding) if not torch.all(shard_access_cnt == 1): - raise CheckpointingException( - f"Invalid access pattern for {rank_sharding[0][1]}: {shard_access_cnt}" + errors.append( + CheckpointingException( + f'Invalid access pattern for {rank_sharding[0][1]}: {shard_access_cnt}' + ) ) + return errors + def _compute_shards_access(rank_sharding): shard_access_cnt = torch.zeros( @@ -486,20 +497,24 @@ def _compute_shards_access(rank_sharding): return shard_access_cnt -def 
_validate_objects_for_key(sharded_objects: List[ShardedObject]): +def _validate_objects_for_key(sharded_objects: List[ShardedObject]) -> List[CheckpointingException]: """Ensure uniqueness of saved objects.""" unique_keys = [ sh_obj.unique_key for _, sh_obj in sharded_objects if is_main_replica(sh_obj.replica_id) ] + errors = [] if len(unique_keys) != len(set(unique_keys)): duplicates = {k: cnt for k, cnt in Counter(unique_keys).items() if cnt > 1} logger.error(f"Duplicate ShardedObject keys and counts: {duplicates}") - raise CheckpointingException(f"Duplicate ShardedObject keys: {list(duplicates.keys())}") + errors.append( + CheckpointingException(f'Duplicate ShardedObject keys: {list(duplicates.keys())}') + ) expected_shard_num = np.prod(sharded_objects[0][1].global_shape) if len(unique_keys) != expected_shard_num: err_msg = f"Invalid access pattern: {expected_shard_num - len(unique_keys)} ShardedObject are missing." logger.error(f"{err_msg} Existing shards: {unique_keys}") - raise CheckpointingException(err_msg) + errors.append(CheckpointingException(err_msg)) + return errors def determine_global_metadata( From 347ad215a8ca2f46c9a599666b03465c475bf4eb Mon Sep 17 00:00:00 2001 From: "Chenhan D. 
Yu" <5185878+ChenhanYu@users.noreply.github.com> Date: Thu, 5 Feb 2026 15:11:40 -0800 Subject: [PATCH 070/231] Nano QAT/D fix with sft tokenizer and datasets (#3254) --- examples/post_training/modelopt/train.sh | 93 +++++++++++++++++++ .../core/distributed/finalize_model_grads.py | 6 +- megatron/post_training/model_builder.py | 9 +- megatron/training/tokenizer/sft_tokenizer.py | 26 +++++- 4 files changed, 125 insertions(+), 9 deletions(-) create mode 100755 examples/post_training/modelopt/train.sh diff --git a/examples/post_training/modelopt/train.sh b/examples/post_training/modelopt/train.sh new file mode 100755 index 00000000000..1ebb8bf3d76 --- /dev/null +++ b/examples/post_training/modelopt/train.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" + +# Common arguments and base model specific arguments +source "${SCRIPT_DIR}/conf/arguments.sh" + + +# Set up cache dir for HF to avoid out of space error +export HF_DATASETS_CACHE="/tmp/hf_datasets_cache" + +# Extra arguments of this script +MLM_DEFAULT_ARGS=" \ + --modelopt-enabled \ + --distributed-timeout-minutes 60 \ + --auto-detect-ckpt-format \ + --export-te-mcore-model \ +" + + +if [ -z ${MLM_MODEL_SAVE} ]; then + MLM_MODEL_SAVE=${MLM_MODEL_CKPT} + printf "${MLM_WARNING} Variable ${PURPLE}MLM_MODEL_SAVE${WHITE} is not set (default: ${MLM_MODEL_CKPT})!\n" +fi + +if [ -z ${MLM_DATA_ARGS} ]; then + MLM_DATA_ARGS=" \ + --train-samples 128000 \ + --lr-decay-samples 128000 \ + --lr-warmup-samples 0 \ + --sft \ + --tokenizer-type SFTTokenizer \ + --per-split-data-args-path ${BLEND_PATH} \ + " +fi + +if [ -z ${MLM_TRAIN_ARGS} ]; then + MLM_TRAIN_ARGS=" \ + --no-gradient-accumulation-fusion \ + --micro-batch-size 1 \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --no-check-for-nan-in-loss-and-grad \ + " +fi + +if [ -z ${MLM_OPTIM_ARGS} ]; then + MLM_OPTIM_ARGS=" \ + --lr 5.0e-5 \ + --min-lr 1.0e-7 \ + --lr-decay-style cosine \ + --clip-grad 1.0 \ + --weight-decay 0.0 \ + 
--adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --init-method-std 0.010 \ + --use-distributed-optimizer \ + " +fi + +if [ -z ${MLM_EVAL_ARGS} ]; then + MLM_EVAL_ARGS=" \ + --eval-iters 1 \ + --eval-interval 1000 \ + --save-interval 1000 \ + --log-interval 100 \ + " +fi + +export HF_TOKEN=${HF_TOKEN} + +if [[ ${MODEL_ARGS} == *"MambaModel"* ]]; then + PRETRAIN_EXE=${SCRIPT_DIR}/../../../pretrain_mamba.py +else + PRETRAIN_EXE=${SCRIPT_DIR}/../../../pretrain_gpt.py +fi + +${LAUNCH_SCRIPT} ${PRETRAIN_EXE} \ + ${MODEL_ARGS} \ + --tensor-model-parallel-size ${TP} \ + --expert-tensor-parallel-size ${ETP} \ + --expert-model-parallel-size ${EP} \ + --pipeline-model-parallel-size ${PP} \ + --context-parallel-size ${CP} \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --load ${MLM_MODEL_CKPT} \ + --save ${MLM_MODEL_SAVE} \ + ${MLM_DATA_ARGS} \ + ${MLM_OPTIM_ARGS} \ + ${MLM_TRAIN_ARGS} \ + ${MLM_EVAL_ARGS} \ + ${MLM_RESUME_ARGS} \ + ${MLM_DEFAULT_ARGS} ${MLM_EXTRA_ARGS} diff --git a/megatron/core/distributed/finalize_model_grads.py b/megatron/core/distributed/finalize_model_grads.py index a52592bb269..ca6bdd354ce 100644 --- a/megatron/core/distributed/finalize_model_grads.py +++ b/megatron/core/distributed/finalize_model_grads.py @@ -299,7 +299,11 @@ def _update_router_expert_bias(model: List[torch.nn.Module], config: Transformer expert_bias_list = [] for model_chunk in model: for module in get_attr_wrapped_model(model_chunk, 'modules')(): - if hasattr(module, 'expert_bias'): + # Only update expert_bias if this module is in the training mode. There are special + # cases where only the student is in training mode but the teacher is in eval mode + # when using online knoweldge-distillation with Model-Optimizer. In this case, we want + # to avoid updating teacher's expert_bias. 
+ if hasattr(module, 'expert_bias') and module.training: tokens_per_expert_list.append(module.local_tokens_per_expert) expert_bias_list.append(module.expert_bias) # For hybrid models with both MoE and Dense layers, this list can be empty. diff --git a/megatron/post_training/model_builder.py b/megatron/post_training/model_builder.py index fea837c96c3..56e72bd925b 100644 --- a/megatron/post_training/model_builder.py +++ b/megatron/post_training/model_builder.py @@ -135,7 +135,7 @@ def _load_teacher_model(config, config_raw: Namespace, model_kwargs: Dict[str, A local_core_attention=False if config.context_parallel_size > 1 else args.export_force_local_attention, remap_te_layernorm=args.export_te_mcore_model, real_quant_cfg=args.export_real_quant_cfg, - use_arbitrary_attention_mask=False if config.context_parallel_size > 1 else True, + use_arbitrary_attention_mask=False, ) teacher = MCoreGPTModel(config=config, **model_kwargs) _add_load_convert_hooks(teacher) @@ -226,21 +226,18 @@ def modelopt_gpt_mamba_builder( use_te=args.transformer_impl == "transformer_engine", ) else: - local_core_attention=args.export_force_local_attention if config.context_parallel_size > 1: print_rank_0("context_parallel_size > 1! Force using TEDotProductAttention!") local_core_attention=False - print_rank_0("context_parallel_size > 1! Force attention_mask_type to Causal. 
This can be wrong for EAGLE training!") - use_arbitrary_attention_mask = False else: - use_arbitrary_attention_mask = True + local_core_attention=args.export_force_local_attention transformer_layer_spec = get_gpt_modelopt_spec( config=config, local_core_attention=local_core_attention, remap_te_layernorm=args.export_te_mcore_model, real_quant_cfg=args.export_real_quant_cfg, - use_arbitrary_attention_mask=use_arbitrary_attention_mask, + use_arbitrary_attention_mask=False, ) model_kwargs = { diff --git a/megatron/training/tokenizer/sft_tokenizer.py b/megatron/training/tokenizer/sft_tokenizer.py index 274c6f6c944..d4dcb2aecbf 100644 --- a/megatron/training/tokenizer/sft_tokenizer.py +++ b/megatron/training/tokenizer/sft_tokenizer.py @@ -6,6 +6,7 @@ nemotron_h_aligned_custom_template = """{% for message in messages %}{% if message['role'] == 'system' %}{{ 'System\n' + message['content'].strip() + '\n' }}{% elif message['role'] == 'user' %}{{ 'User\n' + message['content'].strip() + '\n' + 'Assistant\n' }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() + '\n' }}{% endif %}{% endfor %}""" nemotron_nano_v2_custom_template = """{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'system' %}{{ 'System\n' + content.replace('/think', '').replace('/no_think', '').strip() + '\n' }}{% elif message['role'] == 'user' %}{{ 'User\n' + content.replace('/think', '').replace('/no_think', '').strip() + '\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant\n' + content.strip() + '\n\n' }}{% endif %}{% endfor %}""" +identity_template = """{% for message in messages %}{{ message['content'] }}{% endfor %}""" from megatron.core.datasets.megatron_tokenizer import MegatronLegacyTokenizer from megatron.training.datasets.sft_dataset import IGNORE_INDEX @@ -58,6 +59,22 @@ def __init__( has_bos=False, has_system_role=True, ) + elif prompt_format == "identity": + self._prompt_config = PromptConfig( + assistant_prefix_len=0, + 
pad_token_id=tokenizer.convert_tokens_to_ids(""), + custom_chat_template=identity_template, + has_bos=False, + has_system_role=True, + ) + elif prompt_format == "default": + self._prompt_config = PromptConfig( + assistant_prefix_len=0, + pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id, + custom_chat_template=tokenizer.chat_template, + has_bos=tokenizer.bos_token_id is not None, + has_system_role=True, + ) else: raise NotImplementedError("unknown SFT prompt format", prompt_format) @@ -98,6 +115,11 @@ def tokenize_conversation( target = tokens.copy() + # When using the default prompt format, we do not replace any tokens with IGNORE_INDEX. + # Instead, all token losses will be used for simplicity. + if self._prompt_format == "default": + return tokens, target + # Mask system and user tokens in the target. idx = 0 for turn_idx, turn in enumerate(conversation): @@ -105,7 +127,7 @@ def tokenize_conversation( if turn["role"].lower() == "assistant" and len(turn["content"]) == 0: raise ValueError(f"empty assistant turn in conversation: {conversation}.") if turn["role"].lower() == "assistant": - assert conversation[turn_idx-1]["role"].lower() == "user" + assert conversation[turn_idx-1]["role"].lower() in ("user", "tool") turn_tokens = self._tokenizer.apply_chat_template( [turn], tokenize=True, chat_template=self._prompt_config.custom_chat_template @@ -118,7 +140,7 @@ def tokenize_conversation( turn_len = len(turn_tokens) role = turn["role"].lower() - if role in ("system", "user"): + if role in ("system", "user", "tool"): target[idx : idx + turn_len] = IGNORE_INDEX elif role == "assistant": if self._prompt_config.assistant_prefix_len > 0: From 8379d4318d3cad2d87ae5adeb17cd2bc52d5ef16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 6 Feb 2026 11:51:35 +0100 Subject: [PATCH 071/231] Revert "fix checkpointing error message (#3203)" (#3283) --- .../core/dist_checkpointing/validation.py | 35 
++++++------------- 1 file changed, 10 insertions(+), 25 deletions(-) diff --git a/megatron/core/dist_checkpointing/validation.py b/megatron/core/dist_checkpointing/validation.py index 0e5d6a011a8..48f2bda8737 100644 --- a/megatron/core/dist_checkpointing/validation.py +++ b/megatron/core/dist_checkpointing/validation.py @@ -433,25 +433,19 @@ def validate_sharding_integrity( for rank, rank_shardings in enumerate(global_metadata): for sharding in rank_shardings: key_shardings[sharding.key].append((rank, sharding)) - errors = [] for key, shardings in key_shardings.items(): if isinstance(shardings[0][1], ShardedObject): - errors.extend(_validate_objects_for_key(shardings)) + _validate_objects_for_key(shardings) else: - errors.extend(_validate_sharding_for_key(shardings)) + _validate_sharding_for_key(shardings) - if errors: - errors = '\n'.join(str(e) for e in errors) - raise CheckpointingException(f'Invalid sharding pattern validation. Errors: {errors}') - -def _validate_sharding_for_key( - rank_sharding: List[Tuple[int, ShardedTensor]] -) -> List[CheckpointingException]: +def _validate_sharding_for_key(rank_sharding: List[Tuple[int, ShardedTensor]]): some_rank_shard = rank_sharding[0][1] global_shape = some_rank_shard.global_shape local_shape = some_rank_shard.local_shape dtype = some_rank_shard.dtype + has_flattened_range = some_rank_shard.flattened_range is not None has_regular_sharding_grid = some_rank_shard.has_regular_grid for rank, sharding in rank_sharding: assert sharding.dtype == dtype, (sharding.dtype, dtype, some_rank_shard) @@ -471,21 +465,16 @@ def _validate_sharding_for_key( some_rank_shard, ) - errors = [] if not has_regular_sharding_grid: # In case of uneven sharding we defer the validation to DCP - return errors + return shard_access_cnt = _compute_shards_access(rank_sharding) if not torch.all(shard_access_cnt == 1): - errors.append( - CheckpointingException( - f'Invalid access pattern for {rank_sharding[0][1]}: {shard_access_cnt}' - ) + raise 
CheckpointingException( + f"Invalid access pattern for {rank_sharding[0][1]}: {shard_access_cnt}" ) - return errors - def _compute_shards_access(rank_sharding): shard_access_cnt = torch.zeros( @@ -497,24 +486,20 @@ def _compute_shards_access(rank_sharding): return shard_access_cnt -def _validate_objects_for_key(sharded_objects: List[ShardedObject]) -> List[CheckpointingException]: +def _validate_objects_for_key(sharded_objects: List[ShardedObject]): """Ensure uniqueness of saved objects.""" unique_keys = [ sh_obj.unique_key for _, sh_obj in sharded_objects if is_main_replica(sh_obj.replica_id) ] - errors = [] if len(unique_keys) != len(set(unique_keys)): duplicates = {k: cnt for k, cnt in Counter(unique_keys).items() if cnt > 1} logger.error(f"Duplicate ShardedObject keys and counts: {duplicates}") - errors.append( - CheckpointingException(f'Duplicate ShardedObject keys: {list(duplicates.keys())}') - ) + raise CheckpointingException(f"Duplicate ShardedObject keys: {list(duplicates.keys())}") expected_shard_num = np.prod(sharded_objects[0][1].global_shape) if len(unique_keys) != expected_shard_num: err_msg = f"Invalid access pattern: {expected_shard_num - len(unique_keys)} ShardedObject are missing." 
logger.error(f"{err_msg} Existing shards: {unique_keys}") - errors.append(CheckpointingException(err_msg)) - return errors + raise CheckpointingException(err_msg) def determine_global_metadata( From e2e5a6a11bcc517ebd9490f0df32686aaee19d78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 6 Feb 2026 12:30:25 +0100 Subject: [PATCH 072/231] Reapply "fix checkpointing error message (#3203)" (#3283) (#3285) --- .../core/dist_checkpointing/validation.py | 35 +++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/megatron/core/dist_checkpointing/validation.py b/megatron/core/dist_checkpointing/validation.py index 48f2bda8737..0e5d6a011a8 100644 --- a/megatron/core/dist_checkpointing/validation.py +++ b/megatron/core/dist_checkpointing/validation.py @@ -433,19 +433,25 @@ def validate_sharding_integrity( for rank, rank_shardings in enumerate(global_metadata): for sharding in rank_shardings: key_shardings[sharding.key].append((rank, sharding)) + errors = [] for key, shardings in key_shardings.items(): if isinstance(shardings[0][1], ShardedObject): - _validate_objects_for_key(shardings) + errors.extend(_validate_objects_for_key(shardings)) else: - _validate_sharding_for_key(shardings) + errors.extend(_validate_sharding_for_key(shardings)) + if errors: + errors = '\n'.join(str(e) for e in errors) + raise CheckpointingException(f'Invalid sharding pattern validation. 
Errors: {errors}') -def _validate_sharding_for_key(rank_sharding: List[Tuple[int, ShardedTensor]]): + +def _validate_sharding_for_key( + rank_sharding: List[Tuple[int, ShardedTensor]] +) -> List[CheckpointingException]: some_rank_shard = rank_sharding[0][1] global_shape = some_rank_shard.global_shape local_shape = some_rank_shard.local_shape dtype = some_rank_shard.dtype - has_flattened_range = some_rank_shard.flattened_range is not None has_regular_sharding_grid = some_rank_shard.has_regular_grid for rank, sharding in rank_sharding: assert sharding.dtype == dtype, (sharding.dtype, dtype, some_rank_shard) @@ -465,16 +471,21 @@ def _validate_sharding_for_key(rank_sharding: List[Tuple[int, ShardedTensor]]): some_rank_shard, ) + errors = [] if not has_regular_sharding_grid: # In case of uneven sharding we defer the validation to DCP - return + return errors shard_access_cnt = _compute_shards_access(rank_sharding) if not torch.all(shard_access_cnt == 1): - raise CheckpointingException( - f"Invalid access pattern for {rank_sharding[0][1]}: {shard_access_cnt}" + errors.append( + CheckpointingException( + f'Invalid access pattern for {rank_sharding[0][1]}: {shard_access_cnt}' + ) ) + return errors + def _compute_shards_access(rank_sharding): shard_access_cnt = torch.zeros( @@ -486,20 +497,24 @@ def _compute_shards_access(rank_sharding): return shard_access_cnt -def _validate_objects_for_key(sharded_objects: List[ShardedObject]): +def _validate_objects_for_key(sharded_objects: List[ShardedObject]) -> List[CheckpointingException]: """Ensure uniqueness of saved objects.""" unique_keys = [ sh_obj.unique_key for _, sh_obj in sharded_objects if is_main_replica(sh_obj.replica_id) ] + errors = [] if len(unique_keys) != len(set(unique_keys)): duplicates = {k: cnt for k, cnt in Counter(unique_keys).items() if cnt > 1} logger.error(f"Duplicate ShardedObject keys and counts: {duplicates}") - raise CheckpointingException(f"Duplicate ShardedObject keys: {list(duplicates.keys())}") + 
errors.append( + CheckpointingException(f'Duplicate ShardedObject keys: {list(duplicates.keys())}') + ) expected_shard_num = np.prod(sharded_objects[0][1].global_shape) if len(unique_keys) != expected_shard_num: err_msg = f"Invalid access pattern: {expected_shard_num - len(unique_keys)} ShardedObject are missing." logger.error(f"{err_msg} Existing shards: {unique_keys}") - raise CheckpointingException(err_msg) + errors.append(CheckpointingException(err_msg)) + return errors def determine_global_metadata( From a116ce3f86582f1e85b2db61757e7c75e41b3d1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 6 Feb 2026 17:08:51 +0100 Subject: [PATCH 073/231] docs: Add changelog for 0.15.3 (#3286) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- CHANGELOG.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index babdc18b8a4..478ee56ada0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,20 @@ # Changelog +## NVIDIA Megatron Core 0.15.3 + +This release addresses known security issues. For the latest NVIDIA Vulnerability Disclosure Information visit https://www.nvidia.com/en-us/security/, for acknowledgement please reach out to the NVIDIA PSIRT team at PSIRT@nvidia.com + +## NVIDIA Megatron Core 0.15.2 + +* Bug fixes + * Various small fixes for Megatron-FSDP. [#2346](https://github.com/NVIDIA/Megatron-LM/pull/2346) + * [Megatron-FSDP] Support both old and new DeviceMesh APIs. [#2575](https://github.com/NVIDIA/Megatron-LM/pull/2575) + * [Megatron-FSDP] Build default FSDP DeviceMesh, and remove model arg from `fully_shard_optimizer()`. [#2471](https://github.com/NVIDIA/Megatron-LM/pull/2471) + +## NVIDIA Megatron Core 0.15.1 + +Yanked release. 
+ ## NVIDIA Megatron Core 0.15.0 * Features From 4376cc5d889676e56167f9fdd864a870dc5f8902 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Fri, 6 Feb 2026 14:44:14 -0600 Subject: [PATCH 074/231] ci: Set throughput tests as flaky (#3301) Signed-off-by: Charlie Truong --- tests/test_utils/recipes/h100/gpt-grpo.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_utils/recipes/h100/gpt-grpo.yaml b/tests/test_utils/recipes/h100/gpt-grpo.yaml index 500c19b4f6e..cd51c07600f 100644 --- a/tests/test_utils/recipes/h100/gpt-grpo.yaml +++ b/tests/test_utils/recipes/h100/gpt-grpo.yaml @@ -57,12 +57,12 @@ products: - test_case: [gpt_grpo_tp4_pp1_dp2_8b_throughput] products: - environment: [dev] - scope: [mr] + scope: [flaky] platforms: [dgx_h100] - test_case: [gpt_grpo_tp4_pp1_dp2_8b_cudagraphs_throughput] products: - environment: [dev] - scope: [mr] + scope: [flaky] platforms: [dgx_h100] - test_case: [gpt_grpo_tp4_pp1_dp2_8b_throughput_github] products: From f92460bff3cf0e0a1fa68b0b2b2739c348515edd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 6 Feb 2026 23:09:00 +0100 Subject: [PATCH 075/231] chore: Move GB200 tests to nightly (#3302) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- tests/test_utils/recipes/gb200/gpt.yaml | 98 ++++++++++++------------- tests/test_utils/recipes/gb200/moe.yaml | 30 ++++---- 2 files changed, 64 insertions(+), 64 deletions(-) diff --git a/tests/test_utils/recipes/gb200/gpt.yaml b/tests/test_utils/recipes/gb200/gpt.yaml index f387fbb9a13..270bda14a17 100644 --- a/tests/test_utils/recipes/gb200/gpt.yaml +++ b/tests/test_utils/recipes/gb200/gpt.yaml @@ -107,12 +107,12 @@ products: - test_case: [gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset] products: - 
environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer] products: @@ -122,146 +122,146 @@ products: - test_case: [gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] # - test_case: [gpt3_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic] # products: # - environment: [dev] - # scope: [mr] + # scope: [nightly] - test_case: [gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp1_pp4_vp1] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] # - test_case: 
[gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist] # products: # - environment: [dev] - # scope: [mr] + # scope: [nightly] # platforms: [dgx_gb200] # Hangs: #513 - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp2_cp2] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp2_cp2_nondeterministic] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last] products: - environment: [dev] - scope: [mr] + scope: 
[nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp1_gdn] products: @@ -271,62 +271,62 @@ products: - test_case: [gpt3_mcore_te_tp2_pp2_mla] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode] products: - environment: 
[dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_tp2_pp2_uninstall_te] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_7b_tp1_pp4_memory_speed] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_7b_tp4_pp1_memory_speed] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp1_modelopt_distill_resume] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] ####################################################################### # Super important mr, mr-github tests that run for DEV per mr, mr-github # @@ -344,7 +344,7 @@ products: - test_case: [gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather] products: @@ -354,27 +354,27 @@ products: - test_case: [gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] # - test_case: [gpt3_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone] # products: # - environment: [dev] - # scope: [mr] + # scope: [nightly] # platforms: [dgx_gb200] # - test_case: [gpt3_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer] # products: # - environment: [dev] - # scope: [mr] + # scope: [nightly] # platforms: [dgx_gb200] # - test_case: [gpt3_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu] # products: # - environment: [dev] - # scope: [mr] + # scope: [nightly] # platforms: [dgx_gb200] # - test_case: 
[gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic] # products: # - environment: [dev] - # scope: [mr] + # scope: [nightly] # platforms: [dgx_a100, dgx_gb200] # - test_case: [gpt3_weekly_dgx_b200_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap] # products: diff --git a/tests/test_utils/recipes/gb200/moe.yaml b/tests/test_utils/recipes/gb200/moe.yaml index 28ae2415aac..bfa760caa29 100644 --- a/tests/test_utils/recipes/gb200/moe.yaml +++ b/tests/test_utils/recipes/gb200/moe.yaml @@ -110,12 +110,12 @@ products: - test_case: [gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] # - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8] # products: # - environment: [dev] - # scope: [mr] + # scope: [nightly] # platforms: [dgx_gb200] # hang: #513 - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph] products: @@ -125,57 +125,57 @@ products: # - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental] # products: # - environment: [dev] - # scope: [mr] + # scope: [nightly] # platforms: [dgx_gb200] # hang: #513 - test_case: [gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: 
[gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] # - test_case: [gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading] # products: # - environment: [dev] - # scope: [mr] + # scope: [nightly] # platforms: [dgx_gb200] # - test_case: [gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading] # products: # - environment: [dev] - # scope: [mr] + # scope: [nightly] # platforms: [dgx_gb200] - test_case: [gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] - test_case: [gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_muon] products: @@ -198,12 +198,12 @@ products: # - test_case: [gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer] # products: # - environment: [dev] - # scope: [mr] + # scope: [nightly] # platforms: [dgx_gb200] # - test_case: [gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM] # products: # - environment: [dev] - # scope: [mr] + # scope: [nightly] # platforms: [dgx_gb200] ########################### # Merge train tests # From cfbe9b500074c0e5859ec3c6e72e1d5c60094e7a Mon Sep 17 00:00:00 2001 From: Nick Schank Date: Fri, 6 Feb 2026 17:06:53 -0500 Subject: [PATCH 076/231] Ensure type-checker understands use of Submodules in bert_model (#3256) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: oliver könig Co-authored-by: Yashaswi Karnati <144376261+yashaswikarnati@users.noreply.github.com> --- 
megatron/core/models/bert/bert_model.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/megatron/core/models/bert/bert_model.py b/megatron/core/models/bert/bert_model.py index abe9bc1c9b7..095d78d2c8b 100644 --- a/megatron/core/models/bert/bert_model.py +++ b/megatron/core/models/bert/bert_model.py @@ -15,6 +15,7 @@ from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding from megatron.core.models.common.language_module.language_module import LanguageModule from megatron.core.process_groups_config import ProcessGroupCollection +from megatron.core.transformer.attention import SelfAttentionSubmodules from megatron.core.transformer.dot_product_attention import ( DotProductAttention as MCoreDotProductAttention, ) @@ -22,6 +23,7 @@ from megatron.core.transformer.spec_utils import ModuleSpec from megatron.core.transformer.transformer_block import TransformerBlock from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.transformer_layer import TransformerLayerSubmodules from megatron.core.transformer.utils import get_linear_layer from megatron.core.utils import deprecate_inference_params from megatron.core.utils import get_te_version as _get_te_version @@ -187,6 +189,11 @@ def _sanity_check_attention_and_get_attn_mask_dimension(self) -> str: """ attention_backend = self.config.attention_backend attn_mask_dimensions = None + assert isinstance(self.transformer_layer_spec.submodules, TransformerLayerSubmodules) + assert isinstance( + self.transformer_layer_spec.submodules.self_attention.submodules, + SelfAttentionSubmodules, + ) # For local layer spec we just use b1ss if ( self.transformer_layer_spec.submodules.self_attention.submodules.core_attention From a63d04582afd10e7cd317195258f179f84a1278a Mon Sep 17 00:00:00 2001 From: Nick Schank Date: Fri, 6 Feb 2026 19:55:11 -0500 Subject: [PATCH 077/231] Override extra_repr instead of __repr__ (#3200) Co-authored-by: Chris Grimm 
Co-authored-by: Eric Harper Co-authored-by: Xin Yao Co-authored-by: Charlie Truong --- .../core/extensions/transformer_engine.py | 31 +++++++++++++------ megatron/core/tensor_parallel/layers.py | 21 +++++++++---- 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/megatron/core/extensions/transformer_engine.py b/megatron/core/extensions/transformer_engine.py index ef8527e9e5e..455111b8c03 100644 --- a/megatron/core/extensions/transformer_engine.py +++ b/megatron/core/extensions/transformer_engine.py @@ -15,6 +15,7 @@ from packaging.version import Version as PkgVersion from torch import Tensor from torch.nn.parameter import Parameter +from typing_extensions import override from megatron.core.dist_checkpointing.mapping import ShardedStateDict from megatron.core.dist_checkpointing.utils import replace_prefix_for_sharding @@ -919,10 +920,14 @@ def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None): dp_cp_group=metadata["dp_cp_group"], ) - def __repr__(self): + @override + def extra_repr(self) -> str: + """Extra context to add to the module's string representation.""" return ( - f"{type(self).__name__}(in_features={self.in_features}, " - f"out_features={self.out_features}, bias={self.use_bias}, TP={self.tp_size})" + f"in_features={self.in_features}, " + f"out_features={self.out_features}, " + f"bias={self.use_bias}, " + f"TP={self.tp_size}" ) def backward_dw(self): @@ -1025,10 +1030,14 @@ def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None): dp_cp_group=metadata["dp_cp_group"], ) - def __repr__(self): + @override + def extra_repr(self) -> str: + """Extra context to add to the module's string representation.""" return ( - f"{type(self).__name__}(in_features={self.in_features}, " - f"out_features={self.out_features}, bias={self.use_bias}, TP={self.tp_size})" + f"in_features={self.in_features}, " + f"out_features={self.out_features}, " + f"bias={self.use_bias}, " + f"TP={self.tp_size}" ) def backward_dw(self): @@ 
-1125,10 +1134,14 @@ def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None): dp_cp_group=metadata["dp_cp_group"], ) - def __repr__(self): + @override + def extra_repr(self) -> str: + """Extra context to add to the module's string representation.""" return ( - f"{type(self).__name__}(in_features={self.in_features}, " - f"out_features={self.out_features}, bias={self.use_bias}, TP={self.tp_size})" + f"in_features={self.in_features}, " + f"out_features={self.out_features}, " + f"bias={self.use_bias}, " + f"TP={self.tp_size}" ) def backward_dw(self): diff --git a/megatron/core/tensor_parallel/layers.py b/megatron/core/tensor_parallel/layers.py index 69f442eb2d4..77674e64519 100644 --- a/megatron/core/tensor_parallel/layers.py +++ b/megatron/core/tensor_parallel/layers.py @@ -11,6 +11,7 @@ import torch import torch.nn.functional as F from torch.nn.parameter import Parameter +from typing_extensions import override from megatron.core.model_parallel_config import ModelParallelConfig from megatron.core.parallel_state import ( @@ -1076,12 +1077,16 @@ def get_extra_state(self) -> None: """Keep compatibility with TE state dict.""" return None - def __repr__(self): + @override + def extra_repr(self) -> str: + """Extra context to add to the module's string representation.""" tp = self.output_size // self.output_size_per_partition use_bias = self.bias is not None and self.bias is True return ( - f"{type(self).__name__}(in_features={self.input_size}, " - f"out_features={self.output_size}, bias={use_bias}, TP={tp})" + f"in_features={self.input_size}, " + f"out_features={self.output_size}, " + f"bias={use_bias}, " + f"TP={tp}" ) @@ -1319,10 +1324,14 @@ def get_extra_state(self) -> None: """Keep compatibility with TE state dict.""" return None - def __repr__(self): + @override + def extra_repr(self) -> str: + """Extra context to add to the module's string representation.""" tp = self.input_size // self.input_size_per_partition use_bias = self.bias is not None and 
self.bias is True return ( - f"{type(self).__name__}(in_features={self.input_size}, " - f"out_features={self.output_size}, bias={use_bias}, TP={tp})" + f"in_features={self.input_size}, " + f"out_features={self.output_size}, " + f"bias={use_bias}, " + f"TP={tp}" ) From f68c7c10fa6367dac597ec8fa3944ed315a15492 Mon Sep 17 00:00:00 2001 From: Nick Schank Date: Fri, 6 Feb 2026 20:50:06 -0500 Subject: [PATCH 078/231] Replace ModuleSpec with Protocols for LayerNorm submodules (#3090) Co-authored-by: Xin Yao Co-authored-by: Charlie Truong --- examples/multimodal/layer_specs.py | 4 +- examples/multimodal/nvlm/internvit.py | 21 +++---- examples/multimodal/radio/radio_g.py | 4 +- .../core/extensions/transformer_engine.py | 9 ++- .../transformer_engine_spec_provider.py | 3 +- megatron/core/models/T5/t5_spec.py | 2 +- megatron/core/models/backends.py | 8 +-- .../core/models/gpt/fine_grained_callables.py | 7 ++- .../core/post_training/modelopt/layers.py | 18 ++++-- megatron/core/ssm/mamba_layer.py | 16 ++++-- megatron/core/tensor_parallel/random.py | 31 ++++++++-- megatron/core/transformer/attention.py | 15 +++-- .../transformer/multi_latent_attention.py | 18 +++--- .../transformer/multi_token_prediction.py | 34 +++++------ megatron/core/transformer/torch_norm.py | 26 +++++++-- .../core/transformer/transformer_block.py | 11 ++-- .../core/transformer/transformer_layer.py | 57 +++++++++++-------- 17 files changed, 178 insertions(+), 106 deletions(-) diff --git a/examples/multimodal/layer_specs.py b/examples/multimodal/layer_specs.py index 56821f2cec6..f341173c528 100644 --- a/examples/multimodal/layer_specs.py +++ b/examples/multimodal/layer_specs.py @@ -77,7 +77,7 @@ def get_layer_spec(is_vit, normalization) -> ModuleSpec: return ModuleSpec( module=TransformerLayer, submodules=TransformerLayerSubmodules( - input_layernorm=norm, + input_layernorm=not_none(norm), self_attention=ModuleSpec( module=SelfAttention, params={"attn_mask_type": attn_mask_type}, @@ -90,7 +90,7 @@ def 
get_layer_spec(is_vit, normalization) -> ModuleSpec: ), ), self_attn_bda=get_bias_dropout_add, - pre_mlp_layernorm=norm, + pre_mlp_layernorm=not_none(norm), mlp=mlp, mlp_bda=get_bias_dropout_add, ), diff --git a/examples/multimodal/nvlm/internvit.py b/examples/multimodal/nvlm/internvit.py index 9df9af23f05..cb95129c02a 100644 --- a/examples/multimodal/nvlm/internvit.py +++ b/examples/multimodal/nvlm/internvit.py @@ -34,10 +34,11 @@ from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.mlp import MLP, MLPSubmodules from megatron.core.transformer.module import MegatronModule -from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.spec_utils import ModuleSpec from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint +from megatron.core.typed_torch import not_none from megatron.core.utils import divide try: @@ -61,7 +62,7 @@ class InternViTRMSNorm(MegatronModule): def __init__( self, - config, + config: TransformerConfig, hidden_size: int, eps: float = 1e-6, sequence_parallel: bool = False, @@ -93,7 +94,7 @@ def _norm(self, x, var): return x * torch.rsqrt(var + self.eps) - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: """Run RMSNorm with an option to compute custom statistic.""" var = None if self._compute_var: @@ -184,7 +185,7 @@ def __init__( self.config.hidden_size, self.query_projection_size + 2 * self.kv_projection_size, config=self.config, - init_method=self.config.init_method, + init_method=not_none(self.config.init_method), gather_output=False, bias=qkv_bias, skip_bias_add=False, @@ -196,20 +197,16 @@ def __init__( self.hidden_size_per_attention_head * self.num_attention_heads_per_partition ) # 512 for internvit - self.q_layernorm = build_module( - 
submodules.q_layernorm, + self.q_layernorm = not_none(submodules.q_layernorm)( hidden_size=qk_layernorm_hidden_size, config=self.config, eps=self.config.layernorm_epsilon, - compute_var=True, ) - self.k_layernorm = build_module( - submodules.k_layernorm, + self.k_layernorm = not_none(submodules.k_layernorm)( hidden_size=qk_layernorm_hidden_size, config=self.config, eps=self.config.layernorm_epsilon, - compute_var=True, ) @@ -249,8 +246,8 @@ def get_internvit_layer_spec(use_te) -> ModuleSpec: linear_qkv=TEColumnParallelLinear if use_te else ColumnParallelLinear, core_attention=TEDotProductAttention if use_te else DotProductAttention, linear_proj=TERowParallelLinear if use_te else RowParallelLinear, - q_layernorm=InternViTRMSNorm, - k_layernorm=InternViTRMSNorm, + q_layernorm=partial(InternViTRMSNorm, compute_var=True), + k_layernorm=partial(InternViTRMSNorm, compute_var=True), ), ), self_attn_bda=get_bias_dropout_add_layer_scaling, diff --git a/examples/multimodal/radio/radio_g.py b/examples/multimodal/radio/radio_g.py index f139632df86..e2e00b2c7de 100644 --- a/examples/multimodal/radio/radio_g.py +++ b/examples/multimodal/radio/radio_g.py @@ -93,7 +93,7 @@ def get_radio_g_layer_spec(normalization) -> ModuleSpec: return ModuleSpec( module=LayerScalingTransformerLayer, submodules=TransformerLayerSubmodules( - input_layernorm=norm, + input_layernorm=not_none(norm), self_attention=ModuleSpec( module=SelfAttention, params={"attn_mask_type": attn_mask_type}, @@ -106,7 +106,7 @@ def get_radio_g_layer_spec(normalization) -> ModuleSpec: ), ), self_attn_bda=get_bias_dropout_add_layer_scaling, - pre_mlp_layernorm=norm, + pre_mlp_layernorm=not_none(norm), mlp=mlp, mlp_bda=get_bias_dropout_add_layer_scaling, ), diff --git a/megatron/core/extensions/transformer_engine.py b/megatron/core/extensions/transformer_engine.py index 455111b8c03..81664a91c0b 100644 --- a/megatron/core/extensions/transformer_engine.py +++ b/megatron/core/extensions/transformer_engine.py @@ -8,7 +8,7 @@ 
import pickle import warnings from contextlib import nullcontext -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set, Tuple +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set, Tuple, cast import torch import torch.nn.functional as F @@ -44,6 +44,7 @@ from megatron.core.tensor_parallel.utils import divide from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.mlp import MLP +from megatron.core.transformer.torch_norm import LayerNormInterface from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.utils import ( ensure_metadata_has_dp_cp_group, @@ -436,7 +437,9 @@ class TENorm: Transformer-Engine's `LayerNorm` or `RMSNorm` based on input.""" # TODO should we ditch normalization config and just use spec to choose LayerNorm vs RMSNorm? - def __new__(cls, config: TransformerConfig, hidden_size: int, eps: float = 1e-5): + def __new__( + cls, config: TransformerConfig, hidden_size: int, eps: float = 1e-5 + ) -> LayerNormInterface: if not HAVE_TE: raise ImportError( "Transformer Engine is not installed. 
" @@ -465,7 +468,7 @@ def __new__(cls, config: TransformerConfig, hidden_size: int, eps: float = 1e-5) else: raise Exception("Only LayerNorm and RMSNorm are curently supported") - return instance + return cast(LayerNormInterface, instance) class TELinear(te.pytorch.Linear): diff --git a/megatron/core/extensions/transformer_engine_spec_provider.py b/megatron/core/extensions/transformer_engine_spec_provider.py index a071959bfc9..c960a90716c 100644 --- a/megatron/core/extensions/transformer_engine_spec_provider.py +++ b/megatron/core/extensions/transformer_engine_spec_provider.py @@ -19,6 +19,7 @@ from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear from megatron.core.transformer.mlp import MLPSubmodules from megatron.core.transformer.moe.experts import GroupedMLP, SequentialMLP, TEGroupedMLP +from megatron.core.transformer.torch_norm import LayerNormBuilder from megatron.core.utils import get_te_version, is_te_min_version @@ -45,7 +46,7 @@ def column_parallel_layer_norm_linear(self) -> Optional[type]: """Which module for sequential layernorm and linear""" return TELayerNormColumnParallelLinear - def layer_norm(self, rms_norm: bool = False, for_qk: bool = False) -> type: + def layer_norm(self, rms_norm: bool = False, for_qk: bool = False) -> LayerNormBuilder: """Which module to use for layer norm""" if for_qk and not is_te_min_version("1.9.0"): # TENorm significantly harms convergence when used diff --git a/megatron/core/models/T5/t5_spec.py b/megatron/core/models/T5/t5_spec.py index 50aecf0a950..0c0bd08de22 100644 --- a/megatron/core/models/T5/t5_spec.py +++ b/megatron/core/models/T5/t5_spec.py @@ -102,7 +102,7 @@ def decoder_model_with_transformer_engine_default_spec() -> ModuleSpec: ), ), self_attn_bda=get_bias_dropout_add, - pre_cross_attn_layernorm=TENorm, + pre_cross_attn_layernorm=not_none(TENorm), cross_attention=ModuleSpec( module=CrossAttention, params={"attn_mask_type": AttnMaskType.padding}, diff --git 
a/megatron/core/models/backends.py b/megatron/core/models/backends.py index 7f84599a04c..21044486c7e 100644 --- a/megatron/core/models/backends.py +++ b/megatron/core/models/backends.py @@ -8,7 +8,7 @@ from megatron.core.transformer.dot_product_attention import DotProductAttention from megatron.core.transformer.mlp import MLPSubmodules from megatron.core.transformer.moe.experts import GroupedMLP, SequentialMLP -from megatron.core.transformer.torch_norm import WrappedTorchNorm +from megatron.core.transformer.torch_norm import LayerNormBuilder, WrappedTorchNorm try: import apex # pylint: disable=unused-import @@ -60,7 +60,7 @@ def column_parallel_layer_norm_linear(self) -> Optional[type]: ... @abstractmethod - def layer_norm(self, rms_norm: bool = False, for_qk: bool = False) -> type: + def layer_norm(self, rms_norm: bool = False, for_qk: bool = False) -> LayerNormBuilder: """Which module for layernorm""" ... @@ -101,7 +101,7 @@ def column_parallel_layer_norm_linear(self) -> Optional[type]: """Which module for sequential layernorm and linear""" return None - def layer_norm(self, rms_norm: bool = False, for_qk: bool = False) -> type: + def layer_norm(self, rms_norm: bool = False, for_qk: bool = False) -> LayerNormBuilder: """Which module to use for layer norm""" if rms_norm: # Matching get_gpt_layer_local_spec. 
@@ -157,7 +157,7 @@ def column_parallel_layer_norm_linear(self) -> type[InferenceLayerNormColumnPara """Which module for sequential layernorm and linear""" return InferenceLayerNormColumnParallelLinear - def layer_norm(self, rms_norm: bool = False, for_qk: bool = False) -> type: + def layer_norm(self, rms_norm: bool = False, for_qk: bool = False) -> LayerNormBuilder: """Which module to use for layer norm""" if for_qk and not is_te_min_version("1.9.0"): # TENorm significantly harms convergence when used diff --git a/megatron/core/models/gpt/fine_grained_callables.py b/megatron/core/models/gpt/fine_grained_callables.py index e17ed0a5d40..2cb77961054 100644 --- a/megatron/core/models/gpt/fine_grained_callables.py +++ b/megatron/core/models/gpt/fine_grained_callables.py @@ -22,6 +22,7 @@ get_mtp_layer_offset, ) from megatron.core.transformer.transformer_layer import TransformerLayer, make_viewless_tensor +from megatron.core.typed_torch import apply_module from megatron.core.utils import internal_api @@ -466,13 +467,15 @@ def forward_func( layer.offload_mlp_norm, hidden_states, "mlp_norm" ) as hidden_states: pre_mlp_layernorm_output = layer.pre_mlp_norm_checkpoint.checkpoint( - layer.pre_mlp_layernorm, hidden_states + apply_module(layer.pre_mlp_layernorm), hidden_states ) else: with off_interface( layer.offload_mlp_norm, hidden_states, "mlp_norm" ) as hidden_states: - pre_mlp_layernorm_output = layer.pre_mlp_layernorm(hidden_states) + pre_mlp_layernorm_output = apply_module(layer.pre_mlp_layernorm)( + hidden_states + ) shared_expert_output = layer.mlp.shared_experts_compute(pre_mlp_layernorm_output) probs, routing_map = layer.mlp.route(pre_mlp_layernorm_output) diff --git a/megatron/core/post_training/modelopt/layers.py b/megatron/core/post_training/modelopt/layers.py index 0045f6ea9f4..45aabf3db66 100644 --- a/megatron/core/post_training/modelopt/layers.py +++ b/megatron/core/post_training/modelopt/layers.py @@ -1,12 +1,14 @@ # Copyright (c) 2025, NVIDIA CORPORATION. 
All rights reserved. +from __future__ import annotations import logging -from typing import Callable, List, Optional +from typing import TYPE_CHECKING, Callable, List, Optional, cast import torch from megatron.core.extensions.transformer_engine import _get_extra_te_kwargs from megatron.core.model_parallel_config import ModelParallelConfig +from megatron.core.transformer.torch_norm import LayerNormInterface from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.transformer_layer import TransformerLayer from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint @@ -18,7 +20,13 @@ HAVE_TE = True except ImportError: - HAVE_TE = False + if TYPE_CHECKING: + # Unambiguously treat transformer_engine as available during type checking. + import transformer_engine as te # type: ignore[import] + + HAVE_TE = True + else: + HAVE_TE = False logger = logging.getLogger(__name__) @@ -53,7 +61,9 @@ class Norm: mismatch issue. """ - def __new__(cls, config: TransformerConfig, hidden_size: int, eps: float = 1e-5): + def __new__( + cls, config: TransformerConfig, hidden_size: int, eps: float = 1e-5 + ) -> LayerNormInterface: if not HAVE_TE: raise ImportError( "Transformer-Engine is not installed, please install it with " @@ -95,7 +105,7 @@ def _load_state_dict_pre_hook( instance._register_state_dict_hook(_state_dict_hook) instance._register_load_state_dict_pre_hook(_load_state_dict_pre_hook) - return instance + return cast(LayerNormInterface, instance) class Linear(torch.nn.Linear): diff --git a/megatron/core/ssm/mamba_layer.py b/megatron/core/ssm/mamba_layer.py index bc5ad42d005..8c2f59369a5 100644 --- a/megatron/core/ssm/mamba_layer.py +++ b/megatron/core/ssm/mamba_layer.py @@ -6,7 +6,7 @@ # LICENSE file in the root directory of this source tree. 
from dataclasses import dataclass, field -from typing import Dict, Optional, Tuple, Union +from typing import Dict, Optional, Protocol, Tuple, Union import torch from torch import Tensor @@ -20,10 +20,18 @@ from megatron.core.transformer.identity_op import IdentityOp from megatron.core.transformer.module import GraphableMegatronModule from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.torch_norm import LayerNormInterface from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.typed_torch import apply_module from megatron.core.utils import deprecate_inference_params +class LayerNormBuilder(Protocol): + """A protocol showing how MambaLayer expects to construct its LayerNorm.""" + + def __call__(self, config: TransformerConfig, hidden_size: int, /) -> LayerNormInterface: ... + + @dataclass class MambaLayerSubmodules: """ @@ -40,7 +48,7 @@ class MambaLayerSubmodules: after the mixer. """ - norm: Union[ModuleSpec, type] = IdentityOp + norm: LayerNormBuilder = IdentityOp mixer: Union[ModuleSpec, type] = IdentityOp mamba_bda: Union[ModuleSpec, type] = IdentityOp @@ -82,7 +90,7 @@ def __init__( pg_collection=pg_collection, pp_layer_offset=pp_layer_offset, ) - self.norm = build_module(submodules.norm, self.config, self.config.hidden_size) + self.norm = submodules.norm(self.config, self.config.hidden_size) self.mamba_bda = build_module(submodules.mamba_bda) self.bias_dropout_add_exec_handler = torch.enable_grad @@ -132,7 +140,7 @@ def forward( residual = residual.to(torch.float32) hidden_states = hidden_states.to(dtype=self.config.params_dtype) - hidden_states = self.norm(hidden_states) + hidden_states = apply_module(self.norm)(hidden_states) mixer_out_with_bias = self.mixer( hidden_states, inference_context=inference_context, packed_seq_params=packed_seq_params diff --git a/megatron/core/tensor_parallel/random.py b/megatron/core/tensor_parallel/random.py index 
bf00717ab6c..2ba5af0962b 100644 --- a/megatron/core/tensor_parallel/random.py +++ b/megatron/core/tensor_parallel/random.py @@ -2,16 +2,19 @@ # Parts of the code here are adapted from PyTorch # repo: https://github.com/pytorch/pytorch +from __future__ import annotations import contextlib import logging -from typing import Optional, Union +from collections.abc import Callable +from typing import Any, Optional, TypeVar, Union import torch from torch import _C from torch.cuda import _lazy_call, _lazy_init from torch.cuda import device as device_ctx_manager from torch.utils.checkpoint import detach_variable +from typing_extensions import TypeVarTuple, Unpack from megatron.core.parallel_state import ( get_expert_model_parallel_rank, @@ -493,6 +496,10 @@ def is_checkpointing(): return IS_CHECKPOINTING +_R = TypeVar('_R') +_Ts = TypeVarTuple('_Ts') + + class CheckpointFunction(torch.autograd.Function): """Checkpoint Function @@ -503,7 +510,12 @@ class CheckpointFunction(torch.autograd.Function): # pylint: disable=missing-function-docstring @staticmethod - def forward(ctx, run_function, distribute_saved_activations, *args): + def forward( + ctx: Any, + run_function: Callable[[Unpack[_Ts]], _R], + distribute_saved_activations: bool, + *args: Unpack[_Ts], + ) -> _R: """Forward pass.""" _set_checkpointing() @@ -570,7 +582,9 @@ def backward(ctx, *args): return (None, None) + grads -def checkpoint(function, distribute_saved_activations, *args): +def checkpoint( + function: Callable[[Unpack[_Ts]], _R], distribute_saved_activations: bool, *args: Unpack[_Ts] +) -> _R: """Checkpoint a model or part of the model. This has been directly copied from torch.utils.checkpoint.""" return CheckpointFunction.apply(function, distribute_saved_activations, *args) @@ -578,12 +592,17 @@ def checkpoint(function, distribute_saved_activations, *args): class CheckpointWithoutOutputFunction(torch.autograd.Function): """ - Checkpoint Function Helper for CheckpointWithouOutput. 
+ Checkpoint Function Helper for CheckpointWithoutOutput. Save context for recompute. """ @staticmethod - def forward(ctx, run_function, checkpoint_without_output_obj, *args): + def forward( + ctx: Any, + run_function: Callable[[Unpack[_Ts]], _R], + checkpoint_without_output_obj: CheckpointWithoutOutput, + *args: Unpack[_Ts], + ) -> _R: """Forward pass.""" if checkpoint_without_output_obj.fp8: fp8 = FP8GlobalStateManager.is_fp8_enabled() @@ -641,7 +660,7 @@ def __init__(self, fp8=False): self.ctx = None self.outputs = None - def checkpoint(self, run_function, *args): + def checkpoint(self, run_function: Callable[[Unpack[_Ts]], _R], *args: Unpack[_Ts]) -> _R: """Checkpoint function.""" # If in cuda graph warmup, disable checkpointing, as 'discard_output_and_register_recompute' diff --git a/megatron/core/transformer/attention.py b/megatron/core/transformer/attention.py index 2200b558225..019c6fef396 100644 --- a/megatron/core/transformer/attention.py +++ b/megatron/core/transformer/attention.py @@ -34,6 +34,7 @@ from megatron.core.transformer.identity_op import IdentityOp from megatron.core.transformer.module import MegatronModule from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.torch_norm import LayerNormBuilder from megatron.core.typed_torch import apply_module, not_none from megatron.core.utils import ( deprecate_inference_params, @@ -219,8 +220,8 @@ class SelfAttentionSubmodules: linear_qkv: LinearQkvBuilder core_attention: CoreAttentionBuilder linear_proj: Union[ModuleSpec, type] = None - q_layernorm: Union[ModuleSpec, type] = None - k_layernorm: Union[ModuleSpec, type] = None + q_layernorm: LayerNormBuilder | None = None + k_layernorm: LayerNormBuilder | None = None @dataclass @@ -1278,8 +1279,7 @@ def __init__( ) if submodules.q_layernorm is not None: - self.q_layernorm = build_module( - submodules.q_layernorm, + self.q_layernorm = submodules.q_layernorm( hidden_size=self.hidden_size_per_attention_head, 
config=self.config, eps=self.config.layernorm_epsilon, @@ -1288,8 +1288,7 @@ def __init__( self.q_layernorm = None if submodules.k_layernorm is not None: - self.k_layernorm = build_module( - submodules.k_layernorm, + self.k_layernorm = submodules.k_layernorm( hidden_size=self.hidden_size_per_attention_head, config=self.config, eps=self.config.layernorm_epsilon, @@ -1475,10 +1474,10 @@ def get_query_key_value_tensors( query = query[:, :, idx * size : (idx + 1) * size, :] if self.q_layernorm is not None: - query = self.q_layernorm(query) + query = apply_module(self.q_layernorm)(query) if self.k_layernorm is not None: - key = self.k_layernorm(key) + key = apply_module(self.k_layernorm)(key) if self.config.test_mode: self.run_realtime_tests() diff --git a/megatron/core/transformer/multi_latent_attention.py b/megatron/core/transformer/multi_latent_attention.py index cd3db50a35b..4783397d68f 100644 --- a/megatron/core/transformer/multi_latent_attention.py +++ b/megatron/core/transformer/multi_latent_attention.py @@ -38,7 +38,9 @@ ) from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.torch_norm import LayerNormBuilder from megatron.core.transformer.transformer_config import MLATransformerConfig +from megatron.core.typed_torch import apply_module from megatron.core.utils import deprecate_inference_params, get_pg_size, is_te_min_version try: @@ -69,6 +71,10 @@ class MLASelfAttentionSubmodules: """Submodules for the MLA self-attention layer.""" + # TODO(nschank): Move layernorms back to the bottom once all other layers have defaults removed. 
+ q_layernorm: LayerNormBuilder + kv_layernorm: LayerNormBuilder + linear_q_proj: Union[ModuleSpec, type] = None linear_q_down_proj: Union[ModuleSpec, type] = None linear_q_up_proj: Union[ModuleSpec, type] = None @@ -76,8 +82,6 @@ class MLASelfAttentionSubmodules: linear_kv_up_proj: Union[ModuleSpec, type] = None core_attention: Union[ModuleSpec, type] = None linear_proj: Union[ModuleSpec, type] = None - q_layernorm: Union[ModuleSpec, type] = None - kv_layernorm: Union[ModuleSpec, type] = None class MultiLatentAttention(Attention): @@ -495,15 +499,13 @@ def __init__( ) if self.config.q_lora_rank is not None: - self.q_layernorm = build_module( - submodules.q_layernorm, + self.q_layernorm = submodules.q_layernorm( hidden_size=self.config.q_lora_rank, config=self.config, eps=self.config.layernorm_epsilon, ) - self.kv_layernorm = build_module( - submodules.kv_layernorm, + self.kv_layernorm = submodules.kv_layernorm( hidden_size=self.config.kv_lora_rank, config=self.config, eps=self.config.layernorm_epsilon, @@ -635,9 +637,9 @@ def get_query_key_value_tensors( if self.config.q_lora_rank is not None: # q_compressed: [num_tokens, q_lora_rank] - q_compressed = self.q_layernorm(q_compressed) + q_compressed = apply_module(self.q_layernorm)(q_compressed) - kv_compressed = self.kv_layernorm(kv_compressed) + kv_compressed = apply_module(self.kv_layernorm)(kv_compressed) # ========================================= # QKV up projection and RoPE apply diff --git a/megatron/core/transformer/multi_token_prediction.py b/megatron/core/transformer/multi_token_prediction.py index 1c431491ca2..7b670f1365e 100755 --- a/megatron/core/transformer/multi_token_prediction.py +++ b/megatron/core/transformer/multi_token_prediction.py @@ -23,8 +23,10 @@ from megatron.core.transformer.enums import AttnMaskType, LayerType from megatron.core.transformer.module import MegatronModule from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.torch_norm 
import LayerNormBuilder from megatron.core.transformer.transformer_block import TransformerBlockSubmodules from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.typed_torch import apply_module from megatron.core.utils import ( get_pg_rank, is_torch_min_version, @@ -409,21 +411,22 @@ class MultiTokenPredictionLayerSubmodules: Dataclass for specifying the submodules of a MultiTokenPrediction module. Args: - hnorm (Union[ModuleSpec, type]): Specification or instance of the - hidden states normalization to be applied. - enorm (Union[ModuleSpec, type]): Specification or instance of the - embedding normalization to be applied. + hnorm: Specification or instance of the hidden states normalization to be applied. + enorm: Specification or instance of the embedding normalization to be applied. eh_proj (Union[ModuleSpec, type]): Specification or instance of the linear projection to be applied. mtp_model_layer (Union[ModuleSpec, type]): Specification or instance of the transformer or mamba block to be applied. """ - enorm: Union[ModuleSpec, type] = None - hnorm: Union[ModuleSpec, type] = None + enorm: LayerNormBuilder + hnorm: LayerNormBuilder + # TODO(nschank): Move this back below transformer_layer once eh_proj and transformer_layer have + # their defaults removed. + layer_norm: LayerNormBuilder + eh_proj: Union[ModuleSpec, type] = None mtp_model_layer: Union[ModuleSpec, type] = None - layer_norm: Union[ModuleSpec, type] = None def get_mtp_layer_spec( @@ -449,7 +452,7 @@ def get_mtp_layer_spec_for_backend( ModuleSpec: Module specification with modules from the backend. """ column_parallel_linear_impl: type = backend.column_parallel_linear() - layer_norm_impl: type = backend.layer_norm() + layer_norm_impl = backend.layer_norm() mtp_layer_spec = ModuleSpec( module=MultiTokenPredictionLayer, submodules=MultiTokenPredictionLayerSubmodules( @@ -741,15 +744,13 @@ def __init__( f"The supported attention mask types are {SUPPORTED_ATTN_MASK}." 
) - self.enorm = build_module( - self.submodules.enorm, + self.enorm = self.submodules.enorm( config=self.config, hidden_size=self.config.hidden_size, eps=self.config.layernorm_epsilon, ) - self.hnorm = build_module( - self.submodules.hnorm, + self.hnorm = self.submodules.hnorm( config=self.config, hidden_size=self.config.hidden_size, eps=self.config.layernorm_epsilon, @@ -802,8 +803,7 @@ def __init__( is_mtp_layer=True, ) - self.final_layernorm = build_module( - self.submodules.layer_norm, + self.final_layernorm = self.submodules.layer_norm( config=self.config, hidden_size=self.config.hidden_size, eps=self.config.layernorm_epsilon, @@ -859,9 +859,9 @@ def _concat_embeddings(self, hidden_states: torch.Tensor, decoder_input: torch.T """ Concatenate the tokens before sending to transformer layer. """ - decoder_input = self.enorm(decoder_input) + decoder_input = apply_module(self.enorm)(decoder_input) decoder_input = make_viewless_tensor(inp=decoder_input, requires_grad=True, keep_graph=True) - hidden_states = self.hnorm(hidden_states) + hidden_states = apply_module(self.hnorm)(hidden_states) hidden_states = make_viewless_tensor(inp=hidden_states, requires_grad=True, keep_graph=True) # At the (k - 1)-th MTP module, concatenates the i-th token's hidden_states # and the (i + K)-th token's embedding, and combine them with linear projection. @@ -953,7 +953,7 @@ def _postprocess(self, hidden_states: torch.Tensor): """ # Layer norm before shared head layer. - hidden_states = self.final_layernorm(hidden_states) + hidden_states = apply_module(self.final_layernorm)(hidden_states) # TENorm produces a "viewed" tensor. This will result in schedule.py's # deallocate_output_tensor() throwing an error, so a viewless tensor is # created to prevent this. 
diff --git a/megatron/core/transformer/torch_norm.py b/megatron/core/transformer/torch_norm.py index d0ceca7af41..5948ae600f9 100644 --- a/megatron/core/transformer/torch_norm.py +++ b/megatron/core/transformer/torch_norm.py @@ -1,4 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +from typing import Protocol + import torch from megatron.core.jit import jit_fuser @@ -6,6 +8,22 @@ from megatron.core.utils import is_torch_min_version +class LayerNormInterface(Protocol): + """Interface that all LayerNorm implementations should follow.""" + + def forward(self, x: torch.Tensor, /) -> torch.Tensor: + """Forward method for a LayerNorm implementation.""" + ... + + +class LayerNormBuilder(Protocol): + """A protocol showing how Modules are expected to construct LayerNorms.""" + + def __call__( + self, *, config: TransformerConfig, hidden_size: int, eps: float + ) -> LayerNormInterface: ... + + class WrappedTorchNorm: """ A conditional wrapper to initialize an instance of PyTorch's @@ -22,7 +40,7 @@ def __new__( persist_layer_norm: bool = False, zero_centered_gamma: bool = False, normalization: str = "LayerNorm", - ): + ) -> LayerNormInterface: assert ( not config.layernorm_zero_centered_gamma ), f"zero_centered_gamma not supported by torch LayerNorm" @@ -51,7 +69,7 @@ def __new__( return norm_cls(normalized_shape=hidden_size, eps=eps) -class L2Norm(torch.nn.Module): +class L2Norm(torch.nn.Module, LayerNormInterface): """ Applies L2 normalization to the input tensor along the last dimension. @@ -70,7 +88,7 @@ def __init__(self, hidden_size: int, eps: float = 1e-6, **kwargs): self.eps = eps @jit_fuser - def _norm(self, x): + def _norm(self, x: torch.Tensor) -> torch.Tensor: """ Performs the actual L2 normalization. 
@@ -83,7 +101,7 @@ def _norm(self, x): x_float = x.float() return (x_float * torch.rsqrt(x_float.pow(2).mean(-1, keepdim=True) + self.eps)).type_as(x) - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: """ Forward pass of the L2Norm module. diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py index 73a724c3e91..7988763916c 100755 --- a/megatron/core/transformer/transformer_block.py +++ b/megatron/core/transformer/transformer_block.py @@ -2,7 +2,7 @@ import logging from contextlib import nullcontext from dataclasses import dataclass -from typing import List, Optional, Union +from typing import List, Optional, Union, cast import torch from torch import Tensor @@ -21,12 +21,14 @@ from megatron.core.transformer.enums import CudaGraphScope, LayerType from megatron.core.transformer.module import GraphableMegatronModule, MegatronModule from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.torch_norm import LayerNormBuilder from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.transformer_layer import ( BaseTransformerLayer, get_transformer_layer_offset, ) from megatron.core.transformer.utils import sharded_state_dict_default +from megatron.core.typed_torch import apply_module, not_none from megatron.core.utils import ( WrappedTensor, deprecate_inference_params, @@ -219,7 +221,7 @@ class TransformerBlockSubmodules: """ layer_specs: Optional[List[ModuleSpec]] = None - layer_norm: Optional[Union[ModuleSpec, torch.nn.Module]] = None + layer_norm: LayerNormBuilder | None = None def _get_block_submodules( @@ -375,8 +377,7 @@ def build_layer(layer_spec, layer_number): # In pipeline parallelism, we want to add this LN only to the last stage of the pipeline # self.post_process and self.post_layer_norm guide this behavior if self.has_final_layernorm_in_this_stage(): - self.final_layernorm = 
build_module( - self.submodules.layer_norm, + self.final_layernorm = not_none(self.submodules.layer_norm)( config=self.config, hidden_size=self.config.hidden_size, eps=self.config.layernorm_epsilon, @@ -778,7 +779,7 @@ def forward( # Final layer norm. if self.final_layernorm is not None: - hidden_states = self.final_layernorm(hidden_states) + hidden_states = apply_module(self.final_layernorm)(cast(Tensor, hidden_states)) # TENorm produces a "viewed" tensor. This will result in schedule.py's # deallocate_output_tensor() throwing an error, so a viewless tensor is # created to prevent this. diff --git a/megatron/core/transformer/transformer_layer.py b/megatron/core/transformer/transformer_layer.py index 855f8fe48ae..c9cf57a4eb0 100644 --- a/megatron/core/transformer/transformer_layer.py +++ b/megatron/core/transformer/transformer_layer.py @@ -1,11 +1,12 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +from __future__ import annotations import functools import logging import warnings from abc import ABC from dataclasses import dataclass, field -from typing import Any, Dict, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, Optional, Union import torch import torch.distributed @@ -22,7 +23,9 @@ from megatron.core.transformer.mlp import MLP from megatron.core.transformer.module import GraphableMegatronModule from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.torch_norm import LayerNormBuilder from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.typed_torch import apply_module from megatron.core.utils import ( deprecate_inference_params, get_pg_rank, @@ -33,6 +36,9 @@ nvtx_range_push, ) +if TYPE_CHECKING: + from megatron.core.inference.contexts import BaseInferenceContext + logger = logging.getLogger(__name__) @@ -203,16 +209,16 @@ class TransformerLayerSubmodules: of the layer's architecture. 
Args: - input_layernorm (Union[ModuleSpec, type]): Specification for the input layer normalization. + input_layernorm: Specification for the input layer normalization. self_attention (Union[ModuleSpec, type]): Specification for the self-attention mechanism. self_attn_bda (Union[ModuleSpec, type]): Specification for the bias-dropout-add operation after self-attention. - pre_cross_attn_layernorm (Union[ModuleSpec, type]): Specification for the layer + pre_cross_attn_layernorm: Specification for the layer normalization before cross-attention. cross_attention (Union[ModuleSpec, type]): Specification for the cross-attention mechanism. cross_attn_bda (Union[ModuleSpec, type]): Specification for the bias-dropout-add operation after cross-attention. - pre_mlp_layernorm (Union[ModuleSpec, type]): Specification for the layer normalization + pre_mlp_layernorm: Specification for the layer normalization before the MLP. mlp (Union[ModuleSpec, type]): Specification for the MLP in Dense layer. mlp_bda (Union[ModuleSpec, type]): Specification for the bias-dropout-add operation @@ -221,15 +227,15 @@ class TransformerLayerSubmodules: in the `sharded_state_dict` method. 
""" - input_layernorm: Union[ModuleSpec, type] = IdentityOp + input_layernorm: LayerNormBuilder = IdentityOp self_attention: Union[ModuleSpec, type] = IdentityOp self_attn_bda: Union[ModuleSpec, type] = IdentityFuncOp - pre_cross_attn_layernorm: Union[ModuleSpec, type] = IdentityOp + pre_cross_attn_layernorm: LayerNormBuilder = IdentityOp cross_attention: Union[ModuleSpec, type] = IdentityOp cross_attn_bda: Union[ModuleSpec, type] = IdentityFuncOp - pre_mlp_layernorm: Union[ModuleSpec, type] = IdentityOp + pre_mlp_layernorm: LayerNormBuilder = IdentityOp mlp: Union[ModuleSpec, type] = IdentityOp mlp_bda: Union[ModuleSpec, type] = IdentityFuncOp @@ -293,8 +299,7 @@ def __init__( # [Module 1: Input Layernorm] Optional Layernorm on the input data # TODO: add pytorch only layernorm - self.input_layernorm = build_module( - submodules.input_layernorm, + self.input_layernorm = submodules.input_layernorm( config=self.config, hidden_size=self.config.hidden_size, eps=self.config.layernorm_epsilon, @@ -321,8 +326,7 @@ def __init__( self.self_attn_bda = build_module(submodules.self_attn_bda) # [Module 4: Post SelfAttention] Optional Layernorm after self-attn - self.pre_cross_attn_layernorm = build_module( - submodules.pre_cross_attn_layernorm, + self.pre_cross_attn_layernorm = submodules.pre_cross_attn_layernorm( config=self.config, hidden_size=self.config.hidden_size, eps=self.config.layernorm_epsilon, @@ -340,8 +344,7 @@ def __init__( self.cross_attn_bda = build_module(submodules.cross_attn_bda, config=self.config) # [Module 7: Pre MLP] Optional Layernorm before MLP - self.pre_mlp_layernorm = build_module( - submodules.pre_mlp_layernorm, + self.pre_mlp_layernorm = submodules.pre_mlp_layernorm( config=self.config, hidden_size=self.config.hidden_size, eps=self.config.layernorm_epsilon, @@ -392,6 +395,7 @@ def __init__( self.recompute_pre_mlp_layernorm = False self.recompute_mlp = False if self.config.recompute_granularity == 'selective': + assert self.config.recompute_modules 
is not None if "layernorm" in self.config.recompute_modules: if not isinstance(self.input_layernorm, IdentityOp): self.recompute_input_layernorm = True @@ -537,7 +541,7 @@ def _forward_attention( rotary_pos_sin: Optional[Tensor] = None, rotary_pos_cos_sin: Optional[Tensor] = None, attention_bias: Optional[Tensor] = None, - inference_context: Optional[Any] = None, + inference_context: Optional[BaseInferenceContext] = None, packed_seq_params: Optional[PackedSeqParams] = None, sequence_len_offset: Optional[Tensor] = None, padding_mask: Optional[Tensor] = None, @@ -585,11 +589,11 @@ def _forward_attention( self.input_layernorm_checkpoint = tensor_parallel.CheckpointWithoutOutput() with off_interface(self.offload_attn_norm, hidden_states, "attn_norm") as hidden_states: input_layernorm_output = self.input_layernorm_checkpoint.checkpoint( - self.input_layernorm, hidden_states + apply_module(self.input_layernorm), hidden_states ) else: with off_interface(self.offload_attn_norm, hidden_states, "attn_norm") as hidden_states: - input_layernorm_output = self.input_layernorm(hidden_states) + input_layernorm_output = apply_module(self.input_layernorm)(hidden_states) using_fused_tp_inference_kernel = (not self.training) and ( self.config.inference_fuse_tp_communication @@ -649,7 +653,7 @@ def _forward_attention( residual = hidden_states # Optional Layer norm after self-attention - pre_cross_attn_layernorm_output = self.pre_cross_attn_layernorm(hidden_states) + pre_cross_attn_layernorm_output = apply_module(self.pre_cross_attn_layernorm)(hidden_states) # Cross attention. 
attention_output_with_bias = self.cross_attention( @@ -671,7 +675,7 @@ def _forward_attention( return hidden_states, context - def _forward_pre_mlp_layernorm(self, hidden_states): + def _forward_pre_mlp_layernorm(self, hidden_states: Tensor): from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( FineGrainedActivationOffloadingInterface as off_interface, ) @@ -680,15 +684,20 @@ def _forward_pre_mlp_layernorm(self, hidden_states): self.pre_mlp_norm_checkpoint = tensor_parallel.CheckpointWithoutOutput() with off_interface(self.offload_mlp_norm, hidden_states, "mlp_norm") as hidden_states: pre_mlp_layernorm_output = self.pre_mlp_norm_checkpoint.checkpoint( - self.pre_mlp_layernorm, hidden_states + apply_module(self.pre_mlp_layernorm), hidden_states ) else: with off_interface(self.offload_mlp_norm, hidden_states, "mlp_norm") as hidden_states: - pre_mlp_layernorm_output = self.pre_mlp_layernorm(hidden_states) + pre_mlp_layernorm_output = apply_module(self.pre_mlp_layernorm)(hidden_states) return pre_mlp_layernorm_output - def _forward_mlp(self, hidden_states, inference_context=None, padding_mask=None): + def _forward_mlp( + self, + hidden_states: Tensor, + inference_context: BaseInferenceContext | None = None, + padding_mask: Tensor | None = None, + ) -> Tensor | list[Tensor | None]: """ Perform a forward pass through the feed-forward layer. @@ -783,7 +792,9 @@ def _forward_mlp(self, hidden_states, inference_context=None, padding_mask=None) else: return self._forward_post_mlp(mlp_output_with_bias, residual) - def _forward_post_mlp(self, mlp_output_with_bias, residual): + def _forward_post_mlp( + self, mlp_output_with_bias: tuple[Tensor, Tensor | None], residual: Tensor + ) -> Tensor: """ Perform operations after the MLP computation. 
@@ -1118,7 +1129,7 @@ def _te_cuda_graph_replay(self, *args, **kwargs): residual = cuda_graph_output.pop() if not self.is_moe_layer: return residual, None, None, None - hidden_states = self.pre_mlp_layernorm(residual) + hidden_states = apply_module(self.pre_mlp_layernorm)(residual) shared_expert_output = self.mlp.shared_experts_compute(hidden_states) probs, routing_map = self.mlp.route(hidden_states) hidden_states, probs = self.mlp.preprocess(hidden_states, probs, routing_map) From 2f99ee87b9b04ec4f0a964e702c2b3afe900b92a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 7 Feb 2026 09:55:16 +0000 Subject: [PATCH 079/231] chore: Remove gpt_grpo_tp2tp1_pp4pp2_dp8_583m_throughputtest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .../env_config.yaml | 5 - .../golden_values_dev_dgx_h100.json | 187 ------------------ .../model_config.yaml | 90 --------- tests/test_utils/recipes/h100/gpt-grpo.yaml | 5 - 4 files changed, 287 deletions(-) delete mode 100644 tests/functional_tests/test_cases/gpt/gpt_grpo_tp2tp1_pp4pp2_dp8_583m_throughputtest/env_config.yaml delete mode 100644 tests/functional_tests/test_cases/gpt/gpt_grpo_tp2tp1_pp4pp2_dp8_583m_throughputtest/golden_values_dev_dgx_h100.json delete mode 100644 tests/functional_tests/test_cases/gpt/gpt_grpo_tp2tp1_pp4pp2_dp8_583m_throughputtest/model_config.yaml diff --git a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp2tp1_pp4pp2_dp8_583m_throughputtest/env_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp2tp1_pp4pp2_dp8_583m_throughputtest/env_config.yaml deleted file mode 100644 index 329246987bf..00000000000 --- a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp2tp1_pp4pp2_dp8_583m_throughputtest/env_config.yaml +++ /dev/null @@ -1,5 +0,0 @@ -- agent_type: examples.rl.environments.countdown.countdown_agent.CountdownAgent - agent_args: - dataset_file: 
"/mnt/artifacts/rl_environments/Jiayi-Pan___countdown-tasks-3to4" - split: "train" - weight: 1.0 diff --git a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp2tp1_pp4pp2_dp8_583m_throughputtest/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp2tp1_pp4pp2_dp8_583m_throughputtest/golden_values_dev_dgx_h100.json deleted file mode 100644 index 05bc35e362f..00000000000 --- a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp2tp1_pp4pp2_dp8_583m_throughputtest/golden_values_dev_dgx_h100.json +++ /dev/null @@ -1,187 +0,0 @@ -{ - "lm loss": { - "start_step": 1, - "end_step": 30, - "step_interval": 1, - "values": { - "1": 0.0, - "2": 0.0, - "3": 0.0, - "4": 0.04559, - "5": 0.0, - "6": 0.0523, - "7": 0.0, - "8": 0.0, - "9": 0.04887, - "10": 0.0, - "11": 0.0, - "12": 0.0, - "13": 0.0, - "14": 0.0, - "15": 0.0, - "16": 0.0, - "17": 0.04299, - "18": 0.0, - "19": 0.03797, - "20": 0.0, - "21": 0.0, - "22": 0.0, - "23": 0.0, - "24": 0.0, - "25": 0.0, - "26": 0.0, - "27": 0.0, - "28": 0.0, - "29": 0.0, - "30": 0.0 - } - }, - "num-zeros": { - "start_step": 1, - "end_step": 30, - "step_interval": 1, - "values": { - "1": 583687296.0, - "2": 583687296.0, - "3": 583687296.0, - "4": 31.0, - "5": 583687296.0, - "6": 12.0, - "7": 583687296.0, - "8": 583687296.0, - "9": 16.0, - "10": 583687296.0, - "11": 583687296.0, - "12": 583687296.0, - "13": 583687296.0, - "14": 583687296.0, - "15": 583687296.0, - "16": 583687296.0, - "17": 47.0, - "18": 583687296.0, - "19": 43.0, - "20": 583687296.0, - "21": 583687296.0, - "22": 583687296.0, - "23": 583687296.0, - "24": 583687296.0, - "25": 583687296.0, - "26": 583687296.0, - "27": 583687296.0, - "28": 583687296.0, - "29": 583687296.0, - "30": 583687296.0 - } - }, - "mem-allocated-bytes": { - "start_step": 1, - "end_step": 30, - "step_interval": 1, - "values": { - "1": 48985034752.0, - "2": 48991363072.0, - "3": 48993005568.0, - "4": 48991928320.0, - "5": 48992874496.0, - "6": 48991891456.0, - "7": 48991338496.0, - 
"8": 48993873920.0, - "9": 48993124352.0, - "10": 48994115584.0, - "11": 48994050048.0, - "12": 48993181696.0, - "13": 48993918976.0, - "14": 48992014336.0, - "15": 48992256000.0, - "16": 48989933568.0, - "17": 48992645120.0, - "18": 48992890880.0, - "19": 48992821248.0, - "20": 48992821248.0, - "21": 48991612928.0, - "22": 48993181696.0, - "23": 48992821248.0, - "24": 48992821248.0, - "25": 48993931264.0, - "26": 48992022528.0, - "27": 48993173504.0, - "28": 48992821248.0, - "29": 48993935360.0, - "30": 48994017280.0 - } - }, - "mem-max-allocated-bytes": { - "start_step": 1, - "end_step": 30, - "step_interval": 1, - "values": { - "1": 49104257024.0, - "2": 49953497088.0, - "3": 49955368960.0, - "4": 49955368960.0, - "5": 49955368960.0, - "6": 49955368960.0, - "7": 49955368960.0, - "8": 49955745792.0, - "9": 49955745792.0, - "10": 49957498880.0, - "11": 49957838848.0, - "12": 49957838848.0, - "13": 49957838848.0, - "14": 49957838848.0, - "15": 49957838848.0, - "16": 49957838848.0, - "17": 49957838848.0, - "18": 49957838848.0, - "19": 49957838848.0, - "20": 49957838848.0, - "21": 49957838848.0, - "22": 49957838848.0, - "23": 49957838848.0, - "24": 49957838848.0, - "25": 49957838848.0, - "26": 49957838848.0, - "27": 49957838848.0, - "28": 49957838848.0, - "29": 49957838848.0, - "30": 49957838848.0 - } - }, - "iteration-time": { - "start_step": 1, - "end_step": 30, - "step_interval": 1, - "values": { - "1": "nan", - "2": 54.85374, - "3": 4.04314, - "4": 3.83505, - "5": 4.00853, - "6": 3.71939, - "7": 3.66436, - "8": 4.07479, - "9": 3.90049, - "10": 4.34491, - "11": 3.98659, - "12": 3.90765, - "13": 4.12679, - "14": 3.75558, - "15": 3.72381, - "16": 3.45749, - "17": 3.73387, - "18": 3.71406, - "19": 3.75517, - "20": 3.94287, - "21": 3.88534, - "22": 3.86744, - "23": 3.87809, - "24": 3.86352, - "25": 3.87829, - "26": 3.76391, - "27": 3.76762, - "28": 3.96514, - "29": 3.92952, - "30": 3.87378 - } - } -} \ No newline at end of file diff --git 
a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp2tp1_pp4pp2_dp8_583m_throughputtest/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp2tp1_pp4pp2_dp8_583m_throughputtest/model_config.yaml deleted file mode 100644 index b5788d64049..00000000000 --- a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp2tp1_pp4pp2_dp8_583m_throughputtest/model_config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -ENV_VARS: - CUDA_DEVICE_MAX_CONNECTIONS: 1 - THROUGHPUT_START_STEP: 1 - NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 - NCCL_ALGO: Ring - CUBLAS_WORKSPACE_CONFIG: :4096:8 -TEST_TYPE: frozen-start -MODE: rl -MODEL_ARGS: - --tiktoken-pattern: v2 - --use-mcore-models: true - --tokenizer-type: TikTokenizer - --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json - --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/ - --auto-detect-ckpt-format: true - --max-tokens-to-oom: 3600000 - --inference-max-seq-length: 1024 - --attention-backend: flash - --mock-data: true - --micro-batch-size: 1 - --no-load-optim: true - --no-use-tokenizer-model-from-checkpoint-args: true - --timing-log-level: 0 - --distributed-backend: nccl - --log-interval: 1 - --log-progress: true - --transformer-impl: transformer_engine - --tensor-model-parallel-size: 2 - --pipeline-model-parallel-size: 4 - --ckpt-format: torch_dist - --bf16: true - --log-memory-to-tensorboard: true - --log-num-zeros-in-grad: true - --log-validation-ppl-to-tensorboard: true - --log-timers-to-tensorboard: true - --num-layers: 24 - --hidden-size: 1152 - --num-attention-heads: 16 - --max-position-embeddings: 1024 - --seq-length: 1024 - --timing-log-option: minmax - --log-throughput: true - --no-create-attention-mask-in-dataloader: true - --straggler-minmax-count: 16 - --tensorboard-log-interval: 1 - --empty-unused-memory-level: 2 - --langrl-inference-server-type: inplace_megatron - --seed: 42 - --calculate-per-token-loss: true - 
--rl-use-sequence-packing: true - --rl-sequence-packing-algo: fifo - --rl-offload-optimizer-during-inference: true - --timing-log-level: 1 - --log-timers-to-tensorboard: true - --cuda-graph-impl: local - --micro-batch-size: 1 - --global-batch-size: 16 - --grpo-group-size: 2 - --grpo-prompts-per-step: 8 - --grpo-iterations: 1 - --grpo-clamp-eps-lower: 0.2 - --grpo-clamp-eps-upper: 0.2 - --grpo-kl-beta: 0.0 - --grpo-entropy-term-weight: 0.0 - --langrl-env-config: tests/functional_tests/test_cases/gpt/gpt_grpo_tp1tp2_pp1_dp8_583m_throughputtest/env_config.yaml - --rl-partial-rollouts: true - --lr: 0.000001 - --lr-warmup-samples: 0 - --clip-grad: 1.0 - --use-checkpoint-args: true - --dist-ckpt-strictness: log_unexpected - --perform-rl-step: true - --train-samples: 48828125 - --exit-interval: 30 - --tensorboard-dir: ${TENSORBOARD_PATH} - --save-interval: 1000000 - --eval-interval: 1000000 - --finetune: true - --inference-logging-step-interval: 1 - --rl-inference-tensor-model-parallel-size: 1 - --rl-inference-pipeline-model-parallel-size: 2 - --refit-method: gloo - --deterministic-mode: true -METRICS: - - "iteration-time" - - "lm loss" - - "num-zeros" - - "mem-allocated-bytes" - - "mem-max-allocated-bytes" - diff --git a/tests/test_utils/recipes/h100/gpt-grpo.yaml b/tests/test_utils/recipes/h100/gpt-grpo.yaml index cd51c07600f..b4b7a89b479 100644 --- a/tests/test_utils/recipes/h100/gpt-grpo.yaml +++ b/tests/test_utils/recipes/h100/gpt-grpo.yaml @@ -79,8 +79,3 @@ products: - environment: [dev] scope: [mr-github-broken] platforms: [dgx_h100] - - test_case: [gpt_grpo_tp2tp1_pp4pp2_dp8_583m_throughputtest] - products: - - environment: [dev] - scope: [mr] - platforms: [dgx_h100] From e3ae6e492b79183760b002e3a27ccbd2f424745f Mon Sep 17 00:00:00 2001 From: wdykas <73254672+wdykas@users.noreply.github.com> Date: Sat, 7 Feb 2026 09:26:48 -0500 Subject: [PATCH 080/231] Non colocated refit (#3213) Co-authored-by: root Co-authored-by: root Co-authored-by: root Co-authored-by: root 
Co-authored-by: William Dykas Co-authored-by: root Co-authored-by: root Co-authored-by: root Co-authored-by: root Co-authored-by: root Co-authored-by: root Co-authored-by: root Co-authored-by: root Co-authored-by: root Co-authored-by: root Co-authored-by: root Co-authored-by: root --- examples/rl/benchmark_refit.py | 345 ++++++++++++++++++ examples/rl/environments/math/math_agent.py | 2 +- .../core/extensions/transformer_engine.py | 2 + .../copy_services/nvshmem_copy_service.py | 9 +- megatron/core/resharding/execution.py | 19 +- .../planning/communication_scheduler.py | 98 ++++- .../nvshmem_copy_service/service.py | 20 + megatron/core/resharding/planner.py | 104 ++++-- megatron/core/resharding/refit.py | 181 +++++++-- megatron/core/resharding/utils.py | 128 ++++++- megatron/rl/parallel_utils.py | 15 +- megatron/rl/rl_utils.py | 29 +- megatron/training/training.py | 9 +- .../test_communication_scheduler.py | 210 +++++++++++ .../resharding/test_dp_balancing.py | 344 +++++++++++++++++ .../unit_tests/resharding/test_model_swap.py | 5 +- .../resharding/test_task_segmenter.py | 185 ++++++++++ .../resharding/test_workload_packer.py | 128 +++++++ 18 files changed, 1710 insertions(+), 123 deletions(-) create mode 100644 examples/rl/benchmark_refit.py create mode 100644 tests/unit_tests/resharding/test_communication_scheduler.py create mode 100644 tests/unit_tests/resharding/test_dp_balancing.py create mode 100644 tests/unit_tests/resharding/test_task_segmenter.py create mode 100644 tests/unit_tests/resharding/test_workload_packer.py diff --git a/examples/rl/benchmark_refit.py b/examples/rl/benchmark_refit.py new file mode 100644 index 00000000000..645cd9e2c15 --- /dev/null +++ b/examples/rl/benchmark_refit.py @@ -0,0 +1,345 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +""" +Benchmark script for model refit performance. + +Measures the time to transfer model weights between different parallelism configurations. 
+Supports both collocated (models share GPUs) and non-collocated (separate GPU sets) modes. +""" +import time + +import torch + +from megatron.core.resharding.refit import swap_model_weights +from megatron.training import get_args, get_model as get_training_model, print_rank_0 +from megatron.training.initialize import initialize_megatron +from megatron.training.arguments import core_transformer_config_from_args +from megatron.rl.parallel_utils import build_inference_pg_collection +from gpt_builders import gpt_builder +from megatron.core.resharding.copy_services.nvshmem_copy_service import NVSHMEMCopyService +from megatron.core.resharding.copy_services.nccl_copy_service import NCCLCopyService +from megatron.core.resharding.copy_services.gloo_copy_service import GlooCopyService + + +def add_benchmark_args(parser): + """Add benchmark-specific arguments.""" + group = parser.add_argument_group(title='refit benchmark') + + group.add_argument( + '--refit-mode', + type=str, + required=True, + choices=['collocated', 'non-collocated'], + help='Collocated: both models share GPUs. Non-collocated: separate GPU sets.' + ) + group.add_argument( + '--num-benchmark-warmup', + type=int, + default=2, + help='Number of warmup iterations (first builds refit plan).' + ) + group.add_argument( + '--num-benchmark-iterations', + type=int, + default=10, + help='Number of timed benchmark iterations.' 
+ ) + + return parser + + +def model_provider(pre_process=True, post_process=True, parallel_output=False, + pg_collection=None, config=None): + """Build the model.""" + args = get_args() + if config is None: + config = core_transformer_config_from_args(args) + + return gpt_builder( + args=args, + pre_process=pre_process, + post_process=post_process, + config=config, + pg_collection=pg_collection, + ) + + +def create_refit_service(method): + """Create and return a refit service instance.""" + if method == 'nvshmem': + return NVSHMEMCopyService() + elif method == 'nccl': + return NCCLCopyService() + elif method == 'gloo': + return GlooCopyService() + else: + return method + + +def print_config_summary(args, src_config, dst_config, world_size, mode): + """Print benchmark configuration.""" + print_rank_0(f"\n{'='*80}") + print_rank_0(f"REFIT BENCHMARK - {mode.upper()} MODE") + print_rank_0(f"{'='*80}") + print_rank_0(f"World size: {world_size}") + print_rank_0(f"Source: TP={src_config['tp']}, PP={src_config['pp']}, EP={src_config['ep']}, DP={src_config['dp']}") + print_rank_0(f"Destination: TP={dst_config['tp']}, PP={dst_config['pp']}, EP={dst_config['ep']}, DP={dst_config['dp']}") + print_rank_0(f"Model: {args.num_layers}L, {args.hidden_size}H, {args.num_attention_heads} heads, vocab={args.vocab_size}") + if args.num_experts: + print_rank_0(f"MoE: {args.num_experts} experts, top-{args.moe_router_topk}") + print_rank_0(f"Backend: {args.refit_method}") + print_rank_0(f"{'='*80}\n") + + +def run_benchmark(src_model, dst_model, refit_service, num_warmup, num_iterations): + """Run warmup and benchmark iterations, return timings.""" + rank = torch.distributed.get_rank() + + # Warmup (builds refit plan on first iteration) + print_rank_0(f"Warmup: {num_warmup} iterations...") + for i in range(num_warmup): + torch.cuda.synchronize() + torch.distributed.barrier() + swap_model_weights(src_model, dst_model, refit_method=refit_service) + torch.cuda.synchronize() + 
torch.distributed.barrier() + + print_rank_0("Warmup complete. Starting benchmark...\n") + + # Benchmark iterations + print_rank_0(f"Benchmark: {num_iterations} iterations...") + timings = [] + + for i in range(num_iterations): + torch.cuda.synchronize() + torch.distributed.barrier() + + start_time = time.perf_counter() + swap_model_weights(src_model, dst_model, refit_method=refit_service) + torch.cuda.synchronize() + end_time = time.perf_counter() + + elapsed = end_time - start_time + timings.append(elapsed) + torch.distributed.barrier() + + return timings + + +def print_results(timings): + """Print benchmark results.""" + if torch.distributed.get_rank() == 0: + mean_time = sum(timings) / len(timings) + min_time = min(timings) + max_time = max(timings) + + print(f"\n{'='*80}") + print("RESULTS") + print(f"{'='*80}") + print(f"Mean: {mean_time*1000:.2f} ms") + print(f"Min: {min_time*1000:.2f} ms") + print(f"Max: {max_time*1000:.2f} ms") + print(f"{'='*80}\n") + + +def benchmark_collocated(): + """Benchmark refit in collocated mode (both models on same GPUs).""" + args = get_args() + world_size = torch.distributed.get_world_size() + + # Calculate parallelism + src_tp = args.tensor_model_parallel_size + src_pp = args.pipeline_model_parallel_size + src_ep = args.expert_model_parallel_size + src_world = src_tp * src_pp * src_ep + src_dp = world_size // src_world + + dst_tp = args.rl_inference_tensor_model_parallel_size or src_tp + dst_pp = args.rl_inference_pipeline_model_parallel_size or src_pp + dst_ep = args.rl_inference_expert_model_parallel_size or src_ep + dst_world = dst_tp * dst_pp * dst_ep + dst_dp = world_size // dst_world + + # Print config + src_config = {'tp': src_tp, 'pp': src_pp, 'ep': src_ep, 'dp': src_dp} + dst_config = {'tp': dst_tp, 'pp': dst_pp, 'ep': dst_ep, 'dp': dst_dp} + print_config_summary(args, src_config, dst_config, world_size, 'collocated') + + # Build source model + print_rank_0("Building source model...") + src_model = 
get_training_model( + lambda pre_process, post_process, **kwargs: model_provider( + pre_process=pre_process, post_process=post_process, parallel_output=False + ), + wrap_with_ddp=False + ) + src_model[0] = src_model[0].cuda() + + # Build destination model with custom parallelism + print_rank_0("Building destination model...") + dst_pg_collection = build_inference_pg_collection( + world_size, + tp_size=dst_tp, + pp_size=dst_pp, + ep_size=dst_ep, + expt_tp_size=args.rl_inference_expert_tensor_model_parallel_size, + use_tp_pp_dp_mapping=args.use_tp_pp_dp_mapping, + ) + + dst_config = core_transformer_config_from_args(args) + if args.num_experts: + dst_config.expert_model_parallel_size = dst_ep + dst_config.tensor_model_parallel_size = dst_tp + if args.rl_inference_expert_tensor_model_parallel_size: + dst_config.expert_tensor_parallel_size = args.rl_inference_expert_tensor_model_parallel_size + + dst_model = get_training_model( + lambda pre_process, post_process, **kwargs: model_provider( + pre_process=pre_process, post_process=post_process, + pg_collection=dst_pg_collection, config=dst_config + ), + wrap_with_ddp=False + ) + dst_model[0] = dst_model[0].cuda() + + torch.distributed.barrier() + + # Create refit service + print_rank_0(f"Creating {args.refit_method} service...") + refit_service = create_refit_service(args.refit_method) + print_rank_0("Service created.\n") + + # Run benchmark + timings = run_benchmark(src_model, dst_model, refit_service, + args.num_benchmark_warmup, args.num_benchmark_iterations) + + # Print results + print_results(timings) + + +def benchmark_non_collocated(): + """Benchmark refit in non-collocated mode (separate GPU sets).""" + args = get_args() + rank = torch.distributed.get_rank() + world_size = torch.distributed.get_world_size() + + # Calculate parallelism + src_tp = args.tensor_model_parallel_size + src_pp = args.pipeline_model_parallel_size + src_ep = args.expert_model_parallel_size + src_world = src_tp * src_pp * src_ep + + dst_tp = 
args.rl_inference_tensor_model_parallel_size or src_tp + dst_pp = args.rl_inference_pipeline_model_parallel_size or src_pp + dst_ep = args.rl_inference_expert_model_parallel_size or src_ep + dst_world = dst_tp * dst_pp * dst_ep + + required_size = src_world + dst_world + if world_size < required_size: + raise ValueError(f"Non-collocated requires {required_size} GPUs, got {world_size}") + + # Determine rank roles + is_src_rank = rank < src_world + is_dst_rank = src_world <= rank < required_size + is_idle_rank = rank >= required_size + + # Print config + src_config = {'tp': src_tp, 'pp': src_pp, 'ep': src_ep, 'dp': 1} + dst_config = {'tp': dst_tp, 'pp': dst_pp, 'ep': dst_ep, 'dp': 1} + print_config_summary(args, src_config, dst_config, world_size, 'non-collocated') + if world_size > required_size: + print_rank_0(f"Note: Ranks {required_size}-{world_size-1} are idle\n") + + # Create destination process groups (all ranks participate) + print_rank_0("Creating process groups...") + dst_pg_collection = build_inference_pg_collection( + world_size=dst_world, + tp_size=dst_tp, + pp_size=dst_pp, + ep_size=dst_ep, + expt_tp_size=args.rl_inference_expert_tensor_model_parallel_size, + use_tp_pp_dp_mapping=args.use_tp_pp_dp_mapping, + rank_offset=src_world, + ) + torch.distributed.barrier() + + # Idle ranks participate in collectives but have no models + if is_idle_rank: + src_model = None + dst_model = None + elif is_src_rank: + # Build source model + print_rank_0("Building source model...") + src_model = get_training_model( + lambda pre_process, post_process, **kwargs: model_provider( + pre_process=pre_process, post_process=post_process, parallel_output=False + ), + wrap_with_ddp=False + ) + src_model[0] = src_model[0].cuda() + dst_model = None + else: # is_dst_rank + # Build destination model + print_rank_0("Building destination model...") + dst_config = core_transformer_config_from_args(args) + if args.num_experts: + dst_config.expert_model_parallel_size = dst_ep + 
dst_config.tensor_model_parallel_size = dst_tp + if args.rl_inference_expert_tensor_model_parallel_size: + dst_config.expert_tensor_parallel_size = args.rl_inference_expert_tensor_model_parallel_size + + dst_model = get_training_model( + lambda pre_process, post_process, **kwargs: model_provider( + pre_process=pre_process, post_process=post_process, + pg_collection=dst_pg_collection, config=dst_config + ), + wrap_with_ddp=False + ) + dst_model[0] = dst_model[0].cuda() + src_model = None + + torch.distributed.barrier() + + # Create refit service + print_rank_0(f"Creating {args.refit_method} service...") + refit_service = create_refit_service(args.refit_method) + print_rank_0("Service created.\n") + + # Run benchmark + timings = run_benchmark(src_model, dst_model, refit_service, + args.num_benchmark_warmup, args.num_benchmark_iterations) + + # Print results + print_results(timings) + + +def main(): + """Main benchmark function.""" + initialize_megatron( + extra_args_provider=add_benchmark_args, + args_defaults={ + 'tokenizer_type': 'NullTokenizer', + 'no_load_optim': True, + 'no_load_rng': True, + 'no_save_optim': True, + 'no_save_rng': True, + }, + ignore_unknown_args=False, + ) + + args = get_args() + + # Set default vocab size if not provided + if args.vocab_size is None: + args.vocab_size = 50257 + print_rank_0("Using default vocab_size=50257") + + # Run benchmark + if args.refit_mode == 'collocated': + benchmark_collocated() + else: + benchmark_non_collocated() + + +if __name__ == "__main__": + main() diff --git a/examples/rl/environments/math/math_agent.py b/examples/rl/environments/math/math_agent.py index 67feb3b4adb..bdf322561eb 100644 --- a/examples/rl/environments/math/math_agent.py +++ b/examples/rl/environments/math/math_agent.py @@ -61,7 +61,7 @@ def compute_score(self, response: str, golden: dict, golden_key: str = "answer") """ # Allow tags or \boxed{} tags (this is a bit of cheating in favor of deepseek distilled models I think) matched_format = None 
- end_tokens = ["<|end_of_text|>", "<|endoftext|>", ""] + end_tokens = ["<|end_of_text|>", "<|endoftext|>", "", "<|eot_id|>"] # Only an answer immediately followed by a known end token yields 1.0 reward. answer_tag_pattern = r'(.*?)' diff --git a/megatron/core/extensions/transformer_engine.py b/megatron/core/extensions/transformer_engine.py index 81664a91c0b..555b60b45d0 100644 --- a/megatron/core/extensions/transformer_engine.py +++ b/megatron/core/extensions/transformer_engine.py @@ -653,6 +653,8 @@ def __init__( # Reduce the gradient further on the TP group since the weight is # duplicated across TP ranks setattr(param, "sequence_parallel", self.config.sequence_parallel) + # Mark as NOT tensor parallel since weight is duplicated + setattr(param, "tensor_model_parallel", False) tp_group = get_tensor_model_parallel_group_if_none(tp_group, is_expert=is_expert) self._tp_group = tp_group diff --git a/megatron/core/resharding/copy_services/nvshmem_copy_service.py b/megatron/core/resharding/copy_services/nvshmem_copy_service.py index 8d231de5339..14d83a1864e 100644 --- a/megatron/core/resharding/copy_services/nvshmem_copy_service.py +++ b/megatron/core/resharding/copy_services/nvshmem_copy_service.py @@ -162,10 +162,11 @@ def run(self): self._local_recv_ops.clear() # 2) Execute remote schedule (if any remote sends/recvs were registered). - if not self._remote.send_requests and not self._remote.receive_requests: - logger.info("NVSHMEMCopyService: no remote requests; local copies complete") - return - + # NOTE: ALL ranks must call schedule() and run() because they contain collective + # operations that require all ranks to participate: + # - schedule() has dist.all_gather_object() (torch distributed collective) + # - run() has nvshmem.core.barrier_all() (nvshmem collective) + # This is critical for non-collocated refit where some ranks may have no work. 
logger.info("NVSHMEMCopyService: building NVSHMEM schedule and executing") self._remote.schedule() self._remote.run() diff --git a/megatron/core/resharding/execution.py b/megatron/core/resharding/execution.py index 6a7779406d0..7c7c4700754 100644 --- a/megatron/core/resharding/execution.py +++ b/megatron/core/resharding/execution.py @@ -23,14 +23,25 @@ def execute_reshard_plan( Execute a reshard plan (from centralized controller). A communication service must be provided to abstract transport. Expected service API: submit_send(tensor, dest_rank), submit_recv(tensor, src_rank), run(). + + Supports None for src_module and/or dst_module to allow ranks in non-collocated mode: + - src_module=None: Rank only receives data (destination-only) + - dst_module=None: Rank only sends data (source-only) + - Both provided: Rank participates in both send and recv (collocated mode) """ - src_params = {name: p for name, p in src_module.named_parameters(recurse=True)} - dst_params = {name: p for name, p in dst_module.named_parameters(recurse=True)} + # Extract parameters from models if present + src_params = {} + dst_params = {} + if src_module is not None: + src_params = {name: p for name, p in src_module.named_parameters(recurse=True)} + if dst_module is not None: + dst_params = {name: p for name, p in dst_module.named_parameters(recurse=True)} + submit_send_with_id = getattr(service, "submit_send_with_id", None) submit_recv_with_id = getattr(service, "submit_recv_with_id", None) - # Submit sends + # Submit sends (only if we have source model) for op in plan.send_ops: src_param = src_params.get(op.param_name) if src_param is not None: @@ -40,7 +51,7 @@ def execute_reshard_plan( else: service.submit_send(src_view, op.peer_rank) - # Submit recvs + # Submit recvs (only if we have destination model) recv_writebacks: List[Tuple[torch.Tensor, torch.nn.Parameter, tuple[slice, ...]]] = [] for op in plan.recv_ops: dst_param = dst_params.get(op.param_name) diff --git 
a/megatron/core/resharding/nvshmem_copy_service/planning/communication_scheduler.py b/megatron/core/resharding/nvshmem_copy_service/planning/communication_scheduler.py index 0f299a84e40..842f2b9d300 100644 --- a/megatron/core/resharding/nvshmem_copy_service/planning/communication_scheduler.py +++ b/megatron/core/resharding/nvshmem_copy_service/planning/communication_scheduler.py @@ -10,6 +10,7 @@ class CommunicationScheduler: """ Builds a conflict-free, iteration-based schedule for communication. Ensures that in any given iteration, a PE is not overloaded. + Uses greedy first-fit scheduling algorithm. """ def __init__(self): @@ -34,8 +35,8 @@ def build_schedule( all_batches = self._collect_all_batches(workloads, my_pe, n_pes) PELogger.debug(f"Collected {len(all_batches)} total batches globally") - # Step 2: Assign batches to iterations using conflict-free algorithm - PELogger.debug("Assigning batches to iterations...") + # Step 2: Assign batches to iterations using greedy conflict-free algorithm + PELogger.debug("Assigning batches to iterations using greedy conflict-free algorithm...") self._assign_iterations(all_batches) PELogger.info(f"Schedule built: {self.num_iterations} iterations") @@ -101,32 +102,89 @@ def _collect_all_batches( return global_batches def _assign_iterations(self, batches: List[ScheduledBatch]): + """ + Greedy first-fit scheduling algorithm. + + Assigns batches to iterations using simple greedy first-fit. + Processes batches in sorted order and assigns each to the first + available iteration with no conflicts. 
+ """ self.num_iterations = 0 - batches.sort(key=lambda x: (x.src_pe, x.dest_pe, x.batch_index)) + + # Calculate degree (conflict count) for each batch + def calc_degree(batch: ScheduledBatch, all_batches: List[ScheduledBatch]) -> int: + """Count how many other batches conflict with this batch.""" + conflicts = 0 + batch_pes = {batch.src_pe, batch.dest_pe} + for other in all_batches: + if other is batch: + continue + other_pes = {other.src_pe, other.dest_pe} + # Conflict if they share any PE + if batch_pes & other_pes: + conflicts += 1 + return conflicts + + def has_conflict(batch: ScheduledBatch, iteration_state: Dict) -> bool: + """ + Check if a batch conflicts with an iteration's current PE usage. + + A batch conflicts if either its source or destination PE is already + being used (as sender or receiver) in the iteration. + + Args: + batch: The batch to check + iteration_state: Dict with 'src_pes' and 'dst_pes' sets + + Returns: + True if there's a conflict, False if the batch can be scheduled + """ + return ( + batch.src_pe in iteration_state['src_pes'] + or batch.src_pe in iteration_state['dst_pes'] + or batch.dest_pe in iteration_state['src_pes'] + or batch.dest_pe in iteration_state['dst_pes'] + ) + + # Sort batches: process batches with more potential conflicts first + # This heuristic (largest-degree-first) often produces better colorings + # Sort by degree (descending), then total_size (descending) for tie-breaking + batches.sort(key=lambda b: (-calc_degree(b, batches), -b.total_size)) + + # Track which PEs are busy (sending or receiving) in each iteration + # iteration -> {src_pes: set, dst_pes: set} + iteration_usage = [] for batch in batches: - iteration = 0 + # Find first iteration where this batch fits (no conflicts) assigned = False - while not assigned: - if not self._has_conflict(batch, iteration, batches): - batch.iteration = iteration - self.num_iterations = max(self.num_iterations, iteration + 1) + for iter_idx in range(len(iteration_usage)): + 
if not has_conflict(batch, iteration_usage[iter_idx]): + # No conflict - assign to this iteration + batch.iteration = iter_idx + iteration_usage[iter_idx]['src_pes'].add(batch.src_pe) + iteration_usage[iter_idx]['dst_pes'].add(batch.dest_pe) assigned = True PELogger.debug( f" Assigned batch ({batch.src_pe} → {batch.dest_pe}, " - f"idx={batch.batch_index}) to iteration {iteration}" + f"idx={batch.batch_index}) to iteration {iter_idx}" ) - else: - iteration += 1 - - def _has_conflict( - self, batch: ScheduledBatch, iteration: int, all_batches: List[ScheduledBatch] - ) -> bool: - for other in all_batches: - if other.iteration == iteration and other is not batch: - if other.src_pe == batch.src_pe or other.dest_pe == batch.dest_pe: - return True - return False + break + + if not assigned: + # Need a new iteration + new_iter = len(iteration_usage) + batch.iteration = new_iter + iteration_usage.append({'src_pes': {batch.src_pe}, 'dst_pes': {batch.dest_pe}}) + PELogger.debug( + f" Assigned batch ({batch.src_pe} → {batch.dest_pe}, " + f"idx={batch.batch_index}) to NEW iteration {new_iter}" + ) + + self.num_iterations = len(iteration_usage) + PELogger.info( + f"Greedy scheduling: {len(batches)} batches → {self.num_iterations} iterations" + ) def _exchange_workload_summaries( self, workloads: Dict[int, List[WorkloadGroup]], my_pe: int, n_pes: int diff --git a/megatron/core/resharding/nvshmem_copy_service/service.py b/megatron/core/resharding/nvshmem_copy_service/service.py index 631e63ae41b..c538fd64f5b 100644 --- a/megatron/core/resharding/nvshmem_copy_service/service.py +++ b/megatron/core/resharding/nvshmem_copy_service/service.py @@ -99,8 +99,19 @@ def init(self, log_level: str = "INFO") -> None: PELogger.init(self.my_pe, level=log_level) PELogger.info(f"Initializing RemoteCopyService on PE {self.my_pe}/{self.n_pes}") + # Barrier to ensure ALL PEs finish NVSHMEM init before ANY PE starts buffer allocation + # buffer_manager.allocate() calls bytetensor() which is a 
collective operation + # Without this barrier, early PEs call bytetensor() while late PEs + # are still in init() -> deadlock + nvshmem.core.barrier_all(stream=self.gpu_resources.send_stream) + self.gpu_resources.send_stream.sync() # Ensure barrier completes on CPU + # Allocate double-buffered send/recv slots self.buffer_manager.allocate() + + # Barrier to ensure all PEs complete buffer allocation before proceeding + nvshmem.core.barrier_all(stream=self.gpu_resources.send_stream) + PELogger.debug("Allocated double-buffered send/recv slots") # Load CUDA kernels @@ -128,6 +139,15 @@ def init(self, log_level: str = "INFO") -> None: self.gpu_resources.torch_unpack_stream, self.gpu_resources.torch_copy_stream, ) + + # Synchronize all NVSHMEM streams before returning + # This ensures all barrier operations complete and streams are idle + # Without this, subsequent torch.cuda.synchronize() may hang waiting for pending work + self.gpu_resources.send_stream.sync() + self.gpu_resources.pack_stream.sync() + self.gpu_resources.unpack_stream.sync() + self.gpu_resources.copy_stream.sync() + PELogger.info("Initialization complete") def register_send( diff --git a/megatron/core/resharding/planner.py b/megatron/core/resharding/planner.py index 31045fbfc01..f9d8752a26b 100644 --- a/megatron/core/resharding/planner.py +++ b/megatron/core/resharding/planner.py @@ -90,6 +90,7 @@ def _plan_multi_dim_lcm( d = descriptors[0] if my_global_rank not in d.dst_dim_ranks: return [] + src_shape = tuple(src_metadata.shape) dst_shape = tuple(dst_metadata.shape) dim = d.dim @@ -174,8 +175,8 @@ def _finalize_dp_transfers( dst_shape = dst_metadata.shape - # Same DP layout - local copy - if src_dp_ranks == dst_dp_ranks: + # Same DP layout - local copy (only if this rank has the source parameter) + if src_dp_ranks == dst_dp_ranks and my_global_rank in src_dp_ranks: full_slice = tuple(slice(None) for _ in range(len(dst_shape))) return [(my_global_rank, full_slice, full_slice)] @@ -183,7 +184,17 @@ def 
_finalize_dp_transfers( # better load balancing across source ranks. This ensures that destination # ranks are distributed across source ranks even when they have the same # position within their respective DP groups. + # + # In non-collocated mode, src_dp_ranks might include ranks that don't + # have the source model (e.g., idle ranks or destination ranks). Filter to only + # include the rank that provided this metadata (src_metadata.owner_rank). + # src_metadata was selected by select_src_metadata_balanced, so owner_rank is the + # actual source rank for this parameter. + actual_src_rank = src_metadata.owner_rank src_global_rank = src_dp_ranks[my_global_rank % len(src_dp_ranks)] + # Override with the actual source rank if the selected rank doesn't have the parameter + if src_global_rank != actual_src_rank: + src_global_rank = actual_src_rank full_slice = tuple(slice(None) for _ in range(len(dst_shape))) return [(src_global_rank, full_slice, full_slice)] @@ -215,46 +226,63 @@ def build_centralized_reshard_plan( ) -> ReshardPlan: """ Centralized planning: Rank 0 builds complete plan for all ranks, then scatters. + + Supports None for src_module and/or dst_module to enable non-collocated mode: + - src_module=None: Rank doesn't have source model (destination-only) + - dst_module=None: Rank doesn't have destination model (source-only) + - Both provided: Rank has both models (collocated mode) + + Each rank provides metadata only for the models it owns, including parallel group + membership (tensor_parallel_group_ranks, expert_parallel_group_ranks, etc.). + This metadata is sufficient for rank 0 to build correct transfer plans without + requiring dummy models. 
""" my_global_rank = dist.get_rank() world_size = dist.get_world_size() - # Get process groups - src_pg = getattr(src_module, "pg_collection", None) - dst_pg = getattr(dst_module, "pg_collection", None) - if src_pg is None or dst_pg is None: - raise ValueError("Both modules must have pg_collection") - - # Gather param metadata from all ranks - my_src_params = {name: p for name, p in src_module.named_parameters(recurse=True)} - my_dst_params = {name: p for name, p in dst_module.named_parameters(recurse=True)} - - # Build PP layer prefix maps to be used for parameter name rewriting - src_layer_prefix_map = _build_layer_module_prefix_map(src_module) - dst_layer_prefix_map = _build_layer_module_prefix_map(dst_module) - - my_src_metadata = [ - extract_param_metadata( - p, - name, - my_global_rank, - src_pg, - num_experts=num_experts, - layer_module_prefix_map=src_layer_prefix_map, - ) - for name, p in my_src_params.items() - ] - my_dst_metadata = [ - extract_param_metadata( - p, - name, - my_global_rank, - dst_pg, - num_experts=num_experts, - layer_module_prefix_map=dst_layer_prefix_map, - ) - for name, p in my_dst_params.items() - ] + # Extract metadata from source model if present + if src_module is not None: + src_pg = getattr(src_module, "pg_collection", None) + if src_pg is None: + raise ValueError("Source module must have pg_collection") + my_src_params = {name: p for name, p in src_module.named_parameters(recurse=True)} + src_layer_prefix_map = _build_layer_module_prefix_map(src_module) + my_src_metadata = [ + extract_param_metadata( + p, + name, + my_global_rank, + src_pg, + num_experts=num_experts, + layer_module_prefix_map=src_layer_prefix_map, + ) + for name, p in my_src_params.items() + ] + else: + # No source model on this rank - provide empty metadata + my_src_metadata = [] + + # Extract metadata from destination model if present + if dst_module is not None: + dst_pg = getattr(dst_module, "pg_collection", None) + if dst_pg is None: + raise 
ValueError("Destination module must have pg_collection") + my_dst_params = {name: p for name, p in dst_module.named_parameters(recurse=True)} + dst_layer_prefix_map = _build_layer_module_prefix_map(dst_module) + my_dst_metadata = [ + extract_param_metadata( + p, + name, + my_global_rank, + dst_pg, + num_experts=num_experts, + layer_module_prefix_map=dst_layer_prefix_map, + ) + for name, p in my_dst_params.items() + ] + else: + # No destination model on this rank - provide empty metadata + my_dst_metadata = [] all_src_metadata_by_rank = [None] * world_size all_dst_metadata_by_rank = [None] * world_size diff --git a/megatron/core/resharding/refit.py b/megatron/core/resharding/refit.py index 5461b8d3900..2a9e58ecc22 100644 --- a/megatron/core/resharding/refit.py +++ b/megatron/core/resharding/refit.py @@ -7,7 +7,10 @@ - reshard_model_weights: transport-agnostic core; builds/caches plan and executes. """ -from typing import Any, Literal, Optional, Union +from dataclasses import dataclass +from typing import Any, Literal, Optional, Tuple, Union + +import torch from megatron.core import parallel_state from megatron.core.models.common.language_module.language_module import LanguageModule @@ -22,8 +25,69 @@ # Supported refit backend names RefitBackendName = Literal["nccl", "gloo", "nvshmem"] + +@dataclass(frozen=True) +class _PlanCacheKey: + """ + Cache key for reshard plans. + """ + + rank: int + # Parallelism configuration: (TP, PP, EP, DP, expt_tp) or None for non-collocated ranks + src_config: Optional[Tuple[int, int, int, int, int]] + dst_config: Optional[Tuple[int, int, int, int, int]] + num_experts: Optional[int] + + +def _get_config_tuple(core) -> Optional[Tuple[int, int, int, int, int]]: + """Extract (TP, PP, EP, DP, expt_tp) sizes from a model core. + + Returns: + Tuple of (TP, PP, EP, DP, expt_tp) sizes, or None if core is None. 
+ - TP: Tensor parallelism + - PP: Pipeline parallelism + - EP: Expert parallelism + - DP: Data parallelism + - expt_tp: Expert tensor parallelism + """ + if core is None: + return None + pg = core.pg_collection + return ( + len(torch.distributed.get_process_group_ranks(pg.tp)) if pg.tp else 1, + len(torch.distributed.get_process_group_ranks(pg.pp)) if pg.pp else 1, + len(torch.distributed.get_process_group_ranks(pg.ep)) if pg.ep else 1, + len(torch.distributed.get_process_group_ranks(pg.dp)) if pg.dp else 1, + ( + len(torch.distributed.get_process_group_ranks(pg.expt_tp)) + if hasattr(pg, 'expt_tp') and pg.expt_tp + else 1 + ), + ) + + +def _build_plan_cache_key(src_core, tgt_core, num_experts: Optional[int]) -> _PlanCacheKey: + """Build cache key for reshard plan. + + Args: + src_core: Source model core (or None for non-collocated destination/idle ranks) + tgt_core: Target model core (or None for non-collocated source/idle ranks) + num_experts: Number of MoE experts (or None for non-MoE models) + + Returns: + Cache key that uniquely identifies this reshard configuration for this rank + """ + rank = torch.distributed.get_rank() + src_config = _get_config_tuple(src_core) + dst_config = _get_config_tuple(tgt_core) + return _PlanCacheKey( + rank=rank, src_config=src_config, dst_config=dst_config, num_experts=num_experts + ) + + # Module-level cache for refit services to avoid repeated allocations _service_cache: dict[str, CopyService] = {} +_plan_cache: dict[_PlanCacheKey, Any] = {} def get_or_create_service(backend: RefitBackendName) -> CopyService: @@ -53,11 +117,37 @@ def clear_service_cache(): Call this if you need to invalidate the cache, for example when reinitializing distributed state. + + This properly finalizes services to free GPU buffers + before clearing the cache. 
""" global _service_cache + + # Finalize services to free resources for NVSHMEM backend + # NCCL/Gloo services have no cleanup needed + for backend_name, service in _service_cache.items(): + if hasattr(service, '_remote') and hasattr(service._remote, 'finalize'): + service._remote.finalize() + _service_cache.clear() +def clear_plan_cache(): + """ + Clear the cached refit plans. + """ + global _plan_cache + _plan_cache.clear() + + +def clear_all_caches(): + """ + Clear both service and plan caches. + """ + clear_service_cache() + clear_plan_cache() + + def swap_model_weights( src_model: LanguageModule, target_model: LanguageModule, @@ -82,33 +172,70 @@ def swap_model_weights( def reshard_model_weights( src_model: LanguageModule, target_model: LanguageModule, service: CopyService ): - """Reshard and copy model weights from ``src_model`` to ``target_model`` using ``service``.""" - # Handle list-wrapped modules used throughout training utils - src_lm = src_model[0] if isinstance(src_model, (list, tuple)) else src_model - tgt_lm = target_model[0] if isinstance(target_model, (list, tuple)) else target_model - - num_experts = src_lm.config.num_moe_experts - - # Unwrap to get owning modules (with parameters and pg_collection) - src_core = unwrap_model(src_lm) - tgt_core = unwrap_model(tgt_lm) - - # Ensure pg_collection exists - if not hasattr(src_core, "pg_collection") or src_core.pg_collection is None: - raise RuntimeError("Source model missing pg_collection required for NCCL reshard") - if not hasattr(tgt_core, "pg_collection") or tgt_core.pg_collection is None: - raise RuntimeError("Target model missing pg_collection required for NCCL reshard") - - # Fill missing DP group on the source using Megatron's parallel state if not provided - if getattr(src_core.pg_collection, "dp", None) is None: - src_core.pg_collection.dp = parallel_state.get_data_parallel_group() - - # caching plan for reuse - cached_plan: Optional[Any] = getattr(tgt_core, "_cached_reshard_plan", None) - if 
cached_plan is None: + """Reshard and copy model weights from ``src_model`` to ``target_model`` using ``service``. + + Supports None for src_model and/or target_model to enable non-collocated mode: + - (src_model, target_model): Both models present (collocated mode) + - (src_model, None): Source rank - only sends data (non-collocated) + - (None, target_model): Destination rank - only receives data (non-collocated) + - (None, None): Idle rank - participates in collectives but has no transfers (non-collocated) + + In non-collocated mode, metadata includes local rank positions within parallel groups, + allowing the planner to correctly map between different process group configurations + without requiring dummy models on every rank. + """ + global _plan_cache + + # Handle idle ranks (both models None) - they participate in collectives but have no work + if src_model is None and target_model is None: + cache_key = _build_plan_cache_key(src_core=None, tgt_core=None, num_experts=None) + + # Use cached plan if available, otherwise build (with collective participation) + if cache_key not in _plan_cache: + plan = build_centralized_reshard_plan(None, None, num_experts=None) + _plan_cache[cache_key] = plan + else: + plan = _plan_cache[cache_key] + execute_reshard_plan(plan, None, None, service=service) + return + + # Handle None models - extract core modules only from non-None models + src_core = None + tgt_core = None + num_experts = None + + if src_model is not None: + # Handle list-wrapped modules + src_lm = src_model[0] if isinstance(src_model, (list, tuple)) else src_model + num_experts = src_lm.config.num_moe_experts + # Unwrap to get owning modules (with parameters and pg_collection) + src_core = unwrap_model(src_lm) + # Ensure pg_collection exists + if not hasattr(src_core, "pg_collection") or src_core.pg_collection is None: + raise RuntimeError("Source model missing pg_collection required for reshard") + # Fill missing DP group on the source using Megatron's parallel 
state if not provided + if getattr(src_core.pg_collection, "dp", None) is None: + src_core.pg_collection.dp = parallel_state.get_data_parallel_group() + + if target_model is not None: + # Handle list-wrapped modules + tgt_lm = target_model[0] if isinstance(target_model, (list, tuple)) else target_model + if num_experts is None: + num_experts = tgt_lm.config.num_moe_experts + # Unwrap to get owning modules (with parameters and pg_collection) + tgt_core = unwrap_model(tgt_lm) + # Ensure pg_collection exists + if not hasattr(tgt_core, "pg_collection") or tgt_core.pg_collection is None: + raise RuntimeError("Target model missing pg_collection required for reshard") + + # Build or retrieve cached plan + cache_key = _build_plan_cache_key(src_core, tgt_core, num_experts) + + if cache_key not in _plan_cache: + # All ranks must participate in planning (collective operations) plan = build_centralized_reshard_plan(src_core, tgt_core, num_experts=num_experts) - setattr(tgt_core, "_cached_reshard_plan", plan) + _plan_cache[cache_key] = plan else: - plan = cached_plan + plan = _plan_cache[cache_key] execute_reshard_plan(plan, src_core, tgt_core, service=service) diff --git a/megatron/core/resharding/utils.py b/megatron/core/resharding/utils.py index 7fc9e9ad3a7..1dc2f8f85a8 100644 --- a/megatron/core/resharding/utils.py +++ b/megatron/core/resharding/utils.py @@ -324,38 +324,132 @@ def select_src_metadata_balanced( ) -> ParameterMetadata: """Choose a representative source `ParameterMetadata` for a destination rank. - Multiple source data-parallel (DP) groups may hold the same logical parameter. - To avoid always reading from the same group, we: - - bucket `src_meta_list` by their DP group (tuple of ranks) - - if there is only one bucket, just return the first entry - - otherwise, use the destination rank's global rank to select a source - DP group in a round-robin fashion, ensuring even distribution of load - across all source DP groups. 
+ The selected metadata provides topology information (TP/EP/DP group ranks) that the + LCM transfer planner uses to compute actual source ranks and slices. This function + doesn't perform transfers itself - it just picks which source configuration to use + as reference for planning. + + Two scenarios for EP-sharded parameters: + 1. Non-collocated mode (same EP size, different rank numbering): + - Filter by matching EP local rank to pair ranks with same expert position + - Example: src ranks [0-63] and dst ranks [64-127] both with EP=8 + - Dst EP local 0 should use src EP local 0 as reference (same experts) + + 2. Resharding mode (different EP sizes): + - Skip EP local rank filtering (sizes don't correspond) + - Example: EP=8→EP=16 means dst EP local 8 has no matching src EP local + - Expert matching handled by resolved_name; LCM handles TP dimension changes + + Finally, balances across data-parallel (DP) groups to distribute load: + - Groups src_meta_list by DP group + - Selects source DP group via round-robin: dst_rank % num_src_dp_groups + - Ensures even distribution of transfer load across source DP replicas """ if not src_meta_list: raise ValueError("src_meta_list must be non-empty") - # Group source metadata by their DP group layout so we can balance across groups. - # (dp_rank0, dp_rank1, ...) -> [ParameterMetadata for that DP group] + # ============================================================================ + # EXPERT PARALLELISM (EP) LOCAL RANK FILTERING + # ============================================================================ + # Purpose: In non-collocated mode with same EP size, ensure destination ranks + # use source metadata from ranks with the same EP local position (same experts). 
+ # + # Why size check matters: + # - Same size (EP=8→EP=8): Local ranks 0-7 exist in both src and dst + # → Filter ensures dst EP local 0 uses src EP local 0 (same global experts) + # - Different size (EP=8→EP=16): Local ranks 0-15 in dst, only 0-7 in src + # → Dst EP local 8 has no corresponding src EP local rank + # → Skip filter; expert reassignment handled by resolved_name matching + # + # Expert routing: When EP size changes, each expert parameter is matched via + # resolved_name (which includes global expert index). The LCM/TP planner + # handles any TP dimension changes, and DP round-robin distributes load. + # ============================================================================ + dst_ep_group = dst_metadata.expert_parallel_group_ranks + if dst_ep_group is not None: + dst_ep_local = dst_ep_group.index(dst_rank) + # Check if EP sizes match between source and destination + src_ep_size = ( + len(src_meta_list[0].expert_parallel_group_ranks) + if src_meta_list[0].expert_parallel_group_ranks + else None + ) + dst_ep_size = len(dst_ep_group) + + # Only filter by EP local rank when sizes match (non-collocated, not resharding) + if src_ep_size == dst_ep_size: + matching_ep = [ + m + for m in src_meta_list + if m.expert_parallel_group_ranks + and m.expert_parallel_group_ranks.index(m.owner_rank) == dst_ep_local + ] + if not matching_ep: + # This indicates a configuration bug: sizes match but no local rank match + def _ep_local(m): + return ( + m.expert_parallel_group_ranks.index(m.owner_rank) + if m.expert_parallel_group_ranks + else None + ) + + available = [(m.owner_rank, _ep_local(m)) for m in src_meta_list] + raise ValueError( + f"No source metadata with EP local rank {dst_ep_local}" + f" found for dst rank {dst_rank}. 
Available: {available}" + ) + src_meta_list = matching_ep + # else: EP resharding mode (sizes differ) - skip filter, keep all source candidates + + # ============================================================================ + # LOCAL COPY OPTIMIZATION (COLLOCATED MODE) + # ============================================================================ + # In collocated mode, prefer local copies when available. If dst_rank appears + # in the source metadata list (after TP/EP filtering), use it directly to + # avoid unnecessary data transfers. + # + # A local copy is essentially free + # (tensor.copy_() on same GPU), while any remote transfer incurs significant + # overhead even within the same node. + # ============================================================================ + local_meta = [m for m in src_meta_list if m.owner_rank == dst_rank] + if local_meta: + # Found local metadata - use it for a free local copy + return local_meta[0] + + # ============================================================================ + # DATA PARALLELISM (DP) LOAD BALANCING + # ============================================================================ + # After TP/EP filtering (if applicable), balance transfer load across source + # data-parallel replicas. Each DP group holds a complete copy of the model, + # so we can read from any DP group - choosing via round-robin spreads load. + # + # Load distribution: dst_rank % num_src_dp_groups ensures even distribution + # even when destination has different DP configuration than source. + # ============================================================================ grouped_by_dp: dict[tuple[int, ...], list[ParameterMetadata]] = {} for meta in src_meta_list: dp_group = tuple(meta.data_parallel_group_ranks or []) grouped_by_dp.setdefault(dp_group, []).append(meta) - # Fast path: only one DP layout present; no balancing necessary. 
+ # Fast path: only one DP group present; no balancing necessary if len(grouped_by_dp) == 1: return src_meta_list[0] - # Use the destination rank's global rank to select a source DP group in a - # round-robin fashion. This ensures that even when multiple destination ranks - # have the same DP index (e.g., ranks 0,1,2,3 all being at position 0 in their - # respective DP groups), they still get distributed across different source - # DP groups based on their global rank. + # Round-robin selection across source DP groups based on destination global rank + # This ensures even distribution: if we have 4 src DP groups and 128 dst ranks, + # each src DP group will be selected by 32 dst ranks (128 / 4 = 32) sorted_dp_groups = sorted(grouped_by_dp.keys()) chosen_group = sorted_dp_groups[dst_rank % len(sorted_dp_groups)] - # Within the chosen group, any representative metadata works; use the first. - return grouped_by_dp[chosen_group][0] + # Within the chosen DP group, distribute across available metadata entries + # to balance load across all TP groups in the DP replica. + # Example: With 4 TP groups in a DP group, dst_ranks will cycle through all 4 + # instead of always using the first one, better distributing transfer load. + group_metadata = grouped_by_dp[chosen_group] + within_group_idx = (dst_rank // len(sorted_dp_groups)) % len(group_metadata) + selected = group_metadata[within_group_idx] + return selected logger = logging.getLogger(__name__) diff --git a/megatron/rl/parallel_utils.py b/megatron/rl/parallel_utils.py index 9cab73daba9..da4ee8aa4cf 100644 --- a/megatron/rl/parallel_utils.py +++ b/megatron/rl/parallel_utils.py @@ -21,6 +21,7 @@ def build_inference_pg_collection( ep_size: Optional[int] = None, expt_tp_size: Optional[int] = None, use_tp_pp_dp_mapping: bool = False, + rank_offset: int = 0, ) -> ProcessGroupCollection: """ Build a ProcessGroupCollection for an RL inference model with custom parallelism. 
@@ -37,6 +38,8 @@ def build_inference_pg_collection( ep_size: Expert parallel size. Defaults to training's EP size. expt_tp_size: Expert tensor parallel size. Defaults to training's expert TP size. use_tp_pp_dp_mapping: If True, use 'tp-pp-dp' order; otherwise 'tp-dp-pp'. + rank_offset: Starting rank when the grid doesn't span the entire communication world. + Used in non-collocated mode where model ranks don't start from 0. Returns: ProcessGroupCollection configured for the inference model. @@ -78,13 +81,15 @@ def build_inference_pg_collection( # Order: tp-cp-pp-dp decoder_grid = HyperCommGrid( [tp_size, cp_size, pp_size, dp_size], - ["tp", "cp", "pp", "dp"] + ["tp", "cp", "pp", "dp"], + rank_offset=rank_offset ) else: # Order: tp-cp-dp-pp (default) decoder_grid = HyperCommGrid( [tp_size, cp_size, dp_size, pp_size], - ["tp", "cp", "dp", "pp"] + ["tp", "cp", "dp", "pp"], + rank_offset=rank_offset ) # Create dense layer groups from decoder_grid @@ -105,13 +110,15 @@ def build_inference_pg_collection( # Order: tp-ep-pp-dp expert_grid = HyperCommGrid( [expt_tp_size, ep_size, pp_size, expt_dp_size], - ["tp", "ep", "pp", "dp"] + ["tp", "ep", "pp", "dp"], + rank_offset=rank_offset ) else: # Order: tp-ep-dp-pp (default) expert_grid = HyperCommGrid( [expt_tp_size, ep_size, expt_dp_size, pp_size], - ["tp", "ep", "dp", "pp"] + ["tp", "ep", "dp", "pp"], + rank_offset=rank_offset ) # Verify PP groups match between decoder and expert grids (required by mpu) diff --git a/megatron/rl/rl_utils.py b/megatron/rl/rl_utils.py index 3ea43103215..d2b6d52b80a 100644 --- a/megatron/rl/rl_utils.py +++ b/megatron/rl/rl_utils.py @@ -515,7 +515,8 @@ def get_environment_rollouts( optimizer.offload_to_cpu() # If we have seperate training and inference models we to refit weights from the training model to the inference model. 
- if inference_model is not None: + has_separate_inference_model = inference_model is not None + if has_separate_inference_model: # If the separate inference model weights were prefetched to CPU while idle, bring them # back to GPU before refit/copy and before any CUDA-graph'd inference. with nvtx_range("prefetch-inference-model-weights-to-gpu"): @@ -547,6 +548,7 @@ def get_environment_rollouts( False, # offload optimizer during rollout collection is handled above args.rl_offload_kv_cache_during_training, args.rl_remove_kv_cache_during_training, + training_model=model if has_separate_inference_model else None, ) as inference_interface: with nvtx_range("inference-setup"): @@ -1460,14 +1462,17 @@ def evaluate_and_print_results_rl( optimizer: MegatronOptimizer, iteration: int, write_to_tensorboard: bool = True, + training_model: Optional[list[LanguageModule]] = None, ): """Helper function to evaluate and dump results on screen. Args: data_iterator: Iterator over batches of evaluation dataset. - model: Model to evaluate with. + model: Model to evaluate with (may be separate inference model). iteration: Current training iteration. write_to_tensorboard: Dumpt stuff to tensorboard or not. + training_model: Training model (if separate from inference model). Used to offload + grad buffers and restore to train mode. If None, uses model parameter. """ args = get_args() @@ -1485,6 +1490,7 @@ def evaluate_and_print_results_rl( args.rl_offload_optimizer_during_inference, args.rl_offload_kv_cache_during_training, args.rl_remove_kv_cache_during_training, + training_model, ) as inference_interface: loop = get_asyncio_loop() @@ -1669,17 +1675,20 @@ def megatron_rl_inference_mode( offload_optimizer_during_inference: bool, offload_kv_cache_during_training: bool, remove_kv_cache_during_training: bool, + training_model: Optional[list[LanguageModule]] = None, ): """Manage the model inference context when collecting rollouts. Args: - model: model to prepare. 
+ model: model to prepare for inference (may be separate inference model). optimizer: optimizer used to train the model. cuda_graph_impl: which cuda graph implementation to use. reset_cuda_graphs: rebuild cuda graphs for each inference stage or not. offload_optimizer_during_inference: move optimizer to cpu during inference or not. offload_kv_cache_during_training: manually offload kv cache to host before training or not. remove_kv_cache_during_training: manually remove kv cache before training or not. + training_model: training model (if separate from inference model). Used to offload + grad buffers and restore to train mode. If None, uses model parameter. Yields: None: this context manager does not return a value. @@ -1713,7 +1722,10 @@ def megatron_rl_inference_mode( if offload_optimizer_during_inference: with nvtx_range("offload-optimizer-state-and-grad-buffers-before-inference"): if not args.rl_training_cuda_graphs: - model[0].offload_grad_buffers() + # Offload grad buffers from the training model (if separate inference model is used) + # or from the inference model (if they're the same model) + model_for_grad_offload = training_model if training_model is not None else model + model_for_grad_offload[0].offload_grad_buffers() else: logger.warning( "Gradient buffers will not be offloaded when training cudagraphs are enabled!") @@ -1780,10 +1792,15 @@ def megatron_rl_inference_mode( if offload_optimizer_during_inference: with nvtx_range("onload-optimizer-state-and-grad-buffers-after-inference"): - model[0].restore_grad_buffers() + # Restore grad buffers to the training model (if separate inference model is used) + # or to the inference model (if they're the same model) + model_for_grad_offload = training_model if training_model is not None else model + model_for_grad_offload[0].restore_grad_buffers() optimizer.restore_from_cpu() - lang_module.train() + # Set training model back to train mode (not inference model if they're separate) + training_lang_module = 
unwrap_model(training_model[0]) if training_model is not None else lang_module + training_lang_module.train() if has_lru_cache: rotary_module.forward.cache_clear() diff --git a/megatron/training/training.py b/megatron/training/training.py index 02599d99ea6..b0b09ede769 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -1123,6 +1123,7 @@ def pretrain( prefix = f'iteration {iteration} on validation set' if getattr(args, 'perform_rl_step', False): rl_eval_model = model + rl_training_model = None if inference_model is not None: inf_core = unwrap_model(inference_model[0]) # If separate inference and training models, swap training weights @@ -1130,11 +1131,14 @@ def pretrain( rl_utils._maybe_prefetch_separate_inference_model_weights(inf_core, to_cpu=False) swap_model_weights(model, inference_model, args.refit_method) rl_eval_model = inference_model + rl_training_model = model rl_utils.evaluate_and_print_results_rl( valid_data_iterator, rl_eval_model, optimizer, - iteration, write_to_tensorboard=not args.skip_train + iteration, + write_to_tensorboard=not args.skip_train, + training_model=rl_training_model, ) else: evaluate_and_print_results( @@ -2983,6 +2987,7 @@ def get_e2e_base_metrics(): timers('eval-time', log_level=0).start(barrier=True) if getattr(args, 'perform_rl_step', False): rl_eval_model = model + rl_training_model = None # If separate inference and training models, swap training weights # back to the inference model for RL evaluation. 
if inference_model is not None: @@ -2992,12 +2997,14 @@ def get_e2e_base_metrics(): ) swap_model_weights(model, inference_model, args.refit_method) rl_eval_model = inference_model + rl_training_model = model rl_utils.evaluate_and_print_results_rl( valid_data_iterator, rl_eval_model, optimizer, iteration, write_to_tensorboard=True, + training_model=rl_training_model, ) else: evaluate_and_print_results(prefix, forward_step_func, diff --git a/tests/unit_tests/resharding/test_communication_scheduler.py b/tests/unit_tests/resharding/test_communication_scheduler.py new file mode 100644 index 00000000000..15d25236740 --- /dev/null +++ b/tests/unit_tests/resharding/test_communication_scheduler.py @@ -0,0 +1,210 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +import pytest + +from megatron.core.resharding.nvshmem_copy_service.nvshmem_types import ScheduledBatch +from megatron.core.resharding.nvshmem_copy_service.planning.communication_scheduler import ( + CommunicationScheduler, +) + + +class TestCommunicationScheduler: + """Test suite for CommunicationScheduler.""" + + def test_conflict_detection(self): + """Test that conflicts are detected correctly.""" + scheduler = CommunicationScheduler() + + # Create test batches + batch1 = ScheduledBatch(src_pe=0, dest_pe=1, batch_index=0, iteration=-1, total_size=1000) + batch2 = ScheduledBatch(src_pe=0, dest_pe=2, batch_index=0, iteration=-1, total_size=1000) + batch3 = ScheduledBatch(src_pe=2, dest_pe=3, batch_index=0, iteration=-1, total_size=1000) + batch4 = ScheduledBatch(src_pe=4, dest_pe=5, batch_index=0, iteration=-1, total_size=1000) + + # batch1 and batch2 conflict (share src PE 0) + # batch2 and batch3 conflict (share PE 2) + # batch4 doesn't conflict with any + + batches = [batch1, batch2, batch3, batch4] + scheduler._assign_iterations(batches) + + # batch1 and batch2 should be in different iterations (conflict on PE 0) + assert batch1.iteration != batch2.iteration + + # batch2 and batch3 should be in 
different iterations (conflict on PE 2) + assert batch2.iteration != batch3.iteration + + # batch4 can be in any iteration (no conflicts) + # It shares no PE with the others, so first-fit puts it in iteration 0 regardless of order + assert batch4.iteration == 0 + + def test_degree_based_sorting(self): + """Test that batches are sorted by conflict degree.""" + scheduler = CommunicationScheduler() + + # Create batches with different conflict patterns + # Central hub pattern: PE 0 connects to many others + batches = [ + ScheduledBatch(src_pe=0, dest_pe=1, batch_index=0, iteration=-1, total_size=1000), + ScheduledBatch(src_pe=0, dest_pe=2, batch_index=0, iteration=-1, total_size=1000), + ScheduledBatch(src_pe=0, dest_pe=3, batch_index=0, iteration=-1, total_size=1000), + ScheduledBatch( + src_pe=4, dest_pe=5, batch_index=0, iteration=-1, total_size=1000 + ), # isolated + ] + + scheduler._assign_iterations(batches) + + # Batches involving PE 0 should be scheduled in different iterations + pe0_batches = [b for b in batches if b.src_pe == 0 or b.dest_pe == 0] + iterations = [b.iteration for b in pe0_batches] + # All PE 0 batches should be in different iterations + assert len(iterations) == len(set(iterations)) + + # Isolated batch should be in iteration 0 (no conflicts) + isolated = [b for b in batches if b.src_pe == 4][0] + assert isolated.iteration == 0 + + def test_ring_pattern(self): + """Test scheduling efficiency for ring communication pattern.""" + scheduler = CommunicationScheduler() + + n_pes = 8 + # Ring pattern: each PE sends to next PE (0→1, 1→2, 2→3, ...) 
+ batches = [ + ScheduledBatch( + src_pe=i, dest_pe=(i + 1) % n_pes, batch_index=0, iteration=-1, total_size=1000 + ) + for i in range(n_pes) + ] + + scheduler._assign_iterations(batches) + + # Ring pattern needs 2 iterations because a PE can't send and receive simultaneously + # Iteration 0: Even-indexed PEs send (0→1, 2→3, 4→5, 6→7) + # Iteration 1: Odd-indexed PEs send (1→2, 3→4, 5→6, 7→0) + iterations_used = len(set(b.iteration for b in batches)) + assert iterations_used == 2, f"Ring should use 2 iterations, got {iterations_used}" + + # Verify no conflicts within each iteration + for iteration in range(iterations_used): + iter_batches = [b for b in batches if b.iteration == iteration] + used_pes = set() + for batch in iter_batches: + assert batch.src_pe not in used_pes + assert batch.dest_pe not in used_pes + used_pes.add(batch.src_pe) + used_pes.add(batch.dest_pe) + + def test_all_to_all_pattern(self): + """Test scheduling for all-to-all communication.""" + scheduler = CommunicationScheduler() + + n_pes = 4 + # All-to-all: every PE sends to every other PE + batches = [] + for src in range(n_pes): + for dst in range(n_pes): + if src != dst: + batches.append( + ScheduledBatch( + src_pe=src, dest_pe=dst, batch_index=0, iteration=-1, total_size=1000 + ) + ) + + scheduler._assign_iterations(batches) + + # Verify schedule is conflict-free + for iteration in range(scheduler.num_iterations): + iter_batches = [b for b in batches if b.iteration == iteration] + used_pes = set() + for batch in iter_batches: + # No PE should be used twice in same iteration + assert ( + batch.src_pe not in used_pes + ), f"PE {batch.src_pe} used twice in iteration {iteration}" + assert ( + batch.dest_pe not in used_pes + ), f"PE {batch.dest_pe} used twice in iteration {iteration}" + used_pes.add(batch.src_pe) + used_pes.add(batch.dest_pe) + + def test_empty_workloads(self): + """Test handling of empty workloads.""" + scheduler = CommunicationScheduler() + batches = [] + 
scheduler._assign_iterations(batches) + assert scheduler.num_iterations == 0 + + def test_single_batch(self): + """Test scheduling with a single batch.""" + scheduler = CommunicationScheduler() + batches = [ + ScheduledBatch(src_pe=0, dest_pe=1, batch_index=0, iteration=-1, total_size=1000) + ] + scheduler._assign_iterations(batches) + assert scheduler.num_iterations == 1 + assert batches[0].iteration == 0 + + def test_no_self_conflict(self): + """Test that a batch doesn't conflict with itself.""" + scheduler = CommunicationScheduler() + + # Single batch should be scheduled in iteration 0 + batches = [ + ScheduledBatch(src_pe=0, dest_pe=1, batch_index=0, iteration=-1, total_size=1000) + ] + scheduler._assign_iterations(batches) + assert batches[0].iteration == 0 + + def test_scatter_pattern(self): + """Test one-to-many scatter pattern.""" + scheduler = CommunicationScheduler() + + n_receivers = 7 + # One PE sends to all others + batches = [ + ScheduledBatch(src_pe=0, dest_pe=i + 1, batch_index=0, iteration=-1, total_size=1000) + for i in range(n_receivers) + ] + + scheduler._assign_iterations(batches) + + # All batches involve PE 0 as sender, so must be in different iterations + iterations_used = len(set(b.iteration for b in batches)) + assert iterations_used == n_receivers + + def test_gather_pattern(self): + """Test many-to-one gather pattern.""" + scheduler = CommunicationScheduler() + + n_senders = 7 + # All PEs send to one PE + batches = [ + ScheduledBatch(src_pe=i, dest_pe=7, batch_index=0, iteration=-1, total_size=1000) + for i in range(n_senders) + ] + + scheduler._assign_iterations(batches) + + # All batches involve PE 7 as receiver, so must be in different iterations + iterations_used = len(set(b.iteration for b in batches)) + assert iterations_used == n_senders + + def test_large_batch_priority(self): + """Test that larger batches get priority (tie-breaking by size).""" + scheduler = CommunicationScheduler() + + # Create batches with different sizes + 
small_batch = ScheduledBatch( + src_pe=0, dest_pe=1, batch_index=0, iteration=-1, total_size=100 + ) + large_batch = ScheduledBatch( + src_pe=2, dest_pe=3, batch_index=0, iteration=-1, total_size=10000 + ) + + batches = [small_batch, large_batch] + scheduler._assign_iterations(batches) + + # Both should be in iteration 0 (no conflicts) + assert small_batch.iteration == 0 + assert large_batch.iteration == 0 diff --git a/tests/unit_tests/resharding/test_dp_balancing.py b/tests/unit_tests/resharding/test_dp_balancing.py new file mode 100644 index 00000000000..f1f8035171a --- /dev/null +++ b/tests/unit_tests/resharding/test_dp_balancing.py @@ -0,0 +1,344 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +import pytest + +from megatron.core.resharding.utils import ParameterMetadata, select_src_metadata_balanced + + +class TestDPBalancing: + """Test suite for DP load balancing.""" + + def _create_metadata(self, rank, tp_group, dp_group, ep_group=None): + """Helper to create ParameterMetadata for testing.""" + return ParameterMetadata( + name="test.weight", + shape=(128, 256), + dtype="float32", + element_size=4, # 4 bytes for float32 + owner_rank=rank, + tensor_parallel_group_ranks=tp_group, + data_parallel_group_ranks=dp_group, + expert_parallel_group_ranks=ep_group, + is_tp=True, + partition_dim=0, + ) + + def test_dp_balancing_basic(self): + """Test basic DP balancing with 2 DP groups.""" + # Setup: TP=2, DP=2, World=4 + # TP groups: [[0,1], [2,3]] + # DP groups: [[0,2], [1,3]] (TP-local-0 and TP-local-1) + + # Source metadata from all ranks + src_meta_list = [ + self._create_metadata(rank=0, tp_group=[0, 1], dp_group=[0, 2]), + self._create_metadata(rank=1, tp_group=[0, 1], dp_group=[1, 3]), + self._create_metadata(rank=2, tp_group=[2, 3], dp_group=[0, 2]), + self._create_metadata(rank=3, tp_group=[2, 3], dp_group=[1, 3]), + ] + + # Destination metadata (TP=1, DP=4) + dst_meta = self._create_metadata(rank=0, tp_group=[0], dp_group=[0, 1, 2, 3]) + + # 
Test each destination rank's selection + selections = {} + for dst_rank in range(4): + selected = select_src_metadata_balanced(src_meta_list, dst_meta, dst_rank) + selections[dst_rank] = (selected.owner_rank, tuple(selected.data_parallel_group_ranks)) + + # Verify round-robin across DP groups + # dst_rank 0: 0 % 2 = 0 -> should select DP group [0,2] + # dst_rank 1: 1 % 2 = 1 -> should select DP group [1,3] + # dst_rank 2: 2 % 2 = 0 -> should select DP group [0,2] + # dst_rank 3: 3 % 2 = 1 -> should select DP group [1,3] + assert selections[0][1] in [(0, 2), (1, 3)] # DP group 0 or 1 + assert selections[1][1] in [(0, 2), (1, 3)] + assert selections[2][1] == selections[0][1] # Same as rank 0 + assert selections[3][1] == selections[1][1] # Same as rank 1 + + # Verify different DP groups selected + assert selections[0][1] != selections[1][1] + + def test_dp_balancing_non_collocated(self): + """Test DP balancing in non-collocated mode (dst ranks not in source ranks).""" + # Setup: TP=2, DP=2, World=4 (non-collocated, same config on both sides) + + src_meta_list = [ + self._create_metadata(rank=0, tp_group=[0, 1], dp_group=[0, 2]), + self._create_metadata(rank=1, tp_group=[0, 1], dp_group=[1, 3]), + self._create_metadata(rank=2, tp_group=[2, 3], dp_group=[0, 2]), + self._create_metadata(rank=3, tp_group=[2, 3], dp_group=[1, 3]), + ] + + # Destination with TP=2 (same as source), dst rank not in source ranks + dst_meta = self._create_metadata(rank=4, tp_group=[4, 5], dp_group=[4, 5]) + + # Should select via DP balancing (no local copy available) + selected = select_src_metadata_balanced(src_meta_list, dst_meta, dst_rank=4) + + # dst_rank=4, 4 % 2 = 0 -> selects from first sorted DP group + assert selected.owner_rank in [0, 1, 2, 3] + + def test_dp_balancing_distribution(self): + """Test that many destination ranks are evenly distributed across source DP groups.""" + # Setup: TP=2, DP=4, World=8 + # TP groups: [[0,1], [2,3], [4,5], [6,7]] + # DP groups: [[0,2,4,6], 
[1,3,5,7]] + + src_meta_list = [ + self._create_metadata(rank=0, tp_group=[0, 1], dp_group=[0, 2, 4, 6]), + self._create_metadata(rank=1, tp_group=[0, 1], dp_group=[1, 3, 5, 7]), + self._create_metadata(rank=2, tp_group=[2, 3], dp_group=[0, 2, 4, 6]), + self._create_metadata(rank=3, tp_group=[2, 3], dp_group=[1, 3, 5, 7]), + self._create_metadata(rank=4, tp_group=[4, 5], dp_group=[0, 2, 4, 6]), + self._create_metadata(rank=5, tp_group=[4, 5], dp_group=[1, 3, 5, 7]), + self._create_metadata(rank=6, tp_group=[6, 7], dp_group=[0, 2, 4, 6]), + self._create_metadata(rank=7, tp_group=[6, 7], dp_group=[1, 3, 5, 7]), + ] + + dst_meta = self._create_metadata(rank=0, tp_group=[0], dp_group=list(range(8))) + + # Count selections per DP group + dp_group_counts = {} + for dst_rank in range(16): # Test with more dst ranks than src + selected = select_src_metadata_balanced(src_meta_list, dst_meta, dst_rank) + dp_group = tuple(selected.data_parallel_group_ranks) + dp_group_counts[dp_group] = dp_group_counts.get(dp_group, 0) + 1 + + # Should have exactly 2 DP groups + assert len(dp_group_counts) == 2 + + # Each should be selected 8 times (16 ranks / 2 groups = 8) + assert all(count == 8 for count in dp_group_counts.values()) + + def test_dp_balancing_with_ep(self): + """Test DP balancing with expert parallelism.""" + # Setup: TP=2, EP=2, DP=2, World=8 + # When EP sizes match, should filter by EP local rank + # + # EP local rank is computed from ep_group.index(owner_rank): + # rank=0 in ep_group=[0, 2] -> EP local 0 + # rank=2 in ep_group=[0, 2] -> EP local 1 + # rank=4 in ep_group=[4, 6] -> EP local 0 + # rank=6 in ep_group=[4, 6] -> EP local 1 + + src_meta_list = [ + # EP local 0 (rank 0 is at index 0 in ep_group [0, 2]) + self._create_metadata(rank=0, tp_group=[0, 1], dp_group=[0, 4], ep_group=[0, 2]), + # EP local 1 (rank 2 is at index 1 in ep_group [0, 2]) + self._create_metadata(rank=2, tp_group=[2, 3], dp_group=[2, 6], ep_group=[0, 2]), + # EP local 0 (rank 4 is at index 0 in 
ep_group [4, 6]) + self._create_metadata(rank=4, tp_group=[4, 5], dp_group=[0, 4], ep_group=[4, 6]), + # EP local 1 (rank 6 is at index 1 in ep_group [4, 6]) + self._create_metadata(rank=6, tp_group=[6, 7], dp_group=[2, 6], ep_group=[4, 6]), + ] + + # Destination with same EP size=2, EP local rank = 0 + # (rank 8 is at index 0 in ep_group [8, 9]) + dst_meta = self._create_metadata(rank=8, tp_group=[8, 9], dp_group=[8, 9], ep_group=[8, 9]) + + # When EP sizes match (2->2), should filter by EP local rank + selected = select_src_metadata_balanced(src_meta_list, dst_meta, dst_rank=8) + + # Should select from EP-local-0 ranks only (0 or 4) + ep_local = selected.expert_parallel_group_ranks.index(selected.owner_rank) + assert ep_local == 0 + assert selected.owner_rank in [0, 4] + + def test_dp_balancing_single_dp_group(self): + """Test fast path when only one DP group exists.""" + # Setup: TP=2, DP=1, World=2 (single DP group) + + src_meta_list = [ + self._create_metadata(rank=0, tp_group=[0, 1], dp_group=[0, 1]), + self._create_metadata(rank=1, tp_group=[0, 1], dp_group=[0, 1]), + ] + + dst_meta = self._create_metadata(rank=0, tp_group=[0], dp_group=[0]) + + # Should hit fast path and return first metadata + selected = select_src_metadata_balanced(src_meta_list, dst_meta, dst_rank=0) + + # Fast path returns first entry (after any TP/EP filtering) + assert selected == src_meta_list[0] + + def test_tp_size_mismatch_no_filter(self): + """Test DP balancing when TP sizes differ (resharding mode).""" + # Setup: TP=4 -> TP=2 (resharding) + + src_meta_list = [ + self._create_metadata(rank=0, tp_group=[0, 1, 2, 3], dp_group=[0, 4]), + self._create_metadata(rank=1, tp_group=[0, 1, 2, 3], dp_group=[1, 5]), + self._create_metadata(rank=2, tp_group=[0, 1, 2, 3], dp_group=[2, 6]), + self._create_metadata(rank=3, tp_group=[0, 1, 2, 3], dp_group=[3, 7]), + self._create_metadata(rank=4, tp_group=[4, 5, 6, 7], dp_group=[0, 4]), + self._create_metadata(rank=5, tp_group=[4, 5, 6, 7], 
dp_group=[1, 5]), + self._create_metadata(rank=6, tp_group=[4, 5, 6, 7], dp_group=[2, 6]), + self._create_metadata(rank=7, tp_group=[4, 5, 6, 7], dp_group=[3, 7]), + ] + + # Destination with TP=2 (different from source TP=4) + dst_meta = self._create_metadata(rank=8, tp_group=[8, 9], dp_group=[8, 9]) + + # Should only do DP balancing (no TP filtering) + selected = select_src_metadata_balanced(src_meta_list, dst_meta, dst_rank=8) + + # Since dst_rank % 4 = 0, should select DP group 0 + # Could be any TP local rank (not filtered) + assert selected.owner_rank in [0, 4] # Both have DP group [0,4] + + def test_ep_size_mismatch_no_filter(self): + """Test that EP filtering is skipped when EP sizes differ.""" + # Setup: EP=4 -> EP=8 (expert parallel resharding) + + src_meta_list = [ + self._create_metadata(rank=0, tp_group=[0], dp_group=[0, 4], ep_group=[0, 1, 2, 3]), + self._create_metadata(rank=1, tp_group=[1], dp_group=[1, 5], ep_group=[0, 1, 2, 3]), + self._create_metadata(rank=2, tp_group=[2], dp_group=[2, 6], ep_group=[0, 1, 2, 3]), + self._create_metadata(rank=3, tp_group=[3], dp_group=[3, 7], ep_group=[0, 1, 2, 3]), + ] + + # Destination with EP=8 (different from source EP=4) + # EP sizes differ (4 vs 8), so EP filtering should be skipped + dst_meta = self._create_metadata( + rank=8, tp_group=[8], dp_group=[8, 9], ep_group=list(range(8, 16)) + ) + + # Should NOT filter by EP local rank (sizes differ) + selected = select_src_metadata_balanced(src_meta_list, dst_meta, dst_rank=8) + + # Should work without error (no EP filtering when sizes differ) + assert selected.owner_rank in [0, 1, 2, 3] + + def test_load_distribution_across_parameters(self): + """Test that different dst ranks select different DP groups for load balancing.""" + # Setup: TP=1, DP=4, World=4 + + src_meta_list = [ + self._create_metadata(rank=0, tp_group=[0], dp_group=[0, 1, 2, 3]), + self._create_metadata(rank=1, tp_group=[1], dp_group=[0, 1, 2, 3]), + self._create_metadata(rank=2, tp_group=[2], 
dp_group=[0, 1, 2, 3]), + self._create_metadata(rank=3, tp_group=[3], dp_group=[0, 1, 2, 3]), + ] + + dst_meta = self._create_metadata(rank=0, tp_group=[0], dp_group=[0, 1, 2, 3, 4, 5, 6, 7]) + + # Simulate 8 destination ranks selecting sources + # Since there's only 1 DP group with 4 members, all should select the same group + # But round-robin based on dst_rank should still distribute across src ranks + selections = [] + for dst_rank in range(8): + selected = select_src_metadata_balanced(src_meta_list, dst_meta, dst_rank) + selections.append(selected.owner_rank) + + # All should select the same DP group (only one exists) + # But within that group, should cycle through available ranks + # Since there's only 1 DP group, they all select from it + assert all(rank in [0, 1, 2, 3] for rank in selections) + + def test_within_dp_group_distribution(self): + """Test that dst ranks distribute across source ranks within a DP group.""" + # This tests the optimization: when multiple dst ranks map to the same DP group, + # they should use different source ranks within that group for load balancing. 
+ + # Setup: TP=2, World=8 -> TP=1, World=8 + # Source TP groups: [[0,1], [2,3], [4,5], [6,7]] + # Source DP groups: [[0,2,4,6], [1,3,5,7]] (2 DP replicas) + + src_meta_list = [ + self._create_metadata(rank=0, tp_group=[0, 1], dp_group=[0, 2, 4, 6]), + self._create_metadata(rank=1, tp_group=[0, 1], dp_group=[1, 3, 5, 7]), + self._create_metadata(rank=2, tp_group=[2, 3], dp_group=[0, 2, 4, 6]), + self._create_metadata(rank=3, tp_group=[2, 3], dp_group=[1, 3, 5, 7]), + self._create_metadata(rank=4, tp_group=[4, 5], dp_group=[0, 2, 4, 6]), + self._create_metadata(rank=5, tp_group=[4, 5], dp_group=[1, 3, 5, 7]), + self._create_metadata(rank=6, tp_group=[6, 7], dp_group=[0, 2, 4, 6]), + self._create_metadata(rank=7, tp_group=[6, 7], dp_group=[1, 3, 5, 7]), + ] + + dst_meta = self._create_metadata(rank=0, tp_group=[0], dp_group=list(range(8))) + + # Test 8 destination ranks + selections = {} + for dst_rank in range(8): + selected = select_src_metadata_balanced(src_meta_list, dst_meta, dst_rank) + selections[dst_rank] = ( + selected.owner_rank, + tuple(selected.tensor_parallel_group_ranks), + ) + + # Verify distribution: + # dst_rank 0: DP group 0 ([0,2,4,6]), within-group idx 0 -> rank 0, TP [0,1] + # dst_rank 1: DP group 1 ([1,3,5,7]), within-group idx 0 -> rank 1, TP [0,1] + # dst_rank 2: DP group 0 ([0,2,4,6]), within-group idx 1 -> rank 2, TP [2,3] + # dst_rank 3: DP group 1 ([1,3,5,7]), within-group idx 1 -> rank 3, TP [2,3] + # dst_rank 4: DP group 0 ([0,2,4,6]), within-group idx 2 -> rank 4, TP [4,5] + # dst_rank 5: DP group 1 ([1,3,5,7]), within-group idx 2 -> rank 5, TP [4,5] + # dst_rank 6: DP group 0 ([0,2,4,6]), within-group idx 3 -> rank 6, TP [6,7] + # dst_rank 7: DP group 1 ([1,3,5,7]), within-group idx 3 -> rank 7, TP [6,7] + + assert selections[0] == (0, (0, 1)) + assert selections[1] == (1, (0, 1)) + assert selections[2] == (2, (2, 3)) + assert selections[3] == (3, (2, 3)) + assert selections[4] == (4, (4, 5)) + assert selections[5] == (5, (4, 5)) + 
assert selections[6] == (6, (6, 7)) + assert selections[7] == (7, (6, 7)) + + # Verify ALL source ranks are used (good load distribution!) + source_ranks_used = {sel[0] for sel in selections.values()} + assert source_ranks_used == {0, 1, 2, 3, 4, 5, 6, 7}, "All source ranks should be used" + + # Verify each TP group used by 2 dst ranks (evenly distributed) + tp_group_usage = {} + for sel in selections.values(): + tp_group = sel[1] + tp_group_usage[tp_group] = tp_group_usage.get(tp_group, 0) + 1 + + # Each of 4 TP groups should be used by exactly 2 destination ranks + assert all(count == 2 for count in tp_group_usage.values()) + assert len(tp_group_usage) == 4 # 4 different TP groups + + def test_local_copy_preference_collocated(self): + """Test that collocated mode prefers local copies when available.""" + # Setup: Collocated TP=2, World=8, DP=4 + # Each rank has both src and dst models with same configuration + # Should always prefer local copy (dst_rank == src_rank) + + # Source metadata from all ranks + src_meta_list = [ + self._create_metadata(rank=0, tp_group=[0, 1], dp_group=[0, 2, 4, 6]), + self._create_metadata(rank=2, tp_group=[2, 3], dp_group=[0, 2, 4, 6]), + self._create_metadata(rank=4, tp_group=[4, 5], dp_group=[0, 2, 4, 6]), + self._create_metadata(rank=6, tp_group=[6, 7], dp_group=[0, 2, 4, 6]), + ] + + # Destination rank 0 (collocated - has both src and dst) + dst_meta = self._create_metadata(rank=0, tp_group=[0, 1], dp_group=[0, 2, 4, 6]) + + # Should select rank 0 for local copy (not rank 2, 4, or 6) + selected = select_src_metadata_balanced(src_meta_list, dst_meta, dst_rank=0) + assert selected.owner_rank == 0, "Should prefer local copy in collocated mode" + + # Try rank 4 + dst_meta = self._create_metadata(rank=4, tp_group=[4, 5], dp_group=[0, 2, 4, 6]) + selected = select_src_metadata_balanced(src_meta_list, dst_meta, dst_rank=4) + assert selected.owner_rank == 4, "Should prefer local copy for rank 4" + + def 
test_no_local_copy_non_collocated(self): + """Test that non-collocated mode still uses DP balancing.""" + # Setup: Non-collocated - dst rank 8 not in source ranks [0,2,4,6] + + src_meta_list = [ + self._create_metadata(rank=0, tp_group=[0, 1], dp_group=[0, 2, 4, 6]), + self._create_metadata(rank=2, tp_group=[2, 3], dp_group=[0, 2, 4, 6]), + self._create_metadata(rank=4, tp_group=[4, 5], dp_group=[0, 2, 4, 6]), + self._create_metadata(rank=6, tp_group=[6, 7], dp_group=[0, 2, 4, 6]), + ] + + # Destination rank 8 (not in source ranks - non-collocated) + dst_meta = self._create_metadata(rank=8, tp_group=[8, 9], dp_group=[8, 9]) + + # Should fall back to DP balancing (not trying to find rank 8 in sources) + selected = select_src_metadata_balanced(src_meta_list, dst_meta, dst_rank=8) + assert selected.owner_rank in [0, 2, 4, 6], "Should select from available source ranks" diff --git a/tests/unit_tests/resharding/test_model_swap.py b/tests/unit_tests/resharding/test_model_swap.py index 73296a175ed..19cb2306bf7 100644 --- a/tests/unit_tests/resharding/test_model_swap.py +++ b/tests/unit_tests/resharding/test_model_swap.py @@ -17,7 +17,7 @@ ) from megatron.core.models.gpt.gpt_model import GPTModel from megatron.core.process_groups_config import ProcessGroupCollection -from megatron.core.resharding.refit import swap_model_weights +from megatron.core.resharding.refit import clear_all_caches, swap_model_weights from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed from megatron.core.transformer.cuda_graphs import CudaGraphManager, _CudagraphGlobalRecord @@ -295,4 +295,7 @@ def test_swap_gpt_parametrized( dst_logits, ref_logits, atol=1e-4, rtol=1e-4 ), f"Refit src(TP={src_tp},PP={src_pp})->dst(TP={dst_tp},PP={dst_pp}) GPT outputs differ" dist.barrier() + + # Clear refit caches before destroying model parallel to avoid stale plans + clear_all_caches() 
Utils.destroy_model_parallel() diff --git a/tests/unit_tests/resharding/test_task_segmenter.py b/tests/unit_tests/resharding/test_task_segmenter.py new file mode 100644 index 00000000000..e092f0e1c99 --- /dev/null +++ b/tests/unit_tests/resharding/test_task_segmenter.py @@ -0,0 +1,185 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +import pytest + +from megatron.core.resharding.nvshmem_copy_service.nvshmem_types import ( + MAX_SEGMENT_SIZE, + ReceiveRequest, + SendRequest, +) +from megatron.core.resharding.nvshmem_copy_service.planning.task_segmenter import TaskSegmenter + + +class TestTaskSegmenter: + """Test suite for TaskSegmenter.""" + + def test_segment_small_request(self): + """Test segmenting a request smaller than max segment size.""" + segmenter = TaskSegmenter() + + # Request smaller than 256MB should not be segmented + send_req = SendRequest( + task_id=1, src_tensor=None, src_pos=0, size=500 * 1024, dest_pe=1 + ) # 500KB + recv_req = ReceiveRequest( + task_id=1, dest_tensor=None, dest_pos=0, size=500 * 1024, src_pe=0 + ) + + send_segments = segmenter.segment_send_request(send_req) + recv_segments = segmenter.segment_receive_request(recv_req) + + # Should produce exactly one segment (no splitting) + assert len(send_segments) == 1 + assert send_segments[0].task_id == 1 + assert send_segments[0].size == 500 * 1024 + assert send_segments[0].dest_pe == 1 + + assert len(recv_segments) == 1 + assert recv_segments[0].task_id == 1 + assert recv_segments[0].size == 500 * 1024 + + def test_segment_large_request(self): + """Test segmenting a request larger than max segment size.""" + segmenter = TaskSegmenter() + + # Request larger than 256MB should be segmented + task_size = 3 * MAX_SEGMENT_SIZE # 768MB + send_req = SendRequest(task_id=1, src_tensor=None, src_pos=0, size=task_size, dest_pe=1) + + send_segments = segmenter.segment_send_request(send_req) + + # Should produce 3 segments + assert len(send_segments) == 3 + for segment in 
send_segments: + assert segment.size == MAX_SEGMENT_SIZE # Each segment is max size + assert segment.dest_pe == 1 + + def test_segment_not_exact_multiple(self): + """Test segmenting when size is not exact multiple of max segment size.""" + segmenter = TaskSegmenter() + + # 2.5 × 256MB = 640MB -> should produce 3 segments (256MB, 256MB, 128MB) + task_size = int(2.5 * MAX_SEGMENT_SIZE) + send_req = SendRequest(task_id=1, src_tensor=None, src_pos=0, size=task_size, dest_pe=1) + + send_segments = segmenter.segment_send_request(send_req) + + # Should produce 3 segments + assert len(send_segments) == 3 + # First two segments are full size + assert send_segments[0].size == MAX_SEGMENT_SIZE + assert send_segments[1].size == MAX_SEGMENT_SIZE + # Last segment is remainder + assert send_segments[2].size == int(0.5 * MAX_SEGMENT_SIZE) + + def test_segment_send_and_receive_match(self): + """Test that send and receive segmentation produces matching segments.""" + segmenter = TaskSegmenter() + + task_size = int(2.5 * MAX_SEGMENT_SIZE) + send_req = SendRequest(task_id=1, src_tensor=None, src_pos=0, size=task_size, dest_pe=1) + recv_req = ReceiveRequest(task_id=1, dest_tensor=None, dest_pos=0, size=task_size, src_pe=0) + + send_segments = segmenter.segment_send_request(send_req) + recv_segments = segmenter.segment_receive_request(recv_req) + + # Should produce same number of segments + assert len(send_segments) == len(recv_segments) + + # Sizes should match + for send_seg, recv_seg in zip(send_segments, recv_segments): + assert send_seg.size == recv_seg.size + + def test_segment_very_large_request(self): + """Test segmenting a very large request.""" + segmenter = TaskSegmenter() + + # 10 × 256MB = 2.56GB + task_size = 10 * MAX_SEGMENT_SIZE + send_req = SendRequest(task_id=1, src_tensor=None, src_pos=0, size=task_size, dest_pe=1) + + send_segments = segmenter.segment_send_request(send_req) + + # Should produce 10 segments + assert len(send_segments) == 10 + # All segments should be 
full size + for segment in send_segments: + assert segment.size == MAX_SEGMENT_SIZE + + def test_segment_zero_size_request(self): + """Test handling of zero-size request.""" + segmenter = TaskSegmenter() + + send_req = SendRequest(task_id=1, src_tensor=None, src_pos=0, size=0, dest_pe=1) + + send_segments = segmenter.segment_send_request(send_req) + + # Should produce one segment with size 0 + assert len(send_segments) == 1 + assert send_segments[0].size == 0 + + def test_segment_exactly_max_size(self): + """Test segmenting request that is exactly max segment size.""" + segmenter = TaskSegmenter() + + # Exactly 256MB - should NOT be segmented + send_req = SendRequest( + task_id=1, src_tensor=None, src_pos=0, size=MAX_SEGMENT_SIZE, dest_pe=1 + ) + + send_segments = segmenter.segment_send_request(send_req) + + # Should produce exactly 1 segment (no splitting needed) + assert len(send_segments) == 1 + assert send_segments[0].size == MAX_SEGMENT_SIZE + + def test_segment_preserves_destination(self): + """Test that segmentation preserves destination PE.""" + segmenter = TaskSegmenter() + + task_size = 2 * MAX_SEGMENT_SIZE + send_req = SendRequest( + task_id=1, src_tensor=None, src_pos=0, size=task_size, dest_pe=42 + ) # Non-standard PE + + send_segments = segmenter.segment_send_request(send_req) + + # All segments should have same destination + for segment in send_segments: + assert segment.dest_pe == 42 + + def test_segment_position_offset(self): + """Test that segments have correct position offsets.""" + segmenter = TaskSegmenter() + + task_size = int(2.5 * MAX_SEGMENT_SIZE) + start_pos = 1000 + send_req = SendRequest( + task_id=1, src_tensor=None, src_pos=start_pos, size=task_size, dest_pe=1 + ) + + send_segments = segmenter.segment_send_request(send_req) + + # Check position offsets + assert send_segments[0].src_pos == start_pos + assert send_segments[1].src_pos == start_pos + MAX_SEGMENT_SIZE + assert send_segments[2].src_pos == start_pos + 2 * MAX_SEGMENT_SIZE + + 
def test_segment_task_id_encoding(self): + """Test that segments have encoded task IDs.""" + segmenter = TaskSegmenter() + + task_size = 2 * MAX_SEGMENT_SIZE + original_task_id = 42 + send_req = SendRequest( + task_id=original_task_id, src_tensor=None, src_pos=0, size=task_size, dest_pe=1 + ) + + send_segments = segmenter.segment_send_request(send_req) + + # Segments should have encoded task IDs (different from original) + # Based on the encoding: REQUEST_ID_BASE + (task_id * SEGMENT_ID_MULTIPLIER) + segment_index + assert len(send_segments) == 2 + assert send_segments[0].task_id != original_task_id + assert send_segments[1].task_id != original_task_id + # Segment IDs should be different + assert send_segments[0].task_id != send_segments[1].task_id diff --git a/tests/unit_tests/resharding/test_workload_packer.py b/tests/unit_tests/resharding/test_workload_packer.py new file mode 100644 index 00000000000..f8fef6cceb1 --- /dev/null +++ b/tests/unit_tests/resharding/test_workload_packer.py @@ -0,0 +1,128 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+import pytest + +from megatron.core.resharding.nvshmem_copy_service.nvshmem_types import SendRequest +from megatron.core.resharding.nvshmem_copy_service.planning.workload_packer import WorkloadPacker + + +class TestWorkloadPacker: + """Test suite for WorkloadPacker.""" + + def test_pack_single_request(self): + """Test packing a single send request.""" + packer = WorkloadPacker() + + requests = [SendRequest(task_id=1, src_tensor=None, src_pos=0, size=1000, dest_pe=1)] + workloads = packer.pack_workloads(requests, n_pes=2) + + # Should create one batch for destination PE 1 + assert 1 in workloads + assert len(workloads[1]) == 1 + assert workloads[1][0].total_size == 1000 + assert len(workloads[1][0].tasks) == 1 + + def test_pack_multiple_requests_same_dest(self): + """Test packing multiple requests to the same destination.""" + packer = WorkloadPacker() + + requests = [ + SendRequest(task_id=1, src_tensor=None, src_pos=0, size=1000, dest_pe=1), + SendRequest(task_id=2, src_tensor=None, src_pos=0, size=2000, dest_pe=1), + SendRequest(task_id=3, src_tensor=None, src_pos=0, size=3000, dest_pe=1), + ] + workloads = packer.pack_workloads(requests, n_pes=2) + + # All requests fit in one batch (under 256MB default limit) + assert 1 in workloads + assert len(workloads[1]) == 1 + assert workloads[1][0].total_size == 6000 + assert len(workloads[1][0].tasks) == 3 + + def test_pack_exceeds_batch_size(self): + """Test that requests are split when size limit exceeded.""" + packer = WorkloadPacker() + + # Create requests that exceed 256MB limit + mb_256 = 256 * 1024 * 1024 + requests = [ + SendRequest(task_id=1, src_tensor=None, src_pos=0, size=mb_256 - 1000, dest_pe=1), + SendRequest(task_id=2, src_tensor=None, src_pos=0, size=5000, dest_pe=1), + SendRequest(task_id=3, src_tensor=None, src_pos=0, size=2000, dest_pe=1), + ] + workloads = packer.pack_workloads(requests, n_pes=2) + + # Should create 2 batches (first request alone, others together) + assert 1 in workloads + assert 
len(workloads[1]) == 2 + + def test_pack_multiple_destinations(self): + """Test packing requests to multiple destinations.""" + packer = WorkloadPacker() + + requests = [ + SendRequest(task_id=1, src_tensor=None, src_pos=0, size=1000, dest_pe=1), + SendRequest(task_id=2, src_tensor=None, src_pos=0, size=2000, dest_pe=2), + SendRequest(task_id=3, src_tensor=None, src_pos=0, size=3000, dest_pe=1), + SendRequest(task_id=4, src_tensor=None, src_pos=0, size=4000, dest_pe=3), + ] + workloads = packer.pack_workloads(requests, n_pes=4) + + # PE 1: requests 1 and 3 (4000 total) + assert len(workloads[1]) == 1 + assert workloads[1][0].total_size == 4000 + + # PE 2: request 2 (2000 total) + assert len(workloads[2]) == 1 + assert workloads[2][0].total_size == 2000 + + # PE 3: request 4 (4000 total) + assert len(workloads[3]) == 1 + assert workloads[3][0].total_size == 4000 + + def test_pack_empty_requests(self): + """Test packing with no requests.""" + packer = WorkloadPacker() + workloads = packer.pack_workloads([], n_pes=4) + # All PEs should have empty lists + for pe in range(4): + assert pe in workloads + assert len(workloads[pe]) == 0 + + def test_pack_descending_size_order(self): + """Test that packing sorts by size descending (largest first).""" + packer = WorkloadPacker() + + requests = [ + SendRequest(task_id=1, src_tensor=None, src_pos=0, size=1000, dest_pe=1), + SendRequest(task_id=2, src_tensor=None, src_pos=0, size=5000, dest_pe=1), + SendRequest(task_id=3, src_tensor=None, src_pos=0, size=3000, dest_pe=1), + SendRequest(task_id=4, src_tensor=None, src_pos=0, size=2000, dest_pe=1), + ] + workloads = packer.pack_workloads(requests, n_pes=2) + + # All should be in one batch + assert 1 in workloads + assert len(workloads[1]) == 1 + + # Check that tasks are sorted by size (descending) + sizes = [req.size for req in workloads[1][0].tasks] + assert sizes == sorted(sizes, reverse=True) + + def test_pack_preserves_task_ids(self): + """Test that packing preserves task 
IDs.""" + packer = WorkloadPacker() + + requests = [ + SendRequest(task_id=100, src_tensor=None, src_pos=0, size=1000, dest_pe=1), + SendRequest(task_id=200, src_tensor=None, src_pos=0, size=2000, dest_pe=1), + SendRequest(task_id=300, src_tensor=None, src_pos=0, size=3000, dest_pe=1), + ] + workloads = packer.pack_workloads(requests, n_pes=2) + + # All requests should be in one batch + assert 1 in workloads + assert len(workloads[1]) == 1 + + # Check task IDs are preserved (sorted by size descending: 300, 200, 100) + task_ids = [req.task_id for req in workloads[1][0].tasks] + assert task_ids == [300, 200, 100] From 554ce493e31d4b96601863df8caee72cb1c21a3f Mon Sep 17 00:00:00 2001 From: xiaoxi-wangfj <690912414@qq.com> Date: Sat, 7 Feb 2026 23:32:30 +0800 Subject: [PATCH 081/231] Fuse permute+pad and unpermute+unpad ops for FP8/FP4 training (#2763) Signed-off-by: xiaoxi-wangfj <690912414@qq.com> Co-authored-by: Xin Yao --- .../core/extensions/transformer_engine.py | 8 +++ megatron/core/transformer/moe/moe_utils.py | 66 ++++++++++++++++--- .../core/transformer/moe/token_dispatcher.py | 25 +++++-- 3 files changed, 83 insertions(+), 16 deletions(-) diff --git a/megatron/core/extensions/transformer_engine.py b/megatron/core/extensions/transformer_engine.py index 555b60b45d0..996330f5674 100644 --- a/megatron/core/extensions/transformer_engine.py +++ b/megatron/core/extensions/transformer_engine.py @@ -2435,6 +2435,14 @@ def fused_apply_rotary_pos_emb_thd( fused_sort_chunks_by_index_with_probs = None fused_unpermute = None +try: + from transformer_engine.pytorch.permutation import moe_permute_and_pad_with_probs + + fused_permute_and_pad_with_probs = moe_permute_and_pad_with_probs + +except ImportError: + fused_permute_and_pad_with_probs = None + try: from transformer_engine.pytorch.cross_entropy import parallel_cross_entropy diff --git a/megatron/core/transformer/moe/moe_utils.py b/megatron/core/transformer/moe/moe_utils.py index 65d8fed1015..47debdd27df 100644 --- 
a/megatron/core/transformer/moe/moe_utils.py +++ b/megatron/core/transformer/moe/moe_utils.py @@ -17,7 +17,7 @@ from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.moe.router_replay import RouterReplay from megatron.core.transformer.transformer_config import TransformerConfig -from megatron.core.utils import internal_api +from megatron.core.utils import internal_api, is_te_min_version try: import transformer_engine as te # pylint: disable=unused-import @@ -26,6 +26,7 @@ fused_compute_score_for_moe_aux_loss, fused_moe_aux_loss, fused_permute, + fused_permute_and_pad_with_probs, fused_permute_with_probs, fused_sort_chunks_by_index, fused_sort_chunks_by_index_with_probs, @@ -295,7 +296,15 @@ def permute( num_out_tokens: Optional[int] = None, fused: bool = False, drop_and_pad: bool = False, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor]: + tokens_per_expert: Optional[torch.Tensor] = None, + align_size: int = -1, +) -> Tuple[ + torch.Tensor, + Optional[torch.Tensor], + torch.Tensor, + Optional[torch.Tensor], + Optional[torch.Tensor], +]: """Permute the tokens and probs based on the mask. Tokens with the same designated expert will be grouped together. The shape of mask is [tokens, num_experts], it indicates which experts were selected @@ -304,6 +313,9 @@ def permute( When drop_and_pad=True, in routing_map, the number of non-zeros in each column equals to expert capacity. This function exploits this feature to use ops that support cuda graph. + If the fused permute and pad kernel is available, it will pad the tokens to the align_size + and return the padded permuted tokens, pad_offsets and padded tokens per expert. + Args: tokens (torch.Tensor): The input token tensor, [num_tokens, hidden]. routing_map (torch.Tensor): The sparse token to expert mapping, [num_tokens, num_experts]. @@ -315,10 +327,20 @@ def permute( and pads the number of tokens to the expert capacity. 
If set to true, routing_map has a fixed number of non-zeros in each column. + tokens_per_expert (torch.Tensor, optional): Tensor of shape `[num_experts]` containing + actual token counts per expert. + align_size (int, optional): The alignment size for the input tensor for fp8 or fp4. Returns: - Tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor]: - The permuted tokens, permuted probs, and sorted indices. + Tuple[ + torch.Tensor, + Optional[torch.Tensor], + torch.Tensor, + Optional[torch.Tensor], + Optional[torch.Tensor], + ]: + The permuted tokens, (optional) permuted probs, sorted indices, + (optional) pad_offsets, (optional) padded_tokens_per_expert. """ if fused and probs is None: if not HAVE_TE or fused_permute is None: @@ -326,14 +348,26 @@ def permute( permuted_input, sorted_indices = fused_permute( tokens, routing_map, num_out_tokens=num_out_tokens ) - return permuted_input, None, sorted_indices + return permuted_input, None, sorted_indices, None, tokens_per_expert if fused and probs is not None: - if not HAVE_TE or fused_permute_with_probs is None: + if not HAVE_TE or ( + fused_permute_and_pad_with_probs is None and fused_permute_with_probs is None + ): raise ValueError( - "fused_permute_with_probs is not available. Please install TE >= 2.1.0." + "Transformer Engine (TE) fused kernel is not available. " + "fused_permute_with_probs typically requires TE >= 2.1.0, and " + "fused_permute_and_pad_with_probs` typically requires TE >= 2.12.0. 
" + ) + if fused_permute_and_pad_with_probs is not None and tokens_per_expert is not None: + return fused_permute_and_pad_with_probs( + tokens, probs, routing_map, tokens_per_expert, align_size + ) + else: + output, permuted_probs, row_id_map = fused_permute_with_probs( + tokens, probs, routing_map, num_out_tokens=num_out_tokens ) - return fused_permute_with_probs(tokens, probs, routing_map, num_out_tokens=num_out_tokens) + return output, permuted_probs, row_id_map, None, tokens_per_expert num_tokens, hidden = tokens.shape num_experts = routing_map.shape[1] @@ -376,7 +410,7 @@ def permute( # use the mapping to permute the tokens permuted_input = tokens.index_select(0, sorted_indices) - return permuted_input, permuted_probs, sorted_indices + return permuted_input, permuted_probs, sorted_indices, None, tokens_per_expert def unpermute( @@ -387,6 +421,7 @@ def unpermute( routing_map: Optional[torch.Tensor] = None, fused: bool = False, drop_and_pad: bool = False, + pad_offsets: Optional[torch.Tensor] = None, ) -> torch.Tensor: """ Restore the original order of tokens after permutation. If probs are provided, it @@ -408,6 +443,10 @@ def unpermute( fused (bool, optional): Whether use the fused unpermute function. drop_and_pad (bool, optional): Whether or not the token dispatcher uses token-drop and pads the number of tokens to the expert capacity. + pad_offsets (torch.Tensor, optional): + Tensor of per-expert cumulative padding offsets used to remove padding added + during permutation. This is the fourth output of `moe_permute_and_pad_with_probs` + and is required when unpermuting padded outputs. Defaults to None. Returns: torch.Tensor: The tokens restored to their original order. @@ -415,8 +454,15 @@ def unpermute( if fused: if not HAVE_TE or fused_unpermute is None: raise ValueError("fused_unpermute is not available. 
Please install TE >= 2.1.0.") + extra_kwargs = {} + if is_te_min_version("2.12.0"): + extra_kwargs["pad_offsets"] = pad_offsets return fused_unpermute( - permuted_tokens, sorted_indices, merging_probs=probs, restore_shape=restore_shape + permuted_tokens, + sorted_indices, + merging_probs=probs, + restore_shape=restore_shape, + **extra_kwargs, ) _, hidden = restore_shape diff --git a/megatron/core/transformer/moe/token_dispatcher.py b/megatron/core/transformer/moe/token_dispatcher.py index 1770f9acd41..e682e8ea597 100644 --- a/megatron/core/transformer/moe/token_dispatcher.py +++ b/megatron/core/transformer/moe/token_dispatcher.py @@ -294,11 +294,13 @@ def dispatch_postprocess(self, hidden_states, probs): tokens_per_expert = self.local_map.sum(dim=0).long().cpu() - (permuted_local_hidden_states, _, self.reversed_local_input_permutation_mapping) = permute( - hidden_states, - self.local_map, - num_out_tokens=tokens_per_expert.sum().item(), - fused=self.config.moe_permute_fusion, + permuted_local_hidden_states, _, self.reversed_local_input_permutation_mapping, _, _ = ( + permute( + hidden_states, + self.local_map, + num_out_tokens=tokens_per_expert.sum().item(), + fused=self.config.moe_permute_fusion, + ) ) self.local_probs = self.local_probs.T.contiguous().masked_select( @@ -634,6 +636,8 @@ def dispatch_preprocess( permutated_local_input_tokens, permuted_probs, self.reversed_local_input_permutation_mapping, + _, + _, ) = permute( hidden_states, self.routing_map, @@ -1295,12 +1299,20 @@ def get_permuted_hidden_states_by_experts(self, hidden_states: torch.Tensor) -> self.hidden_shape_before_permute = hidden_states.shape assert self.dispatched_probs.dtype == torch.float32, "DeepEP only supports float32 probs" - hidden_states, permuted_probs, self.reversed_mapping_for_combine = permute( + ( + hidden_states, + permuted_probs, + self.reversed_mapping_for_combine, + self.pad_offsets, + self.tokens_per_expert, + ) = permute( hidden_states, self.dispatched_routing_map, 
probs=self.dispatched_probs, num_out_tokens=self.tokens_per_expert.sum().item(), fused=self.permute_fusion, + tokens_per_expert=self.tokens_per_expert, + align_size=get_align_size_for_quantization(self.config), ) if self.router_dtype == "fp64": permuted_probs = permuted_probs.to(torch.float64) @@ -1313,6 +1325,7 @@ def get_restored_hidden_states_by_experts(self, hidden_states: torch.Tensor) -> restore_shape=self.hidden_shape_before_permute, routing_map=self.dispatched_routing_map, fused=self.permute_fusion, + pad_offsets=self.pad_offsets, ) return hidden_states From 7cbbba2e6a3ee87f2726a8fc003c76a7b2185b0f Mon Sep 17 00:00:00 2001 From: Jianbin Chang Date: Sun, 8 Feb 2026 00:22:13 +0800 Subject: [PATCH 082/231] Add check to prevent MFSDP from numeric issue in gradient accumulate fusion (#2904) Co-authored-by: Xin Yao --- .../core/distributed/fsdp/mcore_fsdp_adapter.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py b/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py index 5bf543fdc5c..8848d93666e 100644 --- a/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +++ b/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py @@ -42,7 +42,7 @@ from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.transformer_layer import TransformerLayer -from megatron.core.utils import log_single_rank +from megatron.core.utils import is_te_min_version, log_single_rank try: from megatron.core.distributed.fsdp.src.megatron_fsdp import FSDPDistributedIndex, MegatronFSDP @@ -85,6 +85,18 @@ def __init__( self.megatron_fsdp_dist_index = self._init_dist_index(pg_collection) + if config.gradient_accumulation_fusion: + assert ( + self.megatron_fsdp_dist_index.get_dp_group(is_expert_parallel=True).size() == 1 + ), ( + "Megatron-FSDP with gradient_accumulation_fusion does not support " + 
"data parallelism when expert parallelism is enabled." + ) + assert is_te_min_version("2.10"), ( + "Megatron-FSDP with gradient_accumulation_fusion requires " + "Transformer Engine version 2.10 or higher." + ) + self.bucket_size = self.ddp_config.bucket_size if disable_bucketing: self.bucket_size = None From c99c962d36541bd32a3e0476086adcbd8e1d6833 Mon Sep 17 00:00:00 2001 From: c1lovez1 <141424951+c1lovez1@users.noreply.github.com> Date: Sun, 8 Feb 2026 00:23:01 +0800 Subject: [PATCH 083/231] update get_embedding_ranks and get_position_embedding_ranks docstrings (#3223) Co-authored-by: Xin Yao --- megatron/training/training.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/megatron/training/training.py b/megatron/training/training.py index b0b09ede769..d2fa32c0063 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -774,8 +774,14 @@ def pretrain( to it. It is used for programs to add their own arguments. args_defaults: a dictionary from argument-name to argument-value. It to set already parse arguments. - get_embedding_ranks (TODO): - get_position_embedding_ranks (TODO): + get_embedding_ranks: a function that takes a list of ranks for a pipeline + group and returns those ranks that should have word embeddings. + For most models, these are the first and last pipeline stages. + If None, defaults to returning the first and last pipeline stages. + get_position_embedding_ranks: a function that takes a list of ranks for + a pipeline group and returns those ranks that should have position + embeddings. For most models, this is only the first pipeline stage. + If None, defaults to returning only the first pipeline stage. non_loss_data_func (callable): A custom function to call during evaluation. It can run e.g. benchmarks. 
store: an optional instance of torch.distributed.Store, to be used by From 6d81e3df5cf4046260fd2f5e915d37c02be1dca1 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sat, 7 Feb 2026 10:23:12 -0600 Subject: [PATCH 084/231] ci: Add secrets detector (#3180) Signed-off-by: Charlie Truong --- .github/workflows/config/.secrets.baseline | 152 +++++++++++++++++++++ .github/workflows/detect-secrets.yml | 14 ++ 2 files changed, 166 insertions(+) create mode 100644 .github/workflows/config/.secrets.baseline create mode 100644 .github/workflows/detect-secrets.yml diff --git a/.github/workflows/config/.secrets.baseline b/.github/workflows/config/.secrets.baseline new file mode 100644 index 00000000000..faa6f5e60ff --- /dev/null +++ b/.github/workflows/config/.secrets.baseline @@ -0,0 +1,152 @@ +{ + "version": "1.5.0", + "plugins_used": [ + { + "name": "ArtifactoryDetector" + }, + { + "name": "AWSKeyDetector" + }, + { + "name": "AzureStorageKeyDetector" + }, + { + "name": "Base64HighEntropyString", + "limit": 4.5 + }, + { + "name": "BasicAuthDetector" + }, + { + "name": "CloudantDetector" + }, + { + "name": "DiscordBotTokenDetector" + }, + { + "name": "GitHubTokenDetector" + }, + { + "name": "GitLabTokenDetector" + }, + { + "name": "HexHighEntropyString", + "limit": 3.0 + }, + { + "name": "IbmCloudIamDetector" + }, + { + "name": "IbmCosHmacDetector" + }, + { + "name": "IPPublicDetector" + }, + { + "name": "JwtTokenDetector" + }, + { + "name": "KeywordDetector", + "keyword_exclude": "" + }, + { + "name": "MailchimpDetector" + }, + { + "name": "NpmDetector" + }, + { + "name": "OpenAIDetector" + }, + { + "name": "PrivateKeyDetector" + }, + { + "name": "PypiTokenDetector" + }, + { + "name": "SendGridDetector" + }, + { + "name": "SlackDetector" + }, + { + "name": "SoftlayerDetector" + }, + { + "name": "SquareOAuthDetector" + }, + { + "name": "StripeDetector" + }, + { + "name": "TelegramBotTokenDetector" + }, + { + "name": "TwilioKeyDetector" + } + ], + "filters_used": [ + { + 
"path": "detect_secrets.filters.allowlist.is_line_allowlisted" + }, + { + "path": "detect_secrets.filters.common.is_ignored_due_to_verification_policies", + "min_level": 2 + }, + { + "path": "detect_secrets.filters.heuristic.is_indirect_reference" + }, + { + "path": "detect_secrets.filters.heuristic.is_likely_id_string" + }, + { + "path": "detect_secrets.filters.heuristic.is_lock_file" + }, + { + "path": "detect_secrets.filters.heuristic.is_not_alphanumeric_string" + }, + { + "path": "detect_secrets.filters.heuristic.is_potential_uuid" + }, + { + "path": "detect_secrets.filters.heuristic.is_prefixed_with_dollar_sign" + }, + { + "path": "detect_secrets.filters.heuristic.is_sequential_string" + }, + { + "path": "detect_secrets.filters.heuristic.is_swagger_file" + }, + { + "path": "detect_secrets.filters.heuristic.is_templated_secret" + }, + { + "path": "detect_secrets.filters.regex.should_exclude_file", + "pattern": [ + "pyproject\\.toml|\\.github/workflows/config/\\.secrets\\.baseline|tests/test_utils/recipes/*" + ] + } + ], + "results": { + ".github/workflows/check_api_backwards_compatibility_workflow.yml": [ + { + "type": "Hex High Entropy String", + "filename": ".github/workflows/check_api_backwards_compatibility_workflow.yml", + "hashed_secret": "79813b04048fca379816a7a51848a9d42fe25322", + "is_verified": false, + "line_number": 96 + } + ], + ".github/workflows/cicd-main.yml": [ + { + "type": "Secret Keyword", + "filename": ".github/workflows/cicd-main.yml", + "hashed_secret": "0de7d8c7d76191fdcb236d3c62be9adf20424ca2", + "is_verified": false, + "line_number": 323 + } + ] + }, + "generated_at": "2026-01-30T22:40:44Z" +} diff --git a/.github/workflows/detect-secrets.yml b/.github/workflows/detect-secrets.yml new file mode 100644 index 00000000000..39e05cdad18 --- /dev/null +++ b/.github/workflows/detect-secrets.yml @@ -0,0 +1,14 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ +name: Secrets detector + +on: + pull_request: + +jobs: + # If necessary, rebuild the secrets baseline: + # detect-secrets scan --exclude-files \ + # 'pyproject\.toml|\.github/workflows/config/\.secrets\.baseline|tests/test_utils/recipes/*' > \ + # .github/workflows/config/.secrets.baseline + secrets-detector: + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_secrets-detector.yml@v0.70.0 From a3ec4b02ede582fb3af23110f3783de081744ae0 Mon Sep 17 00:00:00 2001 From: Zhang Haitao Date: Sun, 8 Feb 2026 00:23:20 +0800 Subject: [PATCH 085/231] Param offset in _ParamAndGradBucket should be aligned (#3007) Signed-off-by: skydoorkai Co-authored-by: lit Co-authored-by: Eric Harper Co-authored-by: Philip Petrakian --- .../core/distributed/param_and_grad_buffer.py | 10 ++-- .../distributed/test_param_and_grad_buffer.py | 53 +++++++++++++++++++ 2 files changed, 60 insertions(+), 3 deletions(-) diff --git a/megatron/core/distributed/param_and_grad_buffer.py b/megatron/core/distributed/param_and_grad_buffer.py index b192f182d9b..088374fbf13 100644 --- a/megatron/core/distributed/param_and_grad_buffer.py +++ b/megatron/core/distributed/param_and_grad_buffer.py @@ -79,6 +79,8 @@ class _ParamAndGradBucket: communication. Its application is twofold: it facilitates the averaging of gradients and the scaling of gradients in the context of the Mixture of Experts (MoE) model. bucket_id: Index of bucket in buffer. + param_index_map: Mapping from param to (start, end, bucket_id) in the global buffer. + Used to derive bucket-local offsets for param_to_index. 
""" def __init__( @@ -90,6 +92,7 @@ def __init__( numel_unpadded: int, gradient_scaling_factor: float, bucket_id: int, + param_index_map: Dict[torch.nn.Parameter, tuple], ): self.params_list = params self.params = set(params) @@ -103,11 +106,11 @@ def __init__( self.numel_unpadded = numel_unpadded self.gradient_scaling_factor = gradient_scaling_factor self.bucket_id = bucket_id + # Derive bucket-local param offsets from the global param_index_map. self.param_to_index = {} - offset = 0 for param in params: - self.param_to_index[param] = (offset, offset + param.numel()) - offset += param.numel() + global_start, global_end, _ = param_index_map[param] + self.param_to_index[param] = (global_start - offset, global_end - offset) class _ParamAndGradBucketGroup: @@ -948,6 +951,7 @@ def _new_bucket( numel_unpadded=numel_unpadded, gradient_scaling_factor=self.gradient_scaling_factor, bucket_id=bucket_id, + param_index_map=self.param_index_map, ) for bucket_param in bucket_params: assert bucket_param not in self.param_to_bucket diff --git a/tests/unit_tests/distributed/test_param_and_grad_buffer.py b/tests/unit_tests/distributed/test_param_and_grad_buffer.py index 609b2cc5a71..b60dfb1791b 100644 --- a/tests/unit_tests/distributed/test_param_and_grad_buffer.py +++ b/tests/unit_tests/distributed/test_param_and_grad_buffer.py @@ -163,6 +163,59 @@ def _pad_param_if_needed(numel_unpadded): Utils.destroy_model_parallel() +def test_param_to_index_alignment_with_padding(): + """Ensure bucket-local param offsets honor padding when DistOpt pads params.""" + Utils.initialize_model_parallel() + + # With input_dim=4, output_dim=4: + # - weight: 4*4 = 16 elements + # - bias: 4 elements + # Since 16 % 64 != 0, the bias must be padded away from the weight, + # making padding observable. 
+ input_dim = 4 + output_dim = 4 + model, param_and_grad_buffer, _ = get_model_and_buffers( + input_dim=input_dim, + output_dim=output_dim, + num_layers=1, + bias=True, + shared_embedding=False, + bucket_size=None, # single bucket + use_distributed_optimizer=True, # enforces 64-element alignment + overlap_grad_reduce=True, + average_in_collective=False, + ) + + bucket = param_and_grad_buffer.buckets[0] + naive_offset = 0 + padding_observed = False + + for param in bucket.params_list: + global_start, global_end, _ = param_and_grad_buffer.param_index_map[param] + expected_local_start = global_start - bucket.offset + expected_local_end = global_end - bucket.offset + local_start, local_end = bucket.param_to_index[param] + + # param_to_index should match the padded offsets used in the global buffer. + assert (local_start, local_end) == (expected_local_start, expected_local_end) + + # At least one param should have been padded relative to naive packing. + if local_start != naive_offset: + padding_observed = True + naive_offset = local_end + + # Verify the slice retrieved via param_to_index matches param.data view. + param_slice = bucket.param_data.view(-1)[local_start:local_end] + torch.testing.assert_close(param_slice, param.data.view(-1)) + + assert padding_observed, ( + "Expected padding to be applied between params. " + "Ensure model dimensions are chosen such that param sizes are not multiples of 64." 
+ ) + + Utils.destroy_model_parallel() + + @pytest.mark.parametrize("use_distributed_optimizer", [False, True]) @pytest.mark.parametrize("overlap_grad_reduce", [False, True]) @pytest.mark.parametrize("average_in_collective", [False, True]) From 916301a51f40d84060f42e49a291824b18b2af16 Mon Sep 17 00:00:00 2001 From: yeyu-nvidia Date: Sun, 8 Feb 2026 21:11:44 -0800 Subject: [PATCH 086/231] updates to support modelopt EAGLE training with CP (#3147) Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> --- examples/post_training/modelopt/finetune.py | 2 +- examples/post_training/modelopt/finetune.sh | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/examples/post_training/modelopt/finetune.py b/examples/post_training/modelopt/finetune.py index 19ece4ef299..20777ba14c8 100755 --- a/examples/post_training/modelopt/finetune.py +++ b/examples/post_training/modelopt/finetune.py @@ -444,7 +444,7 @@ def get_batch(data_iterator): def non_loss_data_func(model: GPTModel): """Callback to compute the acceptance length.""" args = get_args() - if not args.export_offline_model: + if not args.export_offline_model and args.context_parallel_size == 1: try: report_draft_acceptance_length(model) except Exception as e: diff --git a/examples/post_training/modelopt/finetune.sh b/examples/post_training/modelopt/finetune.sh index e7ba0f022dc..1affc3c04da 100755 --- a/examples/post_training/modelopt/finetune.sh +++ b/examples/post_training/modelopt/finetune.sh @@ -38,6 +38,12 @@ if [ -z ${MLM_DATA_ARGS} ]; then " fi +if [[ -v CP && "$CP" != "1" ]]; then + BACKEND="fused" +else + BACKEND="auto" +fi + if [ -z ${MLM_TRAIN_ARGS} ]; then MLM_TRAIN_ARGS=" \ --no-gradient-accumulation-fusion \ @@ -48,6 +54,7 @@ if [ -z ${MLM_TRAIN_ARGS} ]; then --attention-dropout 0.0 \ --hidden-dropout 0.0 \ --no-check-for-nan-in-loss-and-grad \ + --attention-backend ${BACKEND} \ " fi @@ -83,6 +90,7 @@ ${LAUNCH_SCRIPT} ${SCRIPT_DIR}/finetune.py \ 
--expert-model-parallel-size ${EP} \ --pipeline-model-parallel-size ${PP} \ --context-parallel-size ${CP} \ + --cp-comm-type p2p \ --tokenizer-model ${TOKENIZER_MODEL} \ --load ${MLM_MODEL_CKPT} \ --save ${MLM_MODEL_SAVE} \ From 6103cb502dad3de2c2a00a3cef38463df6dd75c5 Mon Sep 17 00:00:00 2001 From: Nick Schank Date: Mon, 9 Feb 2026 00:13:06 -0500 Subject: [PATCH 087/231] Ensure type-checker understands use of Submodules in llava_model (#3257) Co-authored-by: Charlie Truong --- megatron/core/models/multimodal/llava_model.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/megatron/core/models/multimodal/llava_model.py b/megatron/core/models/multimodal/llava_model.py index af0bcf6e9fd..d3e5d5e26f8 100644 --- a/megatron/core/models/multimodal/llava_model.py +++ b/megatron/core/models/multimodal/llava_model.py @@ -17,8 +17,10 @@ from megatron.core.packed_seq_params import PackedSeqParams from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.transformer import MegatronModule +from megatron.core.transformer.attention import SelfAttentionSubmodules from megatron.core.transformer.spec_utils import ModuleSpec from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.transformer_layer import TransformerLayerSubmodules from megatron.core.utils import deprecate_inference_params, log_single_rank try: @@ -158,9 +160,18 @@ def __init__( self.context_parallel_lm = language_transformer_config.context_parallel_size if self.sequence_parallel_lm or self.context_parallel_lm > 1: if not language_model_type.startswith('nemotron5-hybrid'): - attn_module = language_transformer_layer_spec.submodules.self_attention + assert isinstance( + language_transformer_layer_spec.submodules, TransformerLayerSubmodules + ) + assert isinstance( + language_transformer_layer_spec.submodules.self_attention.submodules, + SelfAttentionSubmodules, + ) + attn_submodules = ( + 
language_transformer_layer_spec.submodules.self_attention.submodules + ) assert ( - attn_module.submodules.core_attention == TEDotProductAttention and HAVE_TE + attn_submodules.core_attention == TEDotProductAttention and HAVE_TE ), "Sequence/Context Parallelism is supported only with TE DotProductAttention." if self.context_parallel_lm > 1: self.cp_group = self.pg_collection.cp From 4ff7686c144fcd3f6ac130e5a97f86a0b8aa4dd5 Mon Sep 17 00:00:00 2001 From: Jianbin Chang Date: Mon, 9 Feb 2026 14:21:28 +0800 Subject: [PATCH 088/231] M-FSDP: Remove redundant stream waits in HSDP to prevent CG fail (#2941) Co-authored-by: Xin Yao --- .../fsdp/src/megatron_fsdp/param_and_grad_buffer.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py index a0133912069..529ad40d00d 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py @@ -3163,9 +3163,6 @@ def wait_for_previous_grad_reduce( grad_reduce_event.wait() free_up_grad_bucket() - if suggested_queue_size == 0 and self.outer_fsdp_group_grad_reduce: - torch.cuda.current_stream().wait_stream(self.outer_fsdp_group_grad_reduce_stream) - def _enforce_double_buffer_limit(self, add_buckets): if not self.buffer.ddp_config.fsdp_double_buffer: return From 32570939300c4377060bc8f13c90275ea7e649fe Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Mon, 9 Feb 2026 08:25:22 +0200 Subject: [PATCH 089/231] fully remove legacy tokenizer system (#2946) Signed-off-by: dimapihtar Co-authored-by: Xin Yao --- docs/user-guide/features/tokenizers.md | 12 +- .../inference/gpt/gpt_dynamic_inference.py | 9 +- .../inference/gpt/gpt_static_inference.py | 14 +- .../inference/t5/simple_t5_batch_inference.py | 17 +- 
.../mimo/data/energon_avlm_task_encoder.py | 3 +- .../mimo/data/energon_vlm_task_encoder.py | 3 +- examples/run_simple_mcore_train_loop.py | 7 +- megatron/core/datasets/megatron_tokenizer.py | 166 ---- .../core/tokenizers/megatron_tokenizer.py | 73 +- .../tokenizers/text/libraries/__init__.py | 1 + .../text/libraries/sentencepiece_tokenizer.py | 4 + .../text/libraries}/sft_tokenizer.py | 117 ++- .../text/libraries/tiktoken_tokenizer.py | 30 +- .../core/tokenizers/text/text_tokenizer.py | 47 +- .../{text => }/utils/build_tokenizer.py | 53 +- megatron/core/tokenizers/vision/__init__.py | 3 + .../tokenizers/vision/libraries/__init__.py | 8 + .../vision/libraries}/multimodal_tokenizer.py | 115 ++- .../libraries/null_multimodal_tokenizer.py | 90 ++ .../core/tokenizers/vision/models/__init__.py | 3 + .../vision/models/default_tokenizer.py | 12 + .../tokenizers/vision/vision_tokenizer.py | 142 +++ megatron/inference/utils.py | 7 +- megatron/rl/rl_utils.py | 16 +- megatron/training/arguments.py | 3 - megatron/training/global_vars.py | 2 +- megatron/training/tokenizer/__init__.py | 4 - .../training/tokenizer/bert_tokenization.py | 431 -------- .../training/tokenizer/gpt2_tokenization.py | 324 ------ megatron/training/tokenizer/tokenizer.py | 923 ------------------ pretrain_bert.py | 9 +- pretrain_gpt.py | 8 +- pretrain_mamba.py | 8 +- pretrain_t5.py | 10 +- .../bert_mcore_tp1_pp4_vp2/model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 10 + .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../t5/t5_11b_mcore_tp4_pp1/model_config.yaml | 1 - tests/unit_tests/data/test_builder.py | 2 +- tests/unit_tests/data/test_fim_dataset.py | 1 + tests/unit_tests/data/test_gpt_dataset.py | 6 +- .../data/test_multimodal_dataset.py | 7 +- tests/unit_tests/data/test_preprocess_data.py | 9 +- .../export/trtllm/test_distributed_fp8.py | 9 +- .../export/trtllm/test_single_device_fp8.py | 9 +- tests/unit_tests/rl/test_rl_utils.py | 1 + 
tests/unit_tests/test_inference.py | 14 +- tests/unit_tests/test_tokenizer.py | 278 ------ tests/unit_tests/test_training.py | 16 +- tests/unit_tests/tokenizers/test_tokenizer.py | 100 +- tools/checkpoint/saver_base.py | 4 +- tools/checkpoint/saver_legacy.py | 4 +- tools/preprocess_data.py | 21 +- tools/preprocess_data_nmt.py | 2 +- tools/preprocess_mmdata.py | 14 +- 57 files changed, 799 insertions(+), 2387 deletions(-) delete mode 100644 megatron/core/datasets/megatron_tokenizer.py rename megatron/{training/tokenizer => core/tokenizers/text/libraries}/sft_tokenizer.py (73%) rename megatron/core/tokenizers/{text => }/utils/build_tokenizer.py (60%) create mode 100644 megatron/core/tokenizers/vision/__init__.py create mode 100644 megatron/core/tokenizers/vision/libraries/__init__.py rename megatron/{training/tokenizer => core/tokenizers/vision/libraries}/multimodal_tokenizer.py (86%) create mode 100644 megatron/core/tokenizers/vision/libraries/null_multimodal_tokenizer.py create mode 100644 megatron/core/tokenizers/vision/models/__init__.py create mode 100644 megatron/core/tokenizers/vision/models/default_tokenizer.py create mode 100644 megatron/core/tokenizers/vision/vision_tokenizer.py delete mode 100644 megatron/training/tokenizer/__init__.py delete mode 100644 megatron/training/tokenizer/bert_tokenization.py delete mode 100644 megatron/training/tokenizer/gpt2_tokenization.py delete mode 100644 megatron/training/tokenizer/tokenizer.py delete mode 100644 tests/unit_tests/test_tokenizer.py diff --git a/docs/user-guide/features/tokenizers.md b/docs/user-guide/features/tokenizers.md index 0aecf8df8a7..77b165f5269 100644 --- a/docs/user-guide/features/tokenizers.md +++ b/docs/user-guide/features/tokenizers.md @@ -141,7 +141,7 @@ Use a null tokenizer for testing or non-text models: ```python tokenizer = MegatronTokenizer.from_pretrained( - metadata_path={"library": "null"}, + metadata_path={"library": "null-text"}, vocab_size=131072, ) ``` @@ -173,16 +173,6 @@ torchrun 
--nproc_per_node=8 pretrain_gpt.py \ If `--tokenizer-metadata` is not specified, a default metadata file is generated automatically based on the tokenizer type. -### Legacy Tokenizer Support - -The old tokenizer system is still supported for backward compatibility: - -```bash -torchrun --nproc_per_node=8 pretrain_gpt.py \ - --legacy-tokenizer \ - ... -``` - ## Supported Tokenizer Libraries | Library | Description | Use Case | diff --git a/examples/inference/gpt/gpt_dynamic_inference.py b/examples/inference/gpt/gpt_dynamic_inference.py index 7fcac70c11a..122a956d986 100644 --- a/examples/inference/gpt/gpt_dynamic_inference.py +++ b/examples/inference/gpt/gpt_dynamic_inference.py @@ -34,7 +34,7 @@ from megatron.core.inference.text_generation_controllers.text_generation_controller import ( TextGenerationController, ) -from megatron.core.tokenizers.text.utils.build_tokenizer import build_tokenizer +from megatron.core.tokenizers.utils.build_tokenizer import build_tokenizer from megatron.inference.utils import ( add_inference_args, get_inference_config_from_model_and_args, @@ -257,10 +257,9 @@ def main(): configure_nvtx_profiling(True) args = get_args() - if args.legacy_tokenizer: - tokenizer = get_tokenizer() - else: - tokenizer = build_tokenizer(args) + + # Build tokenizer + tokenizer = build_tokenizer(args) # Reset peak memory stats so functional tests measure this run and not # whatever happened earlier during initialization. 
diff --git a/examples/inference/gpt/gpt_static_inference.py b/examples/inference/gpt/gpt_static_inference.py index 298ebfebd86..17cf7c53b05 100644 --- a/examples/inference/gpt/gpt_static_inference.py +++ b/examples/inference/gpt/gpt_static_inference.py @@ -17,7 +17,7 @@ from megatron.core.inference.text_generation_controllers.text_generation_controller import ( TextGenerationController, ) -from megatron.core.tokenizers.text.utils.build_tokenizer import build_tokenizer +from megatron.core.tokenizers.utils.build_tokenizer import build_tokenizer from megatron.core.transformer.module import MegatronModule sys.path.append( @@ -64,10 +64,7 @@ def get_inference_engine(args: Namespace, model: MegatronModule) -> StaticInfere Returns: AbstractBackend: The chosen backend """ - if args.legacy_tokenizer: - tokenizer = get_tokenizer() - else: - tokenizer = build_tokenizer(args) + tokenizer = build_tokenizer(args) inference_context = StaticInferenceContext( args.inference_max_requests, args.inference_max_seq_length ) @@ -149,10 +146,9 @@ def main(): top_n_logprobs=args.top_n_logprobs, ) - if args.legacy_tokenizer: - tokenizer = get_tokenizer() - else: - tokenizer = build_tokenizer(args) + # Build tokenizer + tokenizer = build_tokenizer(args) + requests = build_requests(args, tokenizer) prompts = [r.prompt_text for r in requests] diff --git a/examples/inference/t5/simple_t5_batch_inference.py b/examples/inference/t5/simple_t5_batch_inference.py index 4b15952e07f..3ac8b5dcf86 100644 --- a/examples/inference/t5/simple_t5_batch_inference.py +++ b/examples/inference/t5/simple_t5_batch_inference.py @@ -1,3 +1,5 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. 
+ import os import sys from argparse import Namespace @@ -17,7 +19,7 @@ from megatron.core.inference.text_generation_controllers.encoder_decoder_text_generation_controller import ( EncoderDecoderTextGenerationController, ) -from megatron.core.tokenizers.text.utils.build_tokenizer import build_tokenizer +from megatron.core.tokenizers.utils.build_tokenizer import build_tokenizer from megatron.core.transformer.module import MegatronModule from pretrain_t5 import model_provider @@ -77,10 +79,8 @@ def get_inference_engine(args: Namespace, model: MegatronModule) -> AbstractEngi Returns: AbstractBackend: The chosen backend """ - if args.legacy_tokenizer: - tokenizer = get_tokenizer() - else: - tokenizer = build_tokenizer(args) + # Build tokenizer + tokenizer = build_tokenizer(args) inference_wrapper_config = InferenceWrapperConfig( hidden_size=args.hidden_size, @@ -131,10 +131,9 @@ def main(): num_tokens_to_generate=args.num_tokens_to_generate, ) - if args.legacy_tokenizer: - tokenizer = get_tokenizer() - else: - tokenizer = build_tokenizer(args) + # Build tokenizer + tokenizer = build_tokenizer(args) + decoder_prompts = [""] * len( args.encoder_prompts ) # for T5, the prompt is provided as encoder input, hence decoder_prompts is empty diff --git a/examples/mimo/data/energon_avlm_task_encoder.py b/examples/mimo/data/energon_avlm_task_encoder.py index 32afb1b2cfb..a6a86761720 100644 --- a/examples/mimo/data/energon_avlm_task_encoder.py +++ b/examples/mimo/data/energon_avlm_task_encoder.py @@ -1,3 +1,5 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + import argparse import logging import os @@ -39,7 +41,6 @@ ) from megatron.energon.task_encoder.base import stateless from megatron.training import get_args -from megatron.training.tokenizer.multimodal_tokenizer import mistral_custom_template IMAGE_TOKEN = "" AUDIO_TOKEN = "