From 751089126dc42e21dbcf00f1267249e4e5b5a6b1 Mon Sep 17 00:00:00 2001 From: DanielSun11 <1395924413@qq.com> Date: Sat, 30 May 2026 13:35:22 +0800 Subject: [PATCH 1/2] Support use_magic_weight_init --- .../transformer/transformer_config.py | 25 +++- src/paddlefleet/utils.py | 14 ++ .../test_transformer_config.py | 121 ++++++++++++++++++ 3 files changed, 156 insertions(+), 4 deletions(-) diff --git a/src/paddlefleet/transformer/transformer_config.py b/src/paddlefleet/transformer/transformer_config.py index aae895082..7c7d5f805 100644 --- a/src/paddlefleet/transformer/transformer_config.py +++ b/src/paddlefleet/transformer/transformer_config.py @@ -18,13 +18,18 @@ from __future__ import annotations import functools +import math from dataclasses import dataclass from typing import TYPE_CHECKING, Literal import paddle.nn.functional as F from ..model_parallel_config import ModelParallelConfig -from ..utils import init_method_normal, scaled_init_method_normal +from ..utils import ( + erniecore_init_method_normal, + init_method_normal, + scaled_init_method_normal, +) if TYPE_CHECKING: from collections.abc import Callable @@ -757,6 +762,9 @@ class TransformerConfig(ModelParallelConfig): routing_map_fusion: bool = False """If True, use Triton fused routing map kernel for MoE routing.""" + use_magic_weight_init: bool = False + """Use the same parameter initialization method as ernie-core, with aligned distribution and variance.""" + # Field name mapping rules: HuggingFace config.json name -> TransformerConfig name transform_rules = { # DSA field mapping @@ -867,7 +875,11 @@ def __post_init__(self): # init_method is not None self.embedding_init_method = self.init_method - if self.init_method is None: + if self.use_magic_weight_init: + sigma = math.sqrt(0.3333 / self.hidden_size) + self.init_method = erniecore_init_method_normal(sigma) + self.init_method_std = sigma + elif self.init_method is None: self.init_method = init_method_normal(self.init_method_std) if ( @@ -924,7 
+936,9 @@ def __post_init__(self): "recompute_granularity must be one of full and selective" ) - if self.output_layer_init_method is None: + if self.use_magic_weight_init: + self.output_layer_init_method = self.init_method + elif self.output_layer_init_method is None: self.output_layer_init_method = scaled_init_method_normal( self.init_method_std, self.num_hidden_layers, @@ -936,7 +950,10 @@ def __post_init__(self): # By default, use the same init std as you use for every other non-output layer. self.embedding_init_method_std = self.init_method_std - if self.embedding_init_method is None: + if self.use_magic_weight_init: + self.embedding_init_method = self.init_method + self.embedding_init_method_std = self.init_method_std + elif self.embedding_init_method is None: if self.init_method is None or ( self.embedding_init_method_std != self.init_method_std ): diff --git a/src/paddlefleet/utils.py b/src/paddlefleet/utils.py index a41def207..44dc1da8f 100644 --- a/src/paddlefleet/utils.py +++ b/src/paddlefleet/utils.py @@ -143,6 +143,20 @@ def scaled_init_method_normal(sigma, num_layers, multiplier=2.0): return functools.partial(paddle.nn.init.normal_, mean=0.0, std=std) +def erniecore_init_method_normal(sigma): + """Init method aligned with ernie-core: randn(...).scale(sigma) under fp32 default dtype guard.""" + + def init_method(weight): + dtype = paddle.get_default_dtype() + paddle.set_default_dtype("float32") + weight.set_value( + paddle.randn(weight.shape, dtype=weight.dtype).scale(sigma) + ) + paddle.set_default_dtype(dtype) + + return init_method + + def get_pg_size(group=None): """Get world size for a distributed group. 
diff --git a/tests/single_card_tests/test_transformer_config.py b/tests/single_card_tests/test_transformer_config.py index c47b87285..8529a5127 100644 --- a/tests/single_card_tests/test_transformer_config.py +++ b/tests/single_card_tests/test_transformer_config.py @@ -189,5 +189,126 @@ def test_hybridep_dispatcher_type_is_preserved(self): self.assertTrue(config.moe_use_fusion_node) +class TestUseMagicWeightInit(unittest.TestCase): + """Tests for the use_magic_weight_init functionality in TransformerConfig.""" + + def test_use_magic_weight_init_false_default_behavior(self): + """When use_magic_weight_init is False (default), normal init methods should be used.""" + config = TransformerConfig( + num_hidden_layers=12, + hidden_size=768, + use_magic_weight_init=False, + ) + # When False, init_method should be set but not the magic init + self.assertIsNotNone(config.init_method) + self.assertIsNotNone(config.output_layer_init_method) + + def test_use_magic_weight_init_true_sigma_calculation(self): + """When use_magic_weight_init is True, sigma should be sqrt(0.3333 / hidden_size).""" + import math + + hidden_size = 768 + config = TransformerConfig( + num_hidden_layers=12, + hidden_size=hidden_size, + use_magic_weight_init=True, + ) + expected_sigma = math.sqrt(0.3333 / hidden_size) + self.assertAlmostEqual(config.init_method_std, expected_sigma, places=6) + + def test_use_magic_weight_init_true_all_methods_same(self): + """When use_magic_weight_init is True, all init methods should be the same.""" + config = TransformerConfig( + num_hidden_layers=12, + hidden_size=768, + use_magic_weight_init=True, + ) + # All init methods should be the same function + self.assertIs(config.init_method, config.output_layer_init_method) + self.assertIs(config.init_method, config.embedding_init_method) + + def test_use_magic_weight_init_true_different_hidden_sizes(self): + """Test sigma calculation with different hidden sizes.""" + import math + + for hidden_size in [512, 768, 1024, 2048, 
4096]: + config = TransformerConfig( + num_hidden_layers=12, + hidden_size=hidden_size, + use_magic_weight_init=True, + ) + expected_sigma = math.sqrt(0.3333 / hidden_size) + self.assertAlmostEqual( + config.init_method_std, expected_sigma, places=6 + ) + + def test_use_magic_weight_init_true_init_method_matches_erniecore(self): + """When use_magic_weight_init is True, init method should match erniecore_init_method_normal.""" + import math + + from paddlefleet.utils import erniecore_init_method_normal + + hidden_size = 768 + config = TransformerConfig( + num_hidden_layers=12, + hidden_size=hidden_size, + use_magic_weight_init=True, + ) + + # Create test weight + weight = paddle.randn([100, 100]) + + # Apply config's init method + config.init_method(weight) + + # Calculate expected using erniecore_init_method_normal + expected_sigma = math.sqrt(0.3333 / hidden_size) + erniecore_init = erniecore_init_method_normal(expected_sigma) + expected_weight = paddle.randn([100, 100]) + erniecore_init(expected_weight) + + # Compare results using same random seed + paddle.seed(1234) + weight1 = paddle.randn([100, 100]) + config.init_method(weight1) + + paddle.seed(1234) + weight2 = paddle.randn([100, 100]) + erniecore_init(weight2) + + paddle.testing.assert_close(weight1, weight2, rtol=1e-6, atol=1e-6) + + def test_use_magic_weight_init_false_uses_normal_init(self): + """When use_magic_weight_init is False, normal init methods should be used.""" + config = TransformerConfig( + num_hidden_layers=12, + hidden_size=768, + use_magic_weight_init=False, + ) + # Should have init_method_std set to normal value + self.assertIsNotNone(config.init_method_std) + # Should be a reasonable value for normal init (not the magic init value) + import math + + magic_sigma = math.sqrt(0.3333 / 768) + self.assertNotAlmostEqual(config.init_method_std, magic_sigma, places=6) + + def test_use_magic_weight_init_true_with_moe(self): + """Test use_magic_weight_init works correctly with MoE models.""" + 
import math + + config = TransformerConfig( + num_hidden_layers=12, + hidden_size=768, + n_routed_experts=8, + use_magic_weight_init=True, + ) + expected_sigma = math.sqrt(0.3333 / 768) + self.assertAlmostEqual(config.init_method_std, expected_sigma, places=6) + # All init methods should still be the same + self.assertIs(config.init_method, config.output_layer_init_method) + self.assertIs(config.init_method, config.embedding_init_method) + + if __name__ == "__main__": unittest.main() From 2924574c6cf796ad2b15a43508ef81880a37b00d Mon Sep 17 00:00:00 2001 From: DanielSun11 <1395924413@qq.com> Date: Sat, 30 May 2026 23:04:48 +0800 Subject: [PATCH 2/2] refine --- .../transformer/transformer_config.py | 18 +++-- src/paddlefleet/utils.py | 7 +- .../test_transformer_config.py | 66 +++++++++++-------- 3 files changed, 52 insertions(+), 39 deletions(-) diff --git a/src/paddlefleet/transformer/transformer_config.py b/src/paddlefleet/transformer/transformer_config.py index 7c7d5f805..6a1cca780 100644 --- a/src/paddlefleet/transformer/transformer_config.py +++ b/src/paddlefleet/transformer/transformer_config.py @@ -26,7 +26,7 @@ from ..model_parallel_config import ModelParallelConfig from ..utils import ( - erniecore_init_method_normal, + get_magic_init_method, init_method_normal, scaled_init_method_normal, ) @@ -762,8 +762,8 @@ class TransformerConfig(ModelParallelConfig): routing_map_fusion: bool = False """If True, use Triton fused routing map kernel for MoE routing.""" - use_magic_weight_init: bool = False - """Use the same parameter initialization method as ernie-core, with aligned distribution and variance.""" + magic_init: bool = False + """If True, set init_method, output_layer_init_method and embedding_init_method to the magic initializer with std = sqrt(0.3333 / hidden_size), overriding any user-supplied values.""" # Field name mapping rules: HuggingFace config.json name -> TransformerConfig name transform_rules = { @@ -875,9 +875,13 @@ def __post_init__(self): # init_method is not None self.embedding_init_method = self.init_method - if self.use_magic_weight_init: + if self.magic_init: + if 
self.hidden_size == 0: + raise ValueError( + "hidden_size must be non-zero when magic_init is True." + ) sigma = math.sqrt(0.3333 / self.hidden_size) - self.init_method = erniecore_init_method_normal(sigma) + self.init_method = get_magic_init_method(sigma) self.init_method_std = sigma elif self.init_method is None: self.init_method = init_method_normal(self.init_method_std) @@ -936,7 +940,7 @@ def __post_init__(self): "recompute_granularity must be one of full and selective" ) - if self.use_magic_weight_init: + if self.magic_init: self.output_layer_init_method = self.init_method elif self.output_layer_init_method is None: self.output_layer_init_method = scaled_init_method_normal( @@ -950,7 +954,7 @@ def __post_init__(self): # By default, use the same init std as you use for every other non-output layer. self.embedding_init_method_std = self.init_method_std - if self.use_magic_weight_init: + if self.magic_init: self.embedding_init_method = self.init_method self.embedding_init_method_std = self.init_method_std elif self.embedding_init_method is None: diff --git a/src/paddlefleet/utils.py b/src/paddlefleet/utils.py index 44dc1da8f..378787402 100644 --- a/src/paddlefleet/utils.py +++ b/src/paddlefleet/utils.py @@ -143,16 +143,13 @@ def scaled_init_method_normal(sigma, num_layers, multiplier=2.0): return functools.partial(paddle.nn.init.normal_, mean=0.0, std=std) -def erniecore_init_method_normal(sigma): - """Init method aligned with ernie-core: randn(...).scale(sigma) under fp32 default dtype guard.""" +def get_magic_init_method(sigma): + """Magic init method: sets the weight to randn(weight.shape, dtype=weight.dtype).scale(sigma).""" def init_method(weight): - dtype = paddle.get_default_dtype() - paddle.set_default_dtype("float32") weight.set_value( paddle.randn(weight.shape, dtype=weight.dtype).scale(sigma) ) - paddle.set_default_dtype(dtype) return init_method diff --git a/tests/single_card_tests/test_transformer_config.py b/tests/single_card_tests/test_transformer_config.py index 
8529a5127..11a66a368 100644 --- a/tests/single_card_tests/test_transformer_config.py +++ b/tests/single_card_tests/test_transformer_config.py @@ -189,45 +189,45 @@ def test_hybridep_dispatcher_type_is_preserved(self): self.assertTrue(config.moe_use_fusion_node) -class TestUseMagicWeightInit(unittest.TestCase): - """Tests for the use_magic_weight_init functionality in TransformerConfig.""" +class TestMagicInit(unittest.TestCase): + """Tests for the magic_init functionality in TransformerConfig.""" - def test_use_magic_weight_init_false_default_behavior(self): - """When use_magic_weight_init is False (default), normal init methods should be used.""" + def test_magic_init_false_default_behavior(self): + """When magic_init is False (default), normal init methods should be used.""" config = TransformerConfig( num_hidden_layers=12, hidden_size=768, - use_magic_weight_init=False, + magic_init=False, ) # When False, init_method should be set but not the magic init self.assertIsNotNone(config.init_method) self.assertIsNotNone(config.output_layer_init_method) - def test_use_magic_weight_init_true_sigma_calculation(self): - """When use_magic_weight_init is True, sigma should be sqrt(0.3333 / hidden_size).""" + def test_magic_init_true_sigma_calculation(self): + """When magic_init is True, sigma should be sqrt(0.3333 / hidden_size).""" import math hidden_size = 768 config = TransformerConfig( num_hidden_layers=12, hidden_size=hidden_size, - use_magic_weight_init=True, + magic_init=True, ) expected_sigma = math.sqrt(0.3333 / hidden_size) self.assertAlmostEqual(config.init_method_std, expected_sigma, places=6) - def test_use_magic_weight_init_true_all_methods_same(self): - """When use_magic_weight_init is True, all init methods should be the same.""" + def test_magic_init_true_all_methods_same(self): + """When magic_init is True, all init methods should be the same.""" config = TransformerConfig( num_hidden_layers=12, hidden_size=768, - use_magic_weight_init=True, + 
magic_init=True, ) # All init methods should be the same function self.assertIs(config.init_method, config.output_layer_init_method) self.assertIs(config.init_method, config.embedding_init_method) - def test_use_magic_weight_init_true_different_hidden_sizes(self): + def test_magic_init_true_different_hidden_sizes(self): """Test sigma calculation with different hidden sizes.""" import math @@ -235,24 +235,24 @@ def test_use_magic_weight_init_true_different_hidden_sizes(self): config = TransformerConfig( num_hidden_layers=12, hidden_size=hidden_size, - use_magic_weight_init=True, + magic_init=True, ) expected_sigma = math.sqrt(0.3333 / hidden_size) self.assertAlmostEqual( config.init_method_std, expected_sigma, places=6 ) - def test_use_magic_weight_init_true_init_method_matches_erniecore(self): - """When use_magic_weight_init is True, init method should match erniecore_init_method_normal.""" + def test_magic_init_true_init_method_matches_get_magic_init_method(self): + """When magic_init is True, init method should match get_magic_init_method.""" import math - from paddlefleet.utils import erniecore_init_method_normal + from paddlefleet.utils import get_magic_init_method hidden_size = 768 config = TransformerConfig( num_hidden_layers=12, hidden_size=hidden_size, - use_magic_weight_init=True, + magic_init=True, ) # Create test weight @@ -261,11 +261,11 @@ def test_use_magic_weight_init_true_init_method_matches_erniecore(self): # Apply config's init method config.init_method(weight) - # Calculate expected using erniecore_init_method_normal + # Calculate expected using get_magic_init_method expected_sigma = math.sqrt(0.3333 / hidden_size) - erniecore_init = erniecore_init_method_normal(expected_sigma) + magic_init = get_magic_init_method(expected_sigma) expected_weight = paddle.randn([100, 100]) - erniecore_init(expected_weight) + magic_init(expected_weight) # Compare results using same random seed paddle.seed(1234) @@ -274,16 +274,16 @@ def 
test_use_magic_weight_init_true_init_method_matches_erniecore(self): paddle.seed(1234) weight2 = paddle.randn([100, 100]) - erniecore_init(weight2) + magic_init(weight2) paddle.testing.assert_close(weight1, weight2, rtol=1e-6, atol=1e-6) - def test_use_magic_weight_init_false_uses_normal_init(self): - """When use_magic_weight_init is False, normal init methods should be used.""" + def test_magic_init_false_uses_normal_init(self): + """When magic_init is False, normal init methods should be used.""" config = TransformerConfig( num_hidden_layers=12, hidden_size=768, - use_magic_weight_init=False, + magic_init=False, ) # Should have init_method_std set to normal value self.assertIsNotNone(config.init_method_std) @@ -293,15 +293,15 @@ def test_use_magic_weight_init_false_uses_normal_init(self): magic_sigma = math.sqrt(0.3333 / 768) self.assertNotAlmostEqual(config.init_method_std, magic_sigma, places=6) - def test_use_magic_weight_init_true_with_moe(self): - """Test use_magic_weight_init works correctly with MoE models.""" + def test_magic_init_true_with_moe(self): + """Test magic_init works correctly with MoE models.""" import math config = TransformerConfig( num_hidden_layers=12, hidden_size=768, n_routed_experts=8, - use_magic_weight_init=True, + magic_init=True, ) expected_sigma = math.sqrt(0.3333 / 768) self.assertAlmostEqual(config.init_method_std, expected_sigma, places=6) @@ -309,6 +309,18 @@ def test_use_magic_weight_init_true_with_moe(self): self.assertIs(config.init_method, config.output_layer_init_method) self.assertIs(config.init_method, config.embedding_init_method) + def test_magic_init_true_raises_on_zero_hidden_size(self): + """When magic_init is True and hidden_size is 0, should raise ValueError.""" + with self.assertRaises( + ValueError, + msg="hidden_size must be non-zero when magic_init is True.", + ): + TransformerConfig( + num_hidden_layers=12, + hidden_size=0, + magic_init=True, + ) + if __name__ == "__main__": unittest.main()