diff --git a/paddlenlp/transformers/deepseek_v2/modeling.py b/paddlenlp/transformers/deepseek_v2/modeling.py index 252bb77fe15e..c50a6f5e0476 100644 --- a/paddlenlp/transformers/deepseek_v2/modeling.py +++ b/paddlenlp/transformers/deepseek_v2/modeling.py @@ -2238,6 +2238,13 @@ def _init_weights(self, layer): if isinstance(layer, MoEGate): kaiming_uniform_(layer.weight, a=math.sqrt(5)) + moe_grad_group = fleet.get_hybrid_communicate_group().expert_grad_comm_group + if moe_grad_group is not None and moe_grad_group.nranks > 1: + for p in layer.parameters(): + if hasattr(p, "color") and "color" in p.color: + if p.color["color"] == "moe_expert": + paddle.distributed.broadcast(p, src=moe_grad_group.ranks[0], group=moe_grad_group) + def step_flex_token(self, cur_step): set_global_step(cur_step)