
Commit 31a383a

Merge branch 'develop' into dev_20241231_add_deepseekv3
2 parents f9abe9c + 1d74d62 commit 31a383a

File tree: 18 files changed (+2846 −30 lines)

llm/auto_parallel/gpt-3/run_pretrain_auto.py

Lines changed: 0 additions & 4 deletions
@@ -91,10 +91,6 @@ class PreTrainingArguments(AutoTrainingArguments):
         default=False,
         metadata={"help": "Weather to run benchmark by autotuner. True for from_scratch and pad_max_length."},
     )
-    use_intermediate_api: bool = field(
-        default=False,
-        metadata={"help": "Weather to use auto_parallel intermediate api"},
-    )
 
     def __post_init__(self):
         super().__post_init__()

llm/auto_parallel/llama/run_pretrain_auto.py

Lines changed: 0 additions & 4 deletions
@@ -100,10 +100,6 @@ class PreTrainingArguments(AutoTrainingArguments):
         default=False,
         metadata={"help": "Weather to run benchmark by autotuner. True for from_scratch and pad_max_length."},
     )
-    use_intermediate_api: bool = field(
-        default=False,
-        metadata={"help": "Weather to use auto_parallel intermediate api"},
-    )
 
     def __post_init__(self):
         super().__post_init__()

llm/auto_parallel/qwen/run_pretrain_3D_auto.py

Lines changed: 0 additions & 4 deletions
@@ -106,10 +106,6 @@ class PreTrainingArguments(AutoTrainingArguments):
         default=False,
         metadata={"help": "whether use lazy init for model parameters"},
     )
-    use_intermediate_api: bool = field(
-        default=False,
-        metadata={"help": "Weather to use auto_parallel intermediate api"},
-    )
 
     def __post_init__(self):
         super().__post_init__()

paddlenlp/trainer/auto_training_args.py

Lines changed: 4 additions & 1 deletion
@@ -47,7 +47,10 @@ class AutoTrainingArguments(TrainingArguments):
             "help": "Enable eliminate_transpose pass, which should replace transpose with reshape when sequence parallel is enabled."
         },
     )
-
+    use_intermediate_api: bool = field(
+        default=False,
+        metadata={"help": "Weather to use auto_parallel intermediate api"},
+    )
     refined_ops_patterns: str = field(default=None, metadata={"help": "The pattern of refined recompute."})
 
     def __post_init__(self):
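Since use_intermediate_api now lives on AutoTrainingArguments, the per-script copies deleted above become redundant: any argument dataclass that subclasses AutoTrainingArguments inherits the flag. A minimal sketch of that relationship (not part of the commit; it only assumes the module path shown in this diff):

from dataclasses import dataclass, fields

from paddlenlp.trainer.auto_training_args import AutoTrainingArguments


@dataclass
class PreTrainingArguments(AutoTrainingArguments):
    """Script-specific arguments; use_intermediate_api is now inherited rather than redeclared."""


# the flag is visible on the subclass even though the script no longer defines it
print("use_intermediate_api" in {f.name for f in fields(PreTrainingArguments)})  # True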

paddlenlp/trainer/unified_checkpoint/load_local.py

Lines changed: 5 additions & 6 deletions
@@ -282,16 +282,15 @@ def load_resolved_archive_file(
                     returned_optim_state_dict[key_name] = state_dict_optim.pop(key)
                     returned_optim_state_dict[key_name].name = key_name
 
-                    # master weight cast (only in remove_master_weight)
-                    if has_master_weights and state_dict_master_weight[model_weight_key].dtype != paddle.float32:
-                        state_dict_master_weight[model_weight_key] = paddle.cast(
-                            state_dict_master_weight[model_weight_key], dtype=paddle.float32
-                        )
-
                 if has_master_weights:
                     for key in list(state_dict_master_weight.keys()):
                         static_name = struct2static_name_mappings[key]
                         returned_optim_state_dict["master_weights"][static_name] = state_dict_master_weight.pop(key)
+                        # master weight cast (only in remove_master_weight)
+                        if returned_optim_state_dict["master_weights"][static_name].dtype != paddle.float32:
+                            returned_optim_state_dict["master_weights"][static_name] = paddle.cast(
+                                returned_optim_state_dict["master_weights"][static_name], dtype=paddle.float32
+                            )
                         returned_optim_state_dict["master_weights"][static_name].name = "_".join([static_name, FP32_MASTER])
 
         return returned_optim_state_dict
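The master-weight cast now runs per entry, right after each weight is moved into returned_optim_state_dict["master_weights"], instead of once against state_dict_master_weight. A standalone sketch of the promotion it performs, using a toy tensor rather than a real checkpoint:

import paddle

# stand-in for a master weight that was saved in reduced precision (the remove_master_weight case)
master_weight = paddle.ones([4], dtype="float16")

# the same promotion the patched loop applies to every "master_weights" entry
if master_weight.dtype != paddle.float32:
    master_weight = paddle.cast(master_weight, dtype=paddle.float32)

print(master_weight.dtype)  # paddle.float32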

paddlenlp/trainer/utils/ckpt_converter.py

Lines changed: 89 additions & 11 deletions
@@ -41,7 +41,13 @@
 
 class CheckpointConverter:
     def __init__(
-        self, hybrid_parallel_ckpt_path, state_dict, parameter_to_structured_name, trainging_args=None, patch_dict=None
+        self,
+        hybrid_parallel_ckpt_path,
+        state_dict,
+        parameter_to_structured_name,
+        trainging_args=None,
+        patch_dict=None,
+        local_view_pattern: list | bool = None,
     ):
         self.use_dist = True if paddle.distributed.get_world_size() > 1 else False
         self.path = hybrid_parallel_ckpt_path
@@ -85,6 +91,17 @@ def __init__(
                 self.auto_parallel_state_dict[self.patch_dict[k]] = self.auto_parallel_state_dict[k]
             for k in del_keys:
                 self.auto_parallel_state_dict.pop(k)
+        # solve the problem of inconsistent parameter names in moe automatic parallel mode.
+        if hasattr(trainging_args, "moe_group") and trainging_args.moe_group:
+            if local_view_pattern is False:
+                self.local_view_pattern_list = None
+            else:
+                if isinstance(local_view_pattern, list):
+                    self.local_view_pattern_list = local_view_pattern
+                else:
+                    self.local_view_pattern_list = ["experts"]
+        else:
+            self.local_view_pattern_list = None
 
         flags = [
             ["tp degree", self.tp_degree],
@@ -497,6 +514,46 @@ def gen_metadata_and_prepare_source_state_dict(self):
         else:
             return self.gen_metadata_for_tp_sharded_tensor()
 
+    def rename_local_view_state_dict(self, state_dict, file_name):
+        """
+        Rename the key for local views to the key for global views, and return the renamed `state_dict`.
+        """
+        if self.local_view_pattern_list is None:
+            return state_dict
+        # case 1: moe_group is mp_group
+        if self.tp_degree > 1 and self.sharding_degree <= 1:
+            (tp_rank, pp_rank, sharding_rank) = self.get_distribution_rank_from_file_name(file_name)
+            expert_name_old2new = {}
+            for pattern in self.local_view_pattern_list:
+                expert_pattern = rf"({pattern}\.)(\d+)"
+                # extract all experts IDs
+                expert_ids = set()
+                for state_name in state_dict.keys():
+                    res = re.search(expert_pattern, state_name)
+                    if res:
+                        expert_ids.add(int(res.group(2)))
+                expert_num = len(expert_ids)
+                # construct old name to new name mapping
+                for state_name in state_dict.keys():
+                    res = re.search(expert_pattern, state_name)
+                    if res:
+                        new_expert_id = int(res.group(2)) % expert_num + tp_rank * expert_num
+                        expert_name_old2new[state_name] = re.sub(
+                            expert_pattern, f"{res.group(1)}{new_expert_id}", state_name
+                        )
+            # rename state_dict
+            renamed_state_dict = {
+                expert_name_old2new[state_name]
+                if state_name in expert_name_old2new
+                else state_name: state_dict[state_name]
+                for state_name in state_dict.keys()
+            }
+
+            return renamed_state_dict
+        # TODO: add support for sharding
+        else:
+            return state_dict
+
     def load_state_dict_and_rename(self):
         """
         Parse the distributed information from the names of the checkpoint files and evenly parse out the distributed information for each weight/optimizer state
@@ -741,11 +798,10 @@ def load_state_dict_and_rename(self):
                     model_state_file_name = self.get_model_state_file_from(file_name)
                     assert model_state_file_name is not None
                     model_state_keys = global_file_to_state_dict_keys_mapping[model_state_file_name]
-                    renamed_state_dict = self.rename_using_optimizer_state_order(model_state_keys, state_dict)
-                    self.get_sharded_tensor_infos(file, renamed_state_dict, cur_rank_sharded_tensor_infos)
-                    self.cur_rank_loaded_state_dict[file_name] = renamed_state_dict
-                else:
-                    self.get_sharded_tensor_infos(file_name, state_dict, cur_rank_sharded_tensor_infos)
+                    state_dict = self.rename_using_optimizer_state_order(model_state_keys, state_dict)
+                renamed_state_dict = self.rename_local_view_state_dict(state_dict, file_name)
+                self.get_sharded_tensor_infos(file_name, renamed_state_dict, cur_rank_sharded_tensor_infos)
+                self.cur_rank_loaded_state_dict[file_name] = renamed_state_dict
         else:
             for file, state_dict in self.cur_rank_loaded_state_dict.items():
                 # The rule for renaming is to change the master_weights name in the optimizer state to the model weight name,
@@ -897,6 +953,9 @@ def rename(old_name, parameter_to_structured_name):
                 return None
 
         for key, value in state_dict.items():
+            # NOTE: Skip the parameters that are not initialized,which are not in the current rank.
+            if value is None or (isinstance(value, paddle.Tensor) and not value._is_initialized()):
+                continue
            if key in parameter_to_structured_name.values():
                new_name = key
            else:
@@ -909,7 +968,9 @@ def rename(old_name, parameter_to_structured_name):
     def rename_using_optimizer_state_order(self, model_state_keys, optimizer_state_dict):
         name_mapping = {}
         suffix_bucket = {}
-        assert len(optimizer_state_dict) % len(model_state_keys) == 0
+        # TODO: After adapting to sharding, remove the code below.
+        if self.is_sharding_stage3 or (self.sharding_degree > 1 and self.sharding_stage1_v == 2):
+            assert len(optimizer_state_dict) % len(model_state_keys) == 0
         for suffix in OPTIMIZER_STATE_NAME_SUFFIX:
             suffix_bucket[suffix] = []
         for opt_name, opt_value in optimizer_state_dict.items():
@@ -927,10 +988,27 @@ def rename_using_optimizer_state_order(self, model_state_keys, optimizer_state_dict):
         for suffix, old_names in suffix_bucket.items():
             if len(old_names) == 0:
                 continue
-            assert len(old_names) == len(model_state_keys)
-            for i in range(len(old_names)):
-                name_mapping[old_names[i]] = model_state_keys[i] + suffix
-
+            # TODO: After adapting to sharding, remove the code below.
+            if self.is_sharding_stage3 or (self.sharding_degree > 1 and self.sharding_stage1_v == 2):
+                assert len(old_names) == len(model_state_keys)
+
+            # NOTE: Handle the case where the number of master_weight elements is not equal to the number of model_state_keys.
+            if suffix != ".master_weight":
+                for i in range(len(old_names)):
+                    name_mapping[old_names[i]] = model_state_keys[i] + suffix
+            else:
+                for i in range(len(old_names)):
+                    param = old_names[i][:-14]
+                    index = -1
+                    for idx, opt_name in enumerate(suffix_bucket[".moment1"]):
+                        if param == opt_name[:-24]:
+                            index = idx
+                            break
+                    if index >= 0:
+                        name_mapping[old_names[i]] = model_state_keys[index] + suffix
+                    else:
+                        raise RuntimeError(f"Can't find {param} in optimizer state dict.")
+        # rename state dict
         renamed_state_dict = {}
         for k, v in optimizer_state_dict.items():
             renamed_state_dict[name_mapping[k]] = v
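The heart of the new rename_local_view_state_dict method is the renaming rule new_id = local_id % expert_num + tp_rank * expert_num, applied to every key matching the "experts.<n>" pattern. A simplified, self-contained illustration with made-up parameter names (stdlib re only, no Paddle or checkpoint files):

import re

# two locally numbered experts as seen by tensor-parallel rank 1 (names are illustrative)
state_dict = {"layers.0.mlp.experts.0.w1": "t0", "layers.0.mlp.experts.1.w1": "t1"}
tp_rank = 1
expert_pattern = r"(experts\.)(\d+)"

# number of experts held locally by this rank
expert_num = len({int(m.group(2)) for k in state_dict if (m := re.search(expert_pattern, k))})

# map each local expert index to its global index
renamed = {
    re.sub(
        expert_pattern,
        lambda m: f"{m.group(1)}{int(m.group(2)) % expert_num + tp_rank * expert_num}",
        k,
    ): v
    for k, v in state_dict.items()
}
print(sorted(renamed))  # ['layers.0.mlp.experts.2.w1', 'layers.0.mlp.experts.3.w1']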

paddlenlp/transformers/__init__.py

Lines changed: 90 additions & 0 deletions
@@ -306,6 +306,96 @@
 from .unimo.configuration import *
 from .unimo.modeling import *
 from .unimo.tokenizer import *
+from .unimo.configuration import *
+from .xlnet.modeling import *
+from .xlnet.tokenizer import *
+from .xlnet.configuration import *
+from .xlm.modeling import *
+from .xlm.tokenizer import *
+from .xlm.configuration import *
+from .xlm_roberta.modeling import *
+from .xlm_roberta.tokenizer import *
+from .xlm_roberta.configuration import *
+from .gau_alpha.modeling import *
+from .gau_alpha.tokenizer import *
+from .gau_alpha.configuration import *
+from .gemma import *
+from .roformerv2.modeling import *
+from .roformerv2.tokenizer import *
+from .roformerv2.configuration import *
+from .optimization import *
+from .opt.configuration import *
+from .opt.modeling import *
+from .auto.modeling import *
+from .auto.tokenizer import *
+from .auto.processing import *
+from .auto.image_processing import *
+from .auto.configuration import *
+from .codegen.modeling import *
+from .codegen.tokenizer import *
+from .codegen.configuration import *
+from .artist.modeling import *
+from .artist.tokenizer import *
+from .artist.configuration import *
+from .dallebart.modeling import *
+from .dallebart.tokenizer import *
+from .dallebart.configuration import *
+from .clip.modeling import *
+from .clip.configuration import *
+from .clip.feature_extraction import *
+from .clip.tokenizer import *
+from .clip.processing import *
+from .clip.image_processing import *
+from .chineseclip.modeling import *
+from .chineseclip.configuration import *
+from .chineseclip.feature_extraction import *
+from .chineseclip.processing import *
+from .chineseclip.image_processing import *
+from .chineseclip.tokenizer import *
+from .gptj.modeling import *
+from .gptj.tokenizer import *
+from .gptj.configuration import *
+from .pegasus.modeling import *
+from .pegasus.tokenizer import *
+from .pegasus.configuration import *
+from .glm.configuration import *
+from .glm.modeling import *
+from .glm.tokenizer import *
+from .nystromformer.configuration import *
+from .nystromformer.modeling import *
+from .nystromformer.tokenizer import *
+from .bloom.configuration import *
+from .bloom.modeling import *
+from .bloom.tokenizer import *
+from .bloom.tokenizer_fast import *
+from .clipseg.configuration import *
+from .clipseg.modeling import *
+from .clipseg.processing import *
+from .clipseg.image_processing import *
+from .blip_2.modeling import *
+from .blip_2.configuration import *
+from .blip_2.processing import *
+from .chatglm.configuration import *
+from .chatglm.modeling import *
+from .chatglm.tokenizer import *
+from .chatglm_v2.configuration import *
+from .chatglm_v2.modeling import *
+from .chatglm_v2.modeling_pp import *
+from .chatglm_v2.tokenizer import *
+from .speecht5.configuration import *
+from .speecht5.modeling import *
+from .speecht5.tokenizer import *
+from .speecht5.processing import *
+from .speecht5.feature_extraction import *
+from .minigpt4.modeling import *
+from .minigpt4.configuration import *
+from .minigpt4.processing import *
+from .minigpt4.image_processing import *
+from .clap.configuration import *
+from .clap.feature_extraction import *
+from .clap.modeling import *
+from .clap.processing import *
+from .visualglm.modeling import *
 from .visualglm.configuration import *
 from .visualglm.image_processing import *
 from .visualglm.modeling import *
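Because the package __init__ now re-exports the xlm_roberta submodules, the XLM-RoBERTa classes should be importable from the top-level namespace; XLMRobertaConfig and XLMRobertaTokenizer are the names confirmed by the auto registries later in this diff:

import inspect

from paddlenlp.transformers import XLMRobertaConfig, XLMRobertaTokenizer

# both names resolve to real classes once the re-exports above are in place
print(inspect.isclass(XLMRobertaConfig), inspect.isclass(XLMRobertaTokenizer))  # True True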

paddlenlp/transformers/auto/configuration.py

Lines changed: 2 additions & 0 deletions
@@ -115,6 +115,7 @@
     ("unimo", "UNIMOConfig"),
     ("visualglm", "VisualGLMConfig"),
     ("xlm", "XLMConfig"),
+    ("xlm-roberta", "XLMRobertaConfig"),
     ("xlnet", "XLNetConfig"),
     ("yuan", "YuanConfig"),
 ]
@@ -206,6 +207,7 @@
     ("unimo", "UNIMO"),
     ("visualglm", "VisualGLM"),
     ("xlm", "XLM"),
+    ("xlm-roberta", "XLMRoberta"),
     ("xlnet", "XLNet"),
     ("yuan", "Yuan"),
 ]
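With the two registry entries above, AutoConfig can map the "xlm-roberta" model type to XLMRobertaConfig. A hedged usage sketch; the checkpoint id below is an assumption for illustration, not something added by this commit:

from paddlenlp.transformers import AutoConfig

# "xlm-roberta-base" is an assumed/illustrative checkpoint id
config = AutoConfig.from_pretrained("xlm-roberta-base")
print(type(config).__name__)  # expected: XLMRobertaConfig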

paddlenlp/transformers/auto/modeling.py

Lines changed: 1 addition & 0 deletions
@@ -96,6 +96,7 @@
     ("UNIMO", "unimo"),
     ("XLNet", "xlnet"),
     ("XLM", "xlm"),
+    ("XLMRoberta", "xlm_roberta"),
     ("GPT", "gpt"),
     ("GLM", "glm"),
     ("MT5", "mt5"),

paddlenlp/transformers/auto/tokenizer.py

Lines changed: 1 addition & 0 deletions
@@ -115,6 +115,7 @@
     ("squeezebert", "SqueezeBertTokenizer"),
     ("t5", "T5Tokenizer"),
     ("xlm", "XLMTokenizer"),
+    ("xlm_roberta", "XLMRobertaTokenizer"),
     ("xlnet", "XLNetTokenizer"),
     ("bert_japanese", "BertJapaneseTokenizer"),
     ("bigbird", "BigBirdTokenizer"),
Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .configuration import *
+from .modeling import *
+from .tokenizer import *
