From 0da8c5d121139c3dd900260b48e9b242d7335608 Mon Sep 17 00:00:00 2001
From: lugimzzz <63761690+lugimzzz@users.noreply.github.com>
Date: Fri, 12 Apr 2024 13:39:31 +0800
Subject: [PATCH 01/27] cherry-pick add scaling (#8264)

---
 paddlenlp/peft/lora/lora_config.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/paddlenlp/peft/lora/lora_config.py b/paddlenlp/peft/lora/lora_config.py
index 3a0897bc4892..12e3b929ed7e 100644
--- a/paddlenlp/peft/lora/lora_config.py
+++ b/paddlenlp/peft/lora/lora_config.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import json
+import math
 import os
 from dataclasses import asdict, dataclass, field
 from typing import List, Optional, Union
@@ -94,6 +95,15 @@ def __post_init__(self):
             )
             self.use_quick_lora = False
 
+    @property
+    def scaling(self):
+        if not self.rslora and not self.pissa:
+            return self.lora_alpha / self.r
+        elif self.pissa:
+            return 1.0
+        else:
+            return self.lora_alpha / math.sqrt(self.r)
+
     @property
     def __dict__(self):
         return asdict(self)
@@ -114,6 +124,7 @@ def save_pretrained(self, save_directory):
         os.makedirs(save_directory, exist_ok=True)
 
         output_dict = self.__dict__
+        output_dict["scaling"] = self.scaling
         output_path = os.path.join(save_directory, LORA_CONFIG_NAME)
 
         # save it
@@ -136,7 +147,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
             raise ValueError(f"Can't find lora_config.json at '{pretrained_model_name_or_path}'")
 
         loaded_attributes = cls.from_json_file(config_file)
-
+        loaded_attributes.pop("scaling", None)
         config = cls(**kwargs)
 
         for key, value in loaded_attributes.items():

From 4749af30726f39d4e733de023599e9bc5b438f87 Mon Sep 17 00:00:00 2001
From: w5688414
Date: Fri, 12 Apr 2024 16:42:27 +0800
Subject: [PATCH 02/27] Upgrade paddlenlp to 2.8.0 (#8266)

* Upgrade paddlenlp to 2.8.0
* fix try import
* Add regex to requirements.txt
---
 paddlenlp/__init__.py | 2 +-
 requirements.txt      | 1 +
 setup.py              | 2 +-
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/paddlenlp/__init__.py b/paddlenlp/__init__.py
index af011febd291..e3cd7e1c5f75 100644
--- a/paddlenlp/__init__.py
+++ b/paddlenlp/__init__.py
@@ -18,7 +18,7 @@
 
 PADDLENLP_STABLE_VERSION = "PADDLENLP_STABLE_VERSION"
 
-__version__ = "2.7.1.post"
+__version__ = "2.8.0.post"
 if os.getenv(PADDLENLP_STABLE_VERSION):
     __version__ = __version__.replace(".post", "")
 
diff --git a/requirements.txt b/requirements.txt
index 4b676d900563..71fee6049318 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -23,3 +23,4 @@ safetensors
 tool_helpers
 aistudio-sdk>=0.1.3
 jinja2
+regex
diff --git a/setup.py b/setup.py
index ab8f03a292e1..0723cfc28cb4 100644
--- a/setup.py
+++ b/setup.py
@@ -109,7 +109,7 @@ def show():
         f.write(content)
 
 
-__version__ = "2.7.1.post"
+__version__ = "2.8.0.post"
 if os.getenv(PADDLENLP_STABLE_VERSION):
     __version__ = __version__.replace(".post", "")
 

From 6c1f4493654d42f3ef7eb604e5839f6c8bfda855 Mon Sep 17 00:00:00 2001
From: Siming Dai <908660116@qq.com>
Date: Mon, 15 Apr 2024 16:45:23 +0800
Subject: [PATCH 03/27] [BugFix] Try except sequence parallel utils (#8189) (#8274)

* try except sp
* fix sp import
---
 .../gpt/dygraph/hybrid_model.py               | 17 +++++++------
 .../models/language_model/language_module.py  |  9 ++++---
 paddlenlp/transformers/__init__.py            | 24 +++++++++++--------
 paddlenlp/transformers/gpt/modeling.py        | 18 ++++++++------
 paddlenlp/transformers/gpt/modeling_auto.py   | 12 ++++++----
 paddlenlp/transformers/gpt/modeling_pp.py     | 10 +++++---
 paddlenlp/transformers/llama/modeling.py      | 17 +++++++------
.../mc2_seqence_parallel_linear.py | 12 ++++++---- paddlenlp/transformers/mixtral/modeling.py | 17 +++++++------ 9 files changed, 84 insertions(+), 52 deletions(-) diff --git a/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py b/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py index f47d800c5f15..f4c1ee8d46a7 100644 --- a/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py +++ b/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py @@ -48,13 +48,16 @@ MinLengthLogitsProcessor, RepetitionPenaltyLogitsProcessor, ) -from paddle.distributed.fleet.utils.sequence_parallel_utils import ( - ColumnSequenceParallelLinear, - GatherOp, - RowSequenceParallelLinear, - ScatterOp, - mark_as_sequence_parallel_parameter, -) +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + ColumnSequenceParallelLinear, + GatherOp, + RowSequenceParallelLinear, + ScatterOp, + mark_as_sequence_parallel_parameter, + ) +except: + pass from paddlenlp.transformers.segment_parallel_utils import ReshardLayer diff --git a/model_zoo/gpt-3/ppfleetx/models/language_model/language_module.py b/model_zoo/gpt-3/ppfleetx/models/language_model/language_module.py index 1a73a35982ff..c86fa300e352 100644 --- a/model_zoo/gpt-3/ppfleetx/models/language_model/language_module.py +++ b/model_zoo/gpt-3/ppfleetx/models/language_model/language_module.py @@ -24,9 +24,12 @@ from ppfleetx.core.module.basic_module import BasicModule from ppfleetx.data.tokenizers import GPTTokenizer from ppfleetx.distributed.apis import env -from paddle.distributed.fleet.utils.sequence_parallel_utils import ( - register_sequence_parallel_allreduce_hooks, -) +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + register_sequence_parallel_allreduce_hooks, + ) +except: + pass from ppfleetx.utils.log import logger # TODO(haohongxiang): to solve the problem of cross-reference diff --git a/paddlenlp/transformers/__init__.py b/paddlenlp/transformers/__init__.py index 2ee9d7733f41..05fa5775399e 100644 --- a/paddlenlp/transformers/__init__.py +++ b/paddlenlp/transformers/__init__.py @@ -29,16 +29,20 @@ from .feature_extraction_utils import BatchFeature, FeatureExtractionMixin from .image_processing_utils import ImageProcessingMixin from .attention_utils import create_bigbird_rand_mask_idx_list -from paddle.distributed.fleet.utils.sequence_parallel_utils import ( - GatherOp, - ScatterOp, - AllGatherOp, - ReduceScatterOp, - ColumnSequenceParallelLinear, - RowSequenceParallelLinear, - mark_as_sequence_parallel_parameter, - register_sequence_parallel_allreduce_hooks, -) + +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + GatherOp, + ScatterOp, + AllGatherOp, + ReduceScatterOp, + ColumnSequenceParallelLinear, + RowSequenceParallelLinear, + mark_as_sequence_parallel_parameter, + register_sequence_parallel_allreduce_hooks, + ) +except: + pass from .export import export_model # isort: split diff --git a/paddlenlp/transformers/gpt/modeling.py b/paddlenlp/transformers/gpt/modeling.py index 50cfc892d336..8c066431979f 100644 --- a/paddlenlp/transformers/gpt/modeling.py +++ b/paddlenlp/transformers/gpt/modeling.py @@ -29,13 +29,17 @@ from paddle.distributed import fleet from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker from paddle.distributed.fleet.utils import recompute -from paddle.distributed.fleet.utils.sequence_parallel_utils import ( - ColumnSequenceParallelLinear, - GatherOp, - 
RowSequenceParallelLinear, - ScatterOp, - mark_as_sequence_parallel_parameter, -) + +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + ColumnSequenceParallelLinear, + GatherOp, + RowSequenceParallelLinear, + ScatterOp, + mark_as_sequence_parallel_parameter, + ) +except: + pass from paddle.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from paddle.utils import try_import diff --git a/paddlenlp/transformers/gpt/modeling_auto.py b/paddlenlp/transformers/gpt/modeling_auto.py index 255763be395f..2e508339ab39 100644 --- a/paddlenlp/transformers/gpt/modeling_auto.py +++ b/paddlenlp/transformers/gpt/modeling_auto.py @@ -30,10 +30,14 @@ from paddle.distributed import fleet from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker from paddle.distributed.fleet.utils import recompute -from paddle.distributed.fleet.utils.sequence_parallel_utils import ( - ScatterOp, - mark_as_sequence_parallel_parameter, -) + +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + ScatterOp, + mark_as_sequence_parallel_parameter, + ) +except: + pass from ...utils.converter import StateDictNameMapping from .. import PretrainedModel, register_base_model diff --git a/paddlenlp/transformers/gpt/modeling_pp.py b/paddlenlp/transformers/gpt/modeling_pp.py index 3ec6b004edee..cd3dce018378 100644 --- a/paddlenlp/transformers/gpt/modeling_pp.py +++ b/paddlenlp/transformers/gpt/modeling_pp.py @@ -19,9 +19,13 @@ SharedLayerDesc, ) from paddle.distributed.fleet.utils import recompute -from paddle.distributed.fleet.utils.sequence_parallel_utils import ( - mark_as_sequence_parallel_parameter, -) + +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + mark_as_sequence_parallel_parameter, + ) +except: + pass from paddlenlp.transformers.model_utils import PipelinePretrainedModel diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index d4da1b195a94..b0b08c30241a 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -45,13 +45,16 @@ def swiglu(x, y=None): return F.silu(x) * y -from paddle.distributed.fleet.utils.sequence_parallel_utils import ( - ColumnSequenceParallelLinear, - GatherOp, - RowSequenceParallelLinear, - ScatterOp, - mark_as_sequence_parallel_parameter, -) +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + ColumnSequenceParallelLinear, + GatherOp, + RowSequenceParallelLinear, + ScatterOp, + mark_as_sequence_parallel_parameter, + ) +except: + pass from paddle.utils import try_import from paddlenlp.transformers.conversion_utils import ( diff --git a/paddlenlp/transformers/mc2_seqence_parallel_linear.py b/paddlenlp/transformers/mc2_seqence_parallel_linear.py index 7d669833e690..c39a78cc6252 100644 --- a/paddlenlp/transformers/mc2_seqence_parallel_linear.py +++ b/paddlenlp/transformers/mc2_seqence_parallel_linear.py @@ -23,10 +23,14 @@ from paddle import distributed as dist from paddle.autograd import PyLayer -from paddle.distributed.fleet.utils.sequence_parallel_utils import ( - ColumnSequenceParallelLinear, - RowSequenceParallelLinear, -) + +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + ColumnSequenceParallelLinear, + RowSequenceParallelLinear, + ) +except: + pass __all_gather_recomputation__ = False if int(os.getenv("MC2_Recompute", 0)): diff --git a/paddlenlp/transformers/mixtral/modeling.py b/paddlenlp/transformers/mixtral/modeling.py index 592f9a47847a..7a8254d6877c 
100644 --- a/paddlenlp/transformers/mixtral/modeling.py +++ b/paddlenlp/transformers/mixtral/modeling.py @@ -33,13 +33,16 @@ except ImportError: fused_rotary_position_embedding = None -from paddle.distributed.fleet.utils.sequence_parallel_utils import ( - ColumnSequenceParallelLinear, - GatherOp, - RowSequenceParallelLinear, - ScatterOp, - mark_as_sequence_parallel_parameter, -) +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + ColumnSequenceParallelLinear, + GatherOp, + RowSequenceParallelLinear, + ScatterOp, + mark_as_sequence_parallel_parameter, + ) +except: + pass from paddlenlp.transformers.conversion_utils import ( StateDictNameMapping, From dc5a6af3d33dc09890c00d900dd2bd8f0bf617c4 Mon Sep 17 00:00:00 2001 From: gongenlei Date: Mon, 22 Apr 2024 01:00:19 +0000 Subject: [PATCH 04/27] save_model: checkpoint_done --> model_done --- paddlenlp/trainer/trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py index be1af93c50fd..d8487364076b 100644 --- a/paddlenlp/trainer/trainer.py +++ b/paddlenlp/trainer/trainer.py @@ -2096,10 +2096,10 @@ def save_model(self, output_dir: Optional[str] = None, merge_tensor_parallel: Op if not self.is_in_train: self.args.unified_checkpoint_config = unified_checkpoint_config_backup if strtobool(os.getenv("FLAG_LLM_PDC", "False")): - # save checkpoint_done file to ensure checkpoint is complete + # save model_done file to ensure model is complete if self.args.should_save_model_state and self.args.should_save: # For ckpt integrity - paddle.save(self.state.global_step, os.path.join(output_dir, ".checkpoint_done")) + paddle.save(self.state.global_step, os.path.join(output_dir, ".model_done")) def _save_checkpoint(self, model, metrics=None): # assert unwrap_model(model) is self.model, "internal model should be a reference to self.model" From 7314063128336138191f04c332a34929901f810c Mon Sep 17 00:00:00 2001 From: yujun <573009727@qq.com> Date: Tue, 23 Apr 2024 15:29:22 +0800 Subject: [PATCH 05/27] fix import --- paddlenlp/peft/lora/lora_layers.py | 272 ++++++++++++++++++++++++++++- paddlenlp/peft/lora/lora_model.py | 73 ++++++++ 2 files changed, 344 insertions(+), 1 deletion(-) diff --git a/paddlenlp/peft/lora/lora_layers.py b/paddlenlp/peft/lora/lora_layers.py index e0c79c47a87a..66a0d0c0f520 100644 --- a/paddlenlp/peft/lora/lora_layers.py +++ b/paddlenlp/peft/lora/lora_layers.py @@ -27,11 +27,44 @@ from .lora_quick_layers import quick_lora -if "npu" in paddle.device.get_all_custom_device_type(): + +def is_mc2_valid(): + return "npu" in paddle.device.get_all_custom_device_type() and int(os.getenv("MC2", "0")) + + +if is_mc2_valid(): + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + MC2ColumnSeqParallelLinear, + MC2RowSeqParallelLinear, + ) + from .mc2_lora_npu import MC2LoRaColumnParallelLinear, MC2LoRaRowParallelLinear else: MC2LoRaRowParallelLinear = None MC2LoRaColumnParallelLinear = None + MC2ColumnSeqParallelLinear = None + MC2RowSeqParallelLinear = None + + +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + AllGatherOp, + ColumnSequenceParallelLinear, + ReduceScatterOp, + RowSequenceParallelLinear, + mark_as_sequence_parallel_parameter, + ) +except: + + class ColumnSequenceParallelLinear: + pass + + class RowSequenceParallelLinear: + pass + + AllGatherOp = None + ReduceScatterOp = None + mark_as_sequence_parallel_parameter = None class LoRALinear(nn.Linear): @@ -298,6 +331,123 @@ def 
extra_repr(self): return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}" +class RowSequenceParallelLoRALinear(RowSequenceParallelLinear): + def __init__( + self, + in_features: int, + out_features: int, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + rslora: bool = False, + lora_plus_scale: float = 1.0, + merge_weights: bool = True, + use_quick_lora: bool = False, + pissa: bool = False, + **kwargs + ): + RowSequenceParallelLinear.__init__(self, in_features, out_features, **kwargs) + if not isinstance(r, int) or r <= 0: + raise ValueError("Lora rank r should be a positive integer") + if pissa: + raise ValueError("Pissa is not supported in model parallel by now") + self.r = r + self.lora_alpha = lora_alpha + # Optional dropout + if lora_dropout > 0.0: + self.lora_dropout = nn.Dropout(p=lora_dropout) + else: + self.lora_dropout = lambda x: x + # Mark the weight as unmerged + self.merged = False + self.merge_weights = merge_weights + + # compatible + self.name = self._name + + # Actual trainable parameters + self.lora_A = self.create_parameter( + shape=[self.input_size_per_partition, r], + dtype=self._dtype, + is_bias=False, + attr=paddle.ParamAttr( + initializer=nn.initializer.KaimingUniform(negative_slope=math.sqrt(5), nonlinearity="leaky_relu") + ), + ) + self.lora_B = self.create_parameter( + shape=[r, self.out_features], + dtype=self._dtype, + is_bias=False, + attr=paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(value=0.0), + learning_rate=lora_plus_scale, + ), + ) + + self.lora_A.is_distributed = True + self.lora_A.split_axis = 0 + self.lora_B.is_distributed = False + mark_as_sequence_parallel_parameter(self.lora_B) + if not rslora: + self.scaling = self.lora_alpha / self.r + else: + self.scaling = self.lora_alpha / math.sqrt(self.r) + + # Freezing the pre-trained weight matrix + self.weight.stop_gradient = True + self._use_quick_lora = use_quick_lora and lora_dropout == 0.0 + + @property + def use_quick_lora(self): + # TODO(@gexiao): support qlora + return False # self._use_quick_lora and self.training and not self.merged + + def train(self): + super().train() + if self.merge_weights and self.merged: + # Make sure that the weights are not merged + new_weight = self.weight - self.lora_A @ self.lora_B * self.scaling + self.weight.set_value(new_weight) + self.merged = False + + def eval(self): + super().eval() + if self.merge_weights and not self.merged: + # Merge the weights and mark it + new_weight = self.weight + self.lora_A @ self.lora_B * self.scaling + self.weight.set_value(new_weight) + self.merged = True + + def forward(self, x: paddle.Tensor): + if not self.input_is_parallel: + input_mp = mp_ops._c_split(x, group=self.model_parallel_group) + else: + input_mp = x + + if not is_mc2_valid(): + output_parallel = self.linear(input_mp, self.weight, name=self._name) + output_ = ReduceScatterOp.apply(output_parallel) + result_mp = output_ + self.bias if self.bias is not None else output_ + else: + output_ = MC2RowSeqParallelLinear.apply(input_mp, self.weight, self.model_parallel_group) + result_mp = output_ + self.bias if self.bias is not None else output_ + + if not self.merged: + input_mp = self.lora_dropout(input_mp) + if not is_mc2_valid(): + input_mp = input_mp @ self.lora_A + input_mp = ReduceScatterOp.apply(input_mp) + else: + input_mp = MC2RowSeqParallelLinear.apply(input_mp, self.lora_A, self.model_parallel_group) + delta_mp = (input_mp @ self.lora_B) * self.scaling + result_mp += delta_mp + return 
result_mp + + def extra_repr(self): + name = f", name={self.name}" if self.name else "" + return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}" + + class ColumnParallelLoRALinear(ColumnParallelLinear): def __init__( self, @@ -428,6 +578,126 @@ def extra_repr(self): return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}" +class ColumnSequenceParallelLoRALinear(ColumnSequenceParallelLinear): + def __init__( + self, + in_features: int, + out_features: int, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + rslora: bool = False, + lora_plus_scale: float = 1.0, + merge_weights: bool = True, + lora_A_weight_attr: Optional[paddle.ParamAttr] = None, + use_quick_lora: bool = False, + pissa: bool = False, + **kwargs + ): + ColumnSequenceParallelLinear.__init__(self, in_features, out_features, **kwargs) + if not isinstance(r, int) or r <= 0: + raise ValueError("Lora rank r should be a positive integer") + if pissa: + raise ValueError("Pissa is not supported in model parallel by now") + self.r = r + self.lora_alpha = lora_alpha + # Optional dropout + if lora_dropout > 0.0: + self.lora_dropout = nn.Dropout(p=lora_dropout) + else: + self.lora_dropout = lambda x: x + # Mark the weight as unmerged + self.merged = False + self.merge_weights = merge_weights + + # compatible + self.name = self._name + + # Actual trainable parameters + self.lora_A = self.create_parameter( + shape=[in_features, r], + dtype=self._dtype, + is_bias=False, + attr=lora_A_weight_attr, + ) + self.lora_A.is_distributed = False + mark_as_sequence_parallel_parameter(self.lora_A) + + self.lora_B = self.create_parameter( + shape=[r, self.output_size_per_partition], + dtype=self._dtype, + is_bias=False, + attr=paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(value=0.0), + learning_rate=lora_plus_scale, + ), + ) + + self.lora_B.is_distributed = True + self.lora_B.split_axis = 1 + if not rslora: + self.scaling = self.lora_alpha / self.r + else: + self.scaling = self.lora_alpha / math.sqrt(self.r) + + # Freezing the pre-trained weight matrix + self.weight.stop_gradient = True + self._use_quick_lora = use_quick_lora and lora_dropout == 0.0 + + @property + def use_quick_lora(self): + # TODO(@gexiao): support qlora + return False # self._use_quick_lora and self.training and not self.merged + + def train(self): + super().train() + if self.merge_weights and self.merged: + # Make sure that the weights are not merged + new_weight = self.weight - self.lora_A @ self.lora_B * self.scaling + self.weight.set_value(new_weight) + self.merged = False + + def eval(self): + super().eval() + if self.merge_weights and not self.merged: + # Merge the weights and mark it + new_weight = self.weight + self.lora_A @ self.lora_B * self.scaling + self.weight.set_value(new_weight) + self.merged = True + + def forward(self, x: paddle.Tensor): + if not is_mc2_valid(): + if self.is_mp: + input_parallel = AllGatherOp.apply(x) + else: + input_parallel = x + result_mp = self.linear(input_parallel, self.weight, self.bias, name=self._name) + else: + result_mp = MC2ColumnSeqParallelLinear.apply(x, self.weight, self.model_parallel_group) + if self.bias is not None: + result_mp += self.bias + + if not self.merged: + input_a = self.lora_dropout(x) @ self.lora_A + if not is_mc2_valid(): + input_a = AllGatherOp.apply(input_a) + delta_mp = (input_a @ self.lora_B) * self.scaling + else: + input_a = MC2ColumnSeqParallelLinear.apply(input_a, self.lora_B, 
self.model_parallel_group) + delta_mp = input_a * self.scaling + result_mp += delta_mp + + if self.gather_output and self.is_mp: + result = mp_ops._c_concat(result_mp, group=self.model_parallel_group) + else: + result = result_mp + return result + + def extra_repr(self): + name = f", name={self.name}" if self.name else "" + return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}" + + class LoRAMergedLinear(nn.Linear): # LoRA implemented in a dense layer with merged linear weights for q, k, v def __init__( diff --git a/paddlenlp/peft/lora/lora_model.py b/paddlenlp/peft/lora/lora_model.py index 1bbd0284823c..57d3bb3f2205 100644 --- a/paddlenlp/peft/lora/lora_model.py +++ b/paddlenlp/peft/lora/lora_model.py @@ -48,10 +48,12 @@ from .lora_layers import ( ColumnParallelLoRALinear, ColumnParallelLoRAMergedLinear, + ColumnSequenceParallelLoRALinear, LoRAConv2D, LoRALinear, LoRAMergedLinear, RowParallelLoRALinear, + RowSequenceParallelLoRALinear, ) try: @@ -73,6 +75,19 @@ ColumnParallelQuantizationLoRALinear = None RowParallelQuantizationLoRALinear = None +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + ColumnSequenceParallelLinear, + RowSequenceParallelLinear, + ) +except: + + class ColumnSequenceParallelLinear: + pass + + class RowSequenceParallelLinear: + pass + class LoRAModel(nn.Layer): # TODO:lugimzzz support restore in following PR @@ -454,6 +469,60 @@ def _find_and_replace_module(self, model, module_name, lora_config, enable_lora) # Lora column parallel will spilt lora A matrix self.add_lora_split_mapping(module_name + ".lora_A", is_column=False) + # for lora qat + if self.lora_config.do_qat: + self.add_lora_split_mapping(module_name + ".weight_quanter._scale", is_column=False) + self.add_lora_split_mapping(module_name + ".activation_quanter._scale", is_column=False) + self.add_lora_split_mapping(module_name + ".activation_quanter.quanter._scale", is_column=False) + elif isinstance(module, ColumnSequenceParallelLinear): + # recover the original output_features + output_features = module.weight.shape[1] * module.world_size + lora_module = ColumnSequenceParallelLoRALinear( + in_features=module.weight.shape[0], + out_features=output_features, + gather_output=module.gather_output, + has_bias=module.bias is not None, + r=lora_config.r, + lora_alpha=lora_config.lora_alpha, + lora_dropout=lora_config.lora_dropout, + rslora=lora_config.rslora, + lora_plus_scale=lora_config.lora_plus_scale, + pissa=lora_config.pissa, + merge_weights=lora_config.merge_weights, + lora_A_weight_attr=paddle.ParamAttr( + initializer=nn.initializer.KaimingUniform( + negative_slope=math.sqrt(5), nonlinearity="leaky_relu" + ) + ), + use_quick_lora=lora_config.use_quick_lora, + ) + # Lora column parallel will spilt lora B matrix + self.add_lora_split_mapping(module_name + ".lora_B", is_column=True) + + # for lora qat + if self.lora_config.do_qat: + self.add_lora_split_mapping(module_name + ".weight_quanter._scale", is_column=True) + self.add_lora_split_mapping(module_name + ".activation_quanter._scale", is_column=False) + self.add_lora_split_mapping(module_name + ".activation_quanter.quanter._scale", is_column=False) + elif isinstance(module, RowSequenceParallelLinear): + # recover the original output_features + lora_module = RowSequenceParallelLoRALinear( + in_features=module.weight.shape[0] * module.world_size, + out_features=module.weight.shape[1], + has_bias=module.bias is not None, + input_is_parallel=module.input_is_parallel, + 
r=lora_config.r, + lora_alpha=lora_config.lora_alpha, + lora_dropout=lora_config.lora_dropout, + rslora=lora_config.rslora, + lora_plus_scale=lora_config.lora_plus_scale, + pissa=lora_config.pissa, + merge_weights=lora_config.merge_weights, + use_quick_lora=lora_config.use_quick_lora, + ) + # Lora column parallel will spilt lora A matrix + self.add_lora_split_mapping(module_name + ".lora_A", is_column=False) + # for lora qat if self.lora_config.do_qat: self.add_lora_split_mapping(module_name + ".weight_quanter._scale", is_column=False) @@ -597,6 +666,8 @@ def mark_only_lora_as_trainable(self) -> None: or isinstance(layer, LoRAConv2D) or isinstance(layer, ColumnParallelLoRALinear) or isinstance(layer, RowParallelLoRALinear) + or isinstance(layer, ColumnSequenceParallelLoRALinear) + or isinstance(layer, RowSequenceParallelLoRALinear) or isinstance(layer, LoRAMergedLinear) or isinstance(layer, ColumnParallelLoRAMergedLinear) or (QuantizationLoRALinear is not None and isinstance(layer, QuantizationLoRALinear)) @@ -684,9 +755,11 @@ def restore_original_model(self): self._find_and_restore_module(layer_name) elif ( isinstance(layer, ColumnParallelLoRALinear) + or isinstance(layer, ColumnSequenceParallelLoRALinear) or isinstance(layer, LoRAConv2D) or isinstance(layer, ColumnParallelLoRAMergedLinear) or isinstance(layer, RowParallelLoRALinear) + or isinstance(layer, RowSequenceParallelLoRALinear) or (QuantizationLoRALinear is not None and isinstance(layer, QuantizationLoRALinear)) or ( ColumnParallelQuantizationLoRALinear is not None From d4062e576dfa9b77063a273b2410051411f279b6 Mon Sep 17 00:00:00 2001 From: yujun <573009727@qq.com> Date: Tue, 23 Apr 2024 15:30:50 +0800 Subject: [PATCH 06/27] Revert "fix import" This reverts commit 7314063128336138191f04c332a34929901f810c. 
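
PATCH 03 and PATCH 05 (reverted by this patch, with a reworked version following in PATCH 09) rely on the same guarded-import idiom: try to import the sequence-parallel helpers from `paddle.distributed.fleet.utils.sequence_parallel_utils`, and fall back to placeholder definitions so that later `isinstance` checks and subclass declarations still work on Paddle builds that lack the module. A minimal, self-contained sketch of that idiom follows; the stub bodies are illustrative placeholders, not the exact fallbacks used in the diffs (which use a bare `except:` and in places assign `None`):

```python
# Sketch of the guarded-import idiom from PATCH 03/05: if the sequence-parallel
# utilities are unavailable, define placeholders so that isinstance() checks and
# subclass definitions elsewhere do not raise at import time.
try:
    from paddle.distributed.fleet.utils.sequence_parallel_utils import (
        ColumnSequenceParallelLinear,
        RowSequenceParallelLinear,
        mark_as_sequence_parallel_parameter,
    )
except ImportError:

    class ColumnSequenceParallelLinear:  # placeholder; never instantiated
        pass

    class RowSequenceParallelLinear:  # placeholder; never instantiated
        pass

    def mark_as_sequence_parallel_parameter(param):  # no-op placeholder
        return param
```
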
--- paddlenlp/peft/lora/lora_layers.py | 272 +---------------------------- paddlenlp/peft/lora/lora_model.py | 73 -------- 2 files changed, 1 insertion(+), 344 deletions(-) diff --git a/paddlenlp/peft/lora/lora_layers.py b/paddlenlp/peft/lora/lora_layers.py index 66a0d0c0f520..e0c79c47a87a 100644 --- a/paddlenlp/peft/lora/lora_layers.py +++ b/paddlenlp/peft/lora/lora_layers.py @@ -27,44 +27,11 @@ from .lora_quick_layers import quick_lora - -def is_mc2_valid(): - return "npu" in paddle.device.get_all_custom_device_type() and int(os.getenv("MC2", "0")) - - -if is_mc2_valid(): - from paddle.distributed.fleet.utils.sequence_parallel_utils import ( - MC2ColumnSeqParallelLinear, - MC2RowSeqParallelLinear, - ) - +if "npu" in paddle.device.get_all_custom_device_type(): from .mc2_lora_npu import MC2LoRaColumnParallelLinear, MC2LoRaRowParallelLinear else: MC2LoRaRowParallelLinear = None MC2LoRaColumnParallelLinear = None - MC2ColumnSeqParallelLinear = None - MC2RowSeqParallelLinear = None - - -try: - from paddle.distributed.fleet.utils.sequence_parallel_utils import ( - AllGatherOp, - ColumnSequenceParallelLinear, - ReduceScatterOp, - RowSequenceParallelLinear, - mark_as_sequence_parallel_parameter, - ) -except: - - class ColumnSequenceParallelLinear: - pass - - class RowSequenceParallelLinear: - pass - - AllGatherOp = None - ReduceScatterOp = None - mark_as_sequence_parallel_parameter = None class LoRALinear(nn.Linear): @@ -331,123 +298,6 @@ def extra_repr(self): return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}" -class RowSequenceParallelLoRALinear(RowSequenceParallelLinear): - def __init__( - self, - in_features: int, - out_features: int, - r: int = 0, - lora_alpha: int = 1, - lora_dropout: float = 0.0, - rslora: bool = False, - lora_plus_scale: float = 1.0, - merge_weights: bool = True, - use_quick_lora: bool = False, - pissa: bool = False, - **kwargs - ): - RowSequenceParallelLinear.__init__(self, in_features, out_features, **kwargs) - if not isinstance(r, int) or r <= 0: - raise ValueError("Lora rank r should be a positive integer") - if pissa: - raise ValueError("Pissa is not supported in model parallel by now") - self.r = r - self.lora_alpha = lora_alpha - # Optional dropout - if lora_dropout > 0.0: - self.lora_dropout = nn.Dropout(p=lora_dropout) - else: - self.lora_dropout = lambda x: x - # Mark the weight as unmerged - self.merged = False - self.merge_weights = merge_weights - - # compatible - self.name = self._name - - # Actual trainable parameters - self.lora_A = self.create_parameter( - shape=[self.input_size_per_partition, r], - dtype=self._dtype, - is_bias=False, - attr=paddle.ParamAttr( - initializer=nn.initializer.KaimingUniform(negative_slope=math.sqrt(5), nonlinearity="leaky_relu") - ), - ) - self.lora_B = self.create_parameter( - shape=[r, self.out_features], - dtype=self._dtype, - is_bias=False, - attr=paddle.ParamAttr( - initializer=paddle.nn.initializer.Constant(value=0.0), - learning_rate=lora_plus_scale, - ), - ) - - self.lora_A.is_distributed = True - self.lora_A.split_axis = 0 - self.lora_B.is_distributed = False - mark_as_sequence_parallel_parameter(self.lora_B) - if not rslora: - self.scaling = self.lora_alpha / self.r - else: - self.scaling = self.lora_alpha / math.sqrt(self.r) - - # Freezing the pre-trained weight matrix - self.weight.stop_gradient = True - self._use_quick_lora = use_quick_lora and lora_dropout == 0.0 - - @property - def use_quick_lora(self): - # TODO(@gexiao): support qlora - return False # 
self._use_quick_lora and self.training and not self.merged - - def train(self): - super().train() - if self.merge_weights and self.merged: - # Make sure that the weights are not merged - new_weight = self.weight - self.lora_A @ self.lora_B * self.scaling - self.weight.set_value(new_weight) - self.merged = False - - def eval(self): - super().eval() - if self.merge_weights and not self.merged: - # Merge the weights and mark it - new_weight = self.weight + self.lora_A @ self.lora_B * self.scaling - self.weight.set_value(new_weight) - self.merged = True - - def forward(self, x: paddle.Tensor): - if not self.input_is_parallel: - input_mp = mp_ops._c_split(x, group=self.model_parallel_group) - else: - input_mp = x - - if not is_mc2_valid(): - output_parallel = self.linear(input_mp, self.weight, name=self._name) - output_ = ReduceScatterOp.apply(output_parallel) - result_mp = output_ + self.bias if self.bias is not None else output_ - else: - output_ = MC2RowSeqParallelLinear.apply(input_mp, self.weight, self.model_parallel_group) - result_mp = output_ + self.bias if self.bias is not None else output_ - - if not self.merged: - input_mp = self.lora_dropout(input_mp) - if not is_mc2_valid(): - input_mp = input_mp @ self.lora_A - input_mp = ReduceScatterOp.apply(input_mp) - else: - input_mp = MC2RowSeqParallelLinear.apply(input_mp, self.lora_A, self.model_parallel_group) - delta_mp = (input_mp @ self.lora_B) * self.scaling - result_mp += delta_mp - return result_mp - - def extra_repr(self): - name = f", name={self.name}" if self.name else "" - return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}" - - class ColumnParallelLoRALinear(ColumnParallelLinear): def __init__( self, @@ -578,126 +428,6 @@ def extra_repr(self): return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}" -class ColumnSequenceParallelLoRALinear(ColumnSequenceParallelLinear): - def __init__( - self, - in_features: int, - out_features: int, - r: int = 0, - lora_alpha: int = 1, - lora_dropout: float = 0.0, - rslora: bool = False, - lora_plus_scale: float = 1.0, - merge_weights: bool = True, - lora_A_weight_attr: Optional[paddle.ParamAttr] = None, - use_quick_lora: bool = False, - pissa: bool = False, - **kwargs - ): - ColumnSequenceParallelLinear.__init__(self, in_features, out_features, **kwargs) - if not isinstance(r, int) or r <= 0: - raise ValueError("Lora rank r should be a positive integer") - if pissa: - raise ValueError("Pissa is not supported in model parallel by now") - self.r = r - self.lora_alpha = lora_alpha - # Optional dropout - if lora_dropout > 0.0: - self.lora_dropout = nn.Dropout(p=lora_dropout) - else: - self.lora_dropout = lambda x: x - # Mark the weight as unmerged - self.merged = False - self.merge_weights = merge_weights - - # compatible - self.name = self._name - - # Actual trainable parameters - self.lora_A = self.create_parameter( - shape=[in_features, r], - dtype=self._dtype, - is_bias=False, - attr=lora_A_weight_attr, - ) - self.lora_A.is_distributed = False - mark_as_sequence_parallel_parameter(self.lora_A) - - self.lora_B = self.create_parameter( - shape=[r, self.output_size_per_partition], - dtype=self._dtype, - is_bias=False, - attr=paddle.ParamAttr( - initializer=paddle.nn.initializer.Constant(value=0.0), - learning_rate=lora_plus_scale, - ), - ) - - self.lora_B.is_distributed = True - self.lora_B.split_axis = 1 - if not rslora: - self.scaling = self.lora_alpha / self.r - else: - self.scaling = 
self.lora_alpha / math.sqrt(self.r) - - # Freezing the pre-trained weight matrix - self.weight.stop_gradient = True - self._use_quick_lora = use_quick_lora and lora_dropout == 0.0 - - @property - def use_quick_lora(self): - # TODO(@gexiao): support qlora - return False # self._use_quick_lora and self.training and not self.merged - - def train(self): - super().train() - if self.merge_weights and self.merged: - # Make sure that the weights are not merged - new_weight = self.weight - self.lora_A @ self.lora_B * self.scaling - self.weight.set_value(new_weight) - self.merged = False - - def eval(self): - super().eval() - if self.merge_weights and not self.merged: - # Merge the weights and mark it - new_weight = self.weight + self.lora_A @ self.lora_B * self.scaling - self.weight.set_value(new_weight) - self.merged = True - - def forward(self, x: paddle.Tensor): - if not is_mc2_valid(): - if self.is_mp: - input_parallel = AllGatherOp.apply(x) - else: - input_parallel = x - result_mp = self.linear(input_parallel, self.weight, self.bias, name=self._name) - else: - result_mp = MC2ColumnSeqParallelLinear.apply(x, self.weight, self.model_parallel_group) - if self.bias is not None: - result_mp += self.bias - - if not self.merged: - input_a = self.lora_dropout(x) @ self.lora_A - if not is_mc2_valid(): - input_a = AllGatherOp.apply(input_a) - delta_mp = (input_a @ self.lora_B) * self.scaling - else: - input_a = MC2ColumnSeqParallelLinear.apply(input_a, self.lora_B, self.model_parallel_group) - delta_mp = input_a * self.scaling - result_mp += delta_mp - - if self.gather_output and self.is_mp: - result = mp_ops._c_concat(result_mp, group=self.model_parallel_group) - else: - result = result_mp - return result - - def extra_repr(self): - name = f", name={self.name}" if self.name else "" - return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}" - - class LoRAMergedLinear(nn.Linear): # LoRA implemented in a dense layer with merged linear weights for q, k, v def __init__( diff --git a/paddlenlp/peft/lora/lora_model.py b/paddlenlp/peft/lora/lora_model.py index 57d3bb3f2205..1bbd0284823c 100644 --- a/paddlenlp/peft/lora/lora_model.py +++ b/paddlenlp/peft/lora/lora_model.py @@ -48,12 +48,10 @@ from .lora_layers import ( ColumnParallelLoRALinear, ColumnParallelLoRAMergedLinear, - ColumnSequenceParallelLoRALinear, LoRAConv2D, LoRALinear, LoRAMergedLinear, RowParallelLoRALinear, - RowSequenceParallelLoRALinear, ) try: @@ -75,19 +73,6 @@ ColumnParallelQuantizationLoRALinear = None RowParallelQuantizationLoRALinear = None -try: - from paddle.distributed.fleet.utils.sequence_parallel_utils import ( - ColumnSequenceParallelLinear, - RowSequenceParallelLinear, - ) -except: - - class ColumnSequenceParallelLinear: - pass - - class RowSequenceParallelLinear: - pass - class LoRAModel(nn.Layer): # TODO:lugimzzz support restore in following PR @@ -469,60 +454,6 @@ def _find_and_replace_module(self, model, module_name, lora_config, enable_lora) # Lora column parallel will spilt lora A matrix self.add_lora_split_mapping(module_name + ".lora_A", is_column=False) - # for lora qat - if self.lora_config.do_qat: - self.add_lora_split_mapping(module_name + ".weight_quanter._scale", is_column=False) - self.add_lora_split_mapping(module_name + ".activation_quanter._scale", is_column=False) - self.add_lora_split_mapping(module_name + ".activation_quanter.quanter._scale", is_column=False) - elif isinstance(module, ColumnSequenceParallelLinear): - # recover the original output_features - 
output_features = module.weight.shape[1] * module.world_size - lora_module = ColumnSequenceParallelLoRALinear( - in_features=module.weight.shape[0], - out_features=output_features, - gather_output=module.gather_output, - has_bias=module.bias is not None, - r=lora_config.r, - lora_alpha=lora_config.lora_alpha, - lora_dropout=lora_config.lora_dropout, - rslora=lora_config.rslora, - lora_plus_scale=lora_config.lora_plus_scale, - pissa=lora_config.pissa, - merge_weights=lora_config.merge_weights, - lora_A_weight_attr=paddle.ParamAttr( - initializer=nn.initializer.KaimingUniform( - negative_slope=math.sqrt(5), nonlinearity="leaky_relu" - ) - ), - use_quick_lora=lora_config.use_quick_lora, - ) - # Lora column parallel will spilt lora B matrix - self.add_lora_split_mapping(module_name + ".lora_B", is_column=True) - - # for lora qat - if self.lora_config.do_qat: - self.add_lora_split_mapping(module_name + ".weight_quanter._scale", is_column=True) - self.add_lora_split_mapping(module_name + ".activation_quanter._scale", is_column=False) - self.add_lora_split_mapping(module_name + ".activation_quanter.quanter._scale", is_column=False) - elif isinstance(module, RowSequenceParallelLinear): - # recover the original output_features - lora_module = RowSequenceParallelLoRALinear( - in_features=module.weight.shape[0] * module.world_size, - out_features=module.weight.shape[1], - has_bias=module.bias is not None, - input_is_parallel=module.input_is_parallel, - r=lora_config.r, - lora_alpha=lora_config.lora_alpha, - lora_dropout=lora_config.lora_dropout, - rslora=lora_config.rslora, - lora_plus_scale=lora_config.lora_plus_scale, - pissa=lora_config.pissa, - merge_weights=lora_config.merge_weights, - use_quick_lora=lora_config.use_quick_lora, - ) - # Lora column parallel will spilt lora A matrix - self.add_lora_split_mapping(module_name + ".lora_A", is_column=False) - # for lora qat if self.lora_config.do_qat: self.add_lora_split_mapping(module_name + ".weight_quanter._scale", is_column=False) @@ -666,8 +597,6 @@ def mark_only_lora_as_trainable(self) -> None: or isinstance(layer, LoRAConv2D) or isinstance(layer, ColumnParallelLoRALinear) or isinstance(layer, RowParallelLoRALinear) - or isinstance(layer, ColumnSequenceParallelLoRALinear) - or isinstance(layer, RowSequenceParallelLoRALinear) or isinstance(layer, LoRAMergedLinear) or isinstance(layer, ColumnParallelLoRAMergedLinear) or (QuantizationLoRALinear is not None and isinstance(layer, QuantizationLoRALinear)) @@ -755,11 +684,9 @@ def restore_original_model(self): self._find_and_restore_module(layer_name) elif ( isinstance(layer, ColumnParallelLoRALinear) - or isinstance(layer, ColumnSequenceParallelLoRALinear) or isinstance(layer, LoRAConv2D) or isinstance(layer, ColumnParallelLoRAMergedLinear) or isinstance(layer, RowParallelLoRALinear) - or isinstance(layer, RowSequenceParallelLoRALinear) or (QuantizationLoRALinear is not None and isinstance(layer, QuantizationLoRALinear)) or ( ColumnParallelQuantizationLoRALinear is not None From 590cee9812d052e8664b2551133af485534ebac8 Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Tue, 23 Apr 2024 19:22:35 +0800 Subject: [PATCH 07/27] Support Llama3 (#8315) * support llama-3 * Add llama-3 tokenizer * fix for llama3 --- llm/finetune_generation.py | 3 +- paddlenlp/transformers/auto/tokenizer.py | 21 +- paddlenlp/transformers/llama/configuration.py | 2 + paddlenlp/transformers/llama/modeling.py | 9 +- paddlenlp/transformers/llama/tokenizer.py | 289 +++++++++++++++++- 5 files changed, 312 insertions(+), 12 deletions(-) 
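
The diffs that follow wire Llama 3 support through three places: `AutoTokenizer` learns to resolve a `tokenizer_class` that is missing from its internal name mapping, `LlamaConfig` gains a `rope_theta` field (default 10000.0) that is forwarded to every rotary-embedding variant, and a tiktoken-based `Llama3Tokenizer` is added alongside `LlamaTokenizer`. A minimal usage sketch, assuming a local Llama 3 checkpoint directory (the path below is a placeholder) and `tiktoken` installed:

```python
# Illustrative usage sketch; the checkpoint path is a placeholder, not something
# shipped by this patch. It exercises the rope_theta config field and the
# pad-token handling added for Llama3Tokenizer in finetune_generation.py.
from paddlenlp.transformers import AutoTokenizer, LlamaConfig

config = LlamaConfig()
print(config.rope_theta)  # 10000.0 unless the checkpoint's config.json overrides it

tokenizer = AutoTokenizer.from_pretrained("/path/to/llama3-checkpoint")  # resolves tokenizer_class -> Llama3Tokenizer
tokenizer.pad_token_id = tokenizer.eos_token_id  # mirrors the finetune_generation.py change
```
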
diff --git a/llm/finetune_generation.py b/llm/finetune_generation.py index 3a4def7db46c..df7a22a0cb95 100644 --- a/llm/finetune_generation.py +++ b/llm/finetune_generation.py @@ -45,6 +45,7 @@ AutoConfig, AutoModelForCausalLM, AutoTokenizer, + Llama3Tokenizer, LlamaTokenizer, ) from paddlenlp.utils.log import logger @@ -232,7 +233,7 @@ def neft_post_hook(module, input, output): if tokenizer.chat_template is not None: data_args.eval_with_do_generation = False - if isinstance(tokenizer, LlamaTokenizer): + if isinstance(tokenizer, LlamaTokenizer) or isinstance(tokenizer, Llama3Tokenizer): tokenizer.pad_token_id = tokenizer.eos_token_id if data_args.dataset_name_or_path is None: diff --git a/paddlenlp/transformers/auto/tokenizer.py b/paddlenlp/transformers/auto/tokenizer.py index 451468741ea1..083ab3037311 100644 --- a/paddlenlp/transformers/auto/tokenizer.py +++ b/paddlenlp/transformers/auto/tokenizer.py @@ -189,13 +189,20 @@ def _get_tokenizer_class_from_config(cls, pretrained_model_name_or_path, config_ init_class = init_kwargs.pop("tokenizer_class", None) if init_class: - class_name = cls._name_mapping[init_class] - import_class = import_module(f"paddlenlp.transformers.{class_name}.tokenizer") - tokenizer_class = getattr(import_class, init_class) - if use_fast: - fast_tokenizer_class = cls._get_fast_tokenizer_class(init_class, class_name) - tokenizer_class = fast_tokenizer_class if fast_tokenizer_class else tokenizer_class - return tokenizer_class + if init_class in cls._name_mapping: + class_name = cls._name_mapping[init_class] + import_class = import_module(f"paddlenlp.transformers.{class_name}.tokenizer") + tokenizer_class = getattr(import_class, init_class) + if use_fast: + fast_tokenizer_class = cls._get_fast_tokenizer_class(init_class, class_name) + tokenizer_class = fast_tokenizer_class if fast_tokenizer_class else tokenizer_class + return tokenizer_class + else: + import_class = import_module("paddlenlp.transformers") + tokenizer_class = getattr(import_class, init_class, None) + assert tokenizer_class is not None, f"Can't find tokenizer {init_class}" + return tokenizer_class + # If no `init_class`, we use pattern recognition to recognize the tokenizer class. 
else: # TODO: Potential issue https://github.com/PaddlePaddle/PaddleNLP/pull/3786#discussion_r1024689810 diff --git a/paddlenlp/transformers/llama/configuration.py b/paddlenlp/transformers/llama/configuration.py index 68459f025fe4..e0b051b7434f 100644 --- a/paddlenlp/transformers/llama/configuration.py +++ b/paddlenlp/transformers/llama/configuration.py @@ -147,6 +147,7 @@ def __init__( num_key_value_heads=None, initializer_range=0.02, rms_norm_eps=1e-6, + rope_theta=10000.0, use_cache=True, use_recompute=False, recompute_granularity="full", @@ -188,6 +189,7 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps + self.rope_theta = rope_theta self.use_cache = use_cache self.use_recompute = use_recompute diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index b0b08c30241a..5cb13f7aa61a 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -813,24 +813,28 @@ def _init_rope(self): self.rotary_emb = LlamaRotaryEmbedding( self.head_dim, max_position_embeddings=self.max_position_embeddings, + base=self.config.rope_theta, ) elif self.config.rope_scaling_type == "linear": self.rotary_emb = LlamaLinearScalingRotaryEmbedding( self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=self.config.rope_scaling_factor, + base=self.config.rope_theta, ) elif self.config.rope_scaling_type == "ntk": self.rotary_emb = LlamaNTKScalingRotaryEmbedding( self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=self.config.rope_scaling_factor, + base=self.config.rope_theta, ) elif self.config.rope_scaling_type == "dynamic_ntk": self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding( self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=self.config.rope_scaling_factor, + base=self.config.rope_theta, ) else: raise ValueError(f"Unknown RoPE scaling type {self.config.rope_scaling_type}") @@ -903,6 +907,7 @@ def forward( query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) + if self.reshard_layer is not None: if self.sequence_parallel: assert self.seq_length % self.config.sep_parallel_degree == 0 @@ -1027,7 +1032,6 @@ def forward( value_states = paddle.concat([past_key_value[1], value_states], axis=1) past_key_value = (key_states, value_states) if use_cache else None - if self.kv_indices is not None: key_states = paddle.index_select(key_states, self.kv_indices, axis=2) value_states = paddle.index_select(value_states, self.kv_indices, axis=2) @@ -1036,7 +1040,7 @@ def forward( # repeat k/v heads if n_kv_heads < n_heads # paddle version > 2.6 or develop support flash-attn with gqa/mqa paddle_version = float(paddle.__version__[:3]) - if (paddle_version != 0.0) and (paddle_version <= 2.6): + if not self.config.use_flash_attention or ((paddle_version != 0.0) and (paddle_version <= 2.6)): key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) @@ -1560,7 +1564,6 @@ def forward( else: attention_mask = attention_mask.astype("bool") hidden_states = inputs_embeds - # decoder layers all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None diff --git a/paddlenlp/transformers/llama/tokenizer.py b/paddlenlp/transformers/llama/tokenizer.py index 4efaa48f797c..6f19530c05cb 100644 --- a/paddlenlp/transformers/llama/tokenizer.py 
+++ b/paddlenlp/transformers/llama/tokenizer.py @@ -24,7 +24,7 @@ from .. import PretrainedTokenizer from ..tokenizer_utils_base import BatchEncoding, EncodedInput, PaddingStrategy -__all__ = ["LlamaTokenizer"] +__all__ = ["LlamaTokenizer", "Llama3Tokenizer"] class LlamaTokenizer(PretrainedTokenizer): @@ -199,6 +199,7 @@ def create_token_type_ids_from_sequences( """ Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make use of token type ids, therefore a list of zeros is returned. + Args: token_ids_0 (`List[int]`): List of IDs. @@ -270,3 +271,289 @@ def _pad( constant_values=0, ) return encoded_inputs + + +"""Copied Tokenization classes for QWen.""" + +import base64 +import unicodedata +from typing import Collection, Dict, List, Optional, Set, Tuple, Union + +from ...utils.import_utils import is_tiktoken_available +from .. import PretrainedTokenizer +from ..tokenizer_utils_base import ( + AddedToken, + BatchEncoding, + EncodedInput, + PaddingStrategy, +) + +VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"} + +PAT_STR = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" +BEGINOFTEXT = "<|begin_of_text|>" +ENDOFTEXT = "<|end_of_text|>" +IMSTART = "<|start_header_id|>" +IMEND = "<|end_header_id|>" +# as the default behavior is changed to allow special tokens in +# regular texts, the surface forms of special tokens need to be +# as different as possible to minimize the impact +EXTRAS = tuple((f"<|reserved_special_token_{i}|>" for i in range(250))) +SPECIAL_TOKENS = (BEGINOFTEXT, ENDOFTEXT) + EXTRAS[0:4] + (IMSTART, IMEND) + EXTRAS[4:] + +tiktoken = None + + +def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]: + with open(tiktoken_bpe_file, "rb") as f: + contents = f.read() + return { + base64.b64decode(token): int(rank) for token, rank in (line.split() for line in contents.splitlines() if line) + } + + +class Llama3Tokenizer(PretrainedTokenizer): + """QWen tokenizer.""" + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + resource_files_names = VOCAB_FILES_NAMES + + def __init__( + self, + vocab_file, + errors="replace", + padding_side="left", + **kwargs, + ): + super().__init__(**kwargs) + if not is_tiktoken_available(): + raise ValueError("tiktoken is not installed, please install it use: pip install tiktoken") + + import tiktoken as tk + + tiktoken = tk + + self.errors = errors # how to handle errors in decoding + + self.mergeable_ranks = _load_tiktoken_bpe(vocab_file) # type: dict[bytes, int] + self.special_tokens = { + token: index for index, token in enumerate(SPECIAL_TOKENS, start=len(self.mergeable_ranks)) + } + enc = tiktoken.Encoding( + "Llama3", + pat_str=PAT_STR, + mergeable_ranks=self.mergeable_ranks, + special_tokens=self.special_tokens, + ) + assert ( + len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab + ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding" + + self.decoder = {v: k for k, v in self.mergeable_ranks.items()} # type: dict[int, bytes|str] + self.decoder.update({v: k for k, v in self.special_tokens.items()}) + + self.tokenizer = enc # type: tiktoken.Encoding + + self.eod_id = self.special_tokens[ENDOFTEXT] + self.start_header_id = self.special_tokens[IMSTART] + self.end_header_id = self.special_tokens[IMEND] + + if "pad_token_id" in kwargs: + self.pad_token_id = kwargs["pad_token_id"] + if "eos_token_id" in kwargs: + self.eos_token_id 
= kwargs["eos_token_id"] + + def __len__(self) -> int: + return self.tokenizer.n_vocab + + def get_vocab(self) -> Dict[bytes, int]: + return self.mergeable_ranks + + def convert_tokens_to_ids(self, tokens: Union[bytes, str, List[Union[bytes, str]]]) -> List[int]: + ids = [] + if isinstance(tokens, (str, bytes)): + if tokens in self.special_tokens: + return self.special_tokens[tokens] + else: + return self.mergeable_ranks.get(tokens) + for token in tokens: + if token in self.special_tokens: + ids.append(self.special_tokens[token]) + else: + ids.append(self.mergeable_ranks.get(token)) + return ids + + def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int: + if not special_tokens and new_tokens: + raise ValueError("Adding regular tokens is not supported") + for token in new_tokens: + surface_form = token.content if isinstance(token, AddedToken) else token + if surface_form not in SPECIAL_TOKENS: + raise ValueError("Adding unknown special tokens is not supported") + return 0 + + def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]: + """ + Save only the vocabulary of the tokenizer (vocabulary). + + Returns: + `Tuple(str)`: Paths to the files saved. + """ + file_path = os.path.join(save_directory, "tokenizer.model") + with open(file_path, "w", encoding="utf8") as w: + for k, v in self.mergeable_ranks.items(): + line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n" + w.write(line) + return (file_path,) + + def tokenize( + self, + text: str, + allowed_special: Union[Set, str] = "all", + disallowed_special: Union[Collection, str] = (), + **kwargs, + ) -> List[Union[bytes, str]]: + """ + Converts a string in a sequence of tokens. + + Args: + text (`str`): + The sequence to be encoded. + allowed_special (`Literal["all"]` or `set`): + The surface forms of the tokens to be encoded as special tokens in regular texts. + Default to "all". + disallowed_special (`Literal["all"]` or `Collection`): + The surface forms of the tokens that should not be in regular texts and trigger errors. + Default to an empty tuple. + + kwargs (additional keyword arguments, *optional*): + Will be passed to the underlying model specific encode method. + + Returns: + `List[bytes|str]`: The list of tokens. + """ + tokens = [] + text = unicodedata.normalize("NFC", text) + + # this implementation takes a detour: text -> token id -> token surface forms + for t in self.tokenizer.encode(text, allowed_special=allowed_special, disallowed_special=disallowed_special): + tokens.append(self.decoder[t]) + return tokens + + def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str: + """ + Converts a sequence of tokens in a single string. 
+ """ + text = "" + temp = b"" + for t in tokens: + if isinstance(t, str): + if temp: + text += temp.decode("utf-8", errors=self.errors) + temp = b"" + text += t + elif isinstance(t, bytes): + temp += t + else: + raise TypeError("token should only be of type types or str") + if temp: + text += temp.decode("utf-8", errors=self.errors) + return text + + @property + def vocab_size(self): + return self.tokenizer.n_vocab + + def _convert_id_to_token(self, index: int) -> Union[bytes, str]: + """Converts an id to a token, special tokens included""" + if index in self.decoder: + return self.decoder[index] + raise ValueError("unknown ids") + + def _convert_token_to_id(self, token: Union[bytes, str]) -> int: + """Converts a token to an id using the vocab, special tokens included""" + if token in self.special_tokens: + return self.special_tokens[token] + if token in self.mergeable_ranks: + return self.mergeable_ranks[token] + raise ValueError("unknown token") + + def _tokenize(self, text: str, **kwargs): + """ + Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based + vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces). + + Do NOT take care of added tokens. + """ + raise NotImplementedError + + def _decode( + self, + token_ids: Union[int, List[int]], + skip_special_tokens: bool = False, + errors: str = None, + **kwargs, + ) -> str: + if isinstance(token_ids, int): + token_ids = [token_ids] + if skip_special_tokens: + token_ids = [i for i in token_ids if i < self.eod_id] + return self.tokenizer.decode(token_ids, errors=errors or self.errors) + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + >= 7.5 (Volta). 
+ return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + + # attention_mask shape [1,seq_len,seq_len] + if "attention_mask" in encoded_inputs and len(np.shape(encoded_inputs["attention_mask"])) > 2: + attention_mask = encoded_inputs["attention_mask"] + encoded_inputs.pop("attention_mask") + else: + attention_mask = None + + required_input = encoded_inputs[self.model_input_names[0]] + encoded_inputs = super()._pad( + encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, return_attention_mask + ) + if attention_mask is not None and len(np.shape(attention_mask)) > 2: + encoded_inputs["attention_mask"] = attention_mask + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + if needs_to_be_padded: + difference = max_length - len(required_input) + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = np.pad( + encoded_inputs["attention_mask"], + pad_width=[(0, 0), (difference, 0), (difference, 0)], + mode="constant", + constant_values=0, + ) + return encoded_inputs From 871070d95e9a48aaa47ebf5f7936532e856c02fa Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Wed, 24 Apr 2024 11:33:53 +0800 Subject: [PATCH 08/27] bug fixer (#8314) (#8318) --- paddlenlp/trainer/trainer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py index d8487364076b..3e8fc333fe95 100644 --- a/paddlenlp/trainer/trainer.py +++ b/paddlenlp/trainer/trainer.py @@ -1011,6 +1011,7 @@ def _inner_training_loop( self.timers and self.timers("optimizer-step").start() if self.args.gradient_accumulation_steps > 1 and self._enable_delay_scale_loss(): + paddle.device.synchronize() for p in model._layers.parameters(): with paddle.no_grad(): if hasattr(p, "main_grad") and p.main_grad is not None: From 0f428bbe47daed3cd861f7047c3e9acbec4ea0b1 Mon Sep 17 00:00:00 2001 From: Tian <121000916+SylarTiaNII@users.noreply.github.com> Date: Tue, 23 Apr 2024 23:18:11 +0800 Subject: [PATCH 09/27] [Distributed] [CustomDevices] Adapt SP on lora && polish MC2 APIs (#8303) * [Distributed] adapt sequence parallel on LoRA (#8235) * [Distributed] [CustomDevices] adapt lora sp && polish MC2 APIs --- paddlenlp/peft/lora/lora_layers.py | 278 ++++++++++++++++-- paddlenlp/peft/lora/lora_model.py | 85 +++++- paddlenlp/peft/lora/mc2_lora_npu.py | 80 ----- paddlenlp/transformers/llama/modeling.py | 25 +- paddlenlp/transformers/mc2_parallel_linear.py | 230 +++++++++++++++ .../mc2_seqence_parallel_linear.py | 146 --------- 6 files changed, 572 insertions(+), 272 deletions(-) delete mode 100644 paddlenlp/peft/lora/mc2_lora_npu.py create mode 100644 paddlenlp/transformers/mc2_parallel_linear.py delete mode 100644 paddlenlp/transformers/mc2_seqence_parallel_linear.py diff --git a/paddlenlp/peft/lora/lora_layers.py b/paddlenlp/peft/lora/lora_layers.py index e0c79c47a87a..a31f7c3a33b1 100644 --- a/paddlenlp/peft/lora/lora_layers.py +++ b/paddlenlp/peft/lora/lora_layers.py @@ -13,7 +13,6 @@ # limitations under the License. 
import math -import os from typing import List, Optional import paddle @@ -25,13 +24,25 @@ RowParallelLinear, ) -from .lora_quick_layers import quick_lora +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + AllGatherOp, + ColumnSequenceParallelLinear, + ReduceScatterOp, + RowSequenceParallelLinear, + mark_as_sequence_parallel_parameter, + ) +except: + pass + +from paddlenlp.transformers.mc2_parallel_linear import ( + MC2ColumnParallelCoreLinear, + MC2ColumnSeqParallelCoreLinear, + MC2RowParallelCoreLinear, + MC2RowSeqParallelCoreLinear, +) -if "npu" in paddle.device.get_all_custom_device_type(): - from .mc2_lora_npu import MC2LoRaColumnParallelLinear, MC2LoRaRowParallelLinear -else: - MC2LoRaRowParallelLinear = None - MC2LoRaColumnParallelLinear = None +from .lora_quick_layers import quick_lora class LoRALinear(nn.Linear): @@ -266,9 +277,7 @@ def forward(self, x: paddle.Tensor): ) else: # x @ W : [bz, in_f / ws] ===> [bz, out_f] - if "npu" in paddle.device.get_all_custom_device_type() and int(os.getenv("MC2", "0")): - output = MC2LoRaRowParallelLinear.apply(input_mp, self.weight, self.model_parallel_group) - else: + if MC2RowParallelCoreLinear is None: result_mp = F.linear(x=input_mp, weight=self.weight, name=self.name) output = mp_ops._mp_allreduce( result_mp, @@ -276,6 +285,8 @@ def forward(self, x: paddle.Tensor): use_calc_stream=True, use_model_parallel=True, ) + else: + output = MC2RowParallelCoreLinear.apply(input_mp, self.weight, self.model_parallel_group) if not self.merged: # x @ A: [bz, in_f/ ws] ===> [bz, r] @@ -298,6 +309,120 @@ def extra_repr(self): return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}" +class RowSequenceParallelLoRALinear(RowSequenceParallelLinear): + def __init__( + self, + in_features: int, + out_features: int, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + rslora: bool = False, + lora_plus_scale: float = 1.0, + merge_weights: bool = True, + use_quick_lora: bool = False, + **kwargs + ): + RowSequenceParallelLinear.__init__(self, in_features, out_features, **kwargs) + if not isinstance(r, int) or r <= 0: + raise ValueError("Lora rank r should be a positive integer") + self.r = r + self.lora_alpha = lora_alpha + # Optional dropout + if lora_dropout > 0.0: + self.lora_dropout = nn.Dropout(p=lora_dropout) + else: + self.lora_dropout = lambda x: x + # Mark the weight as unmerged + self.merged = False + self.merge_weights = merge_weights + + # compatible + self.name = self._name + + # Actual trainable parameters + self.lora_A = self.create_parameter( + shape=[self.input_size_per_partition, r], + dtype=self._dtype, + is_bias=False, + attr=paddle.ParamAttr( + initializer=nn.initializer.KaimingUniform(negative_slope=math.sqrt(5), nonlinearity="leaky_relu") + ), + ) + self.lora_B = self.create_parameter( + shape=[r, self.out_features], + dtype=self._dtype, + is_bias=False, + attr=paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(value=0.0), + learning_rate=lora_plus_scale, + ), + ) + + self.lora_A.is_distributed = True + self.lora_A.split_axis = 0 + self.lora_B.is_distributed = False + mark_as_sequence_parallel_parameter(self.lora_B) + if not rslora: + self.scaling = self.lora_alpha / self.r + else: + self.scaling = self.lora_alpha / math.sqrt(self.r) + + # Freezing the pre-trained weight matrix + self.weight.stop_gradient = True + self._use_quick_lora = use_quick_lora and lora_dropout == 0.0 + + @property + def use_quick_lora(self): + # TODO(@gexiao): support 
qlora + return False # self._use_quick_lora and self.training and not self.merged + + def train(self): + super().train() + if self.merge_weights and self.merged: + # Make sure that the weights are not merged + new_weight = self.weight - self.lora_A @ self.lora_B * self.scaling + self.weight.set_value(new_weight) + self.merged = False + + def eval(self): + super().eval() + if self.merge_weights and not self.merged: + # Merge the weights and mark it + new_weight = self.weight + self.lora_A @ self.lora_B * self.scaling + self.weight.set_value(new_weight) + self.merged = True + + def forward(self, x: paddle.Tensor): + if not self.input_is_parallel: + input_mp = mp_ops._c_split(x, group=self.model_parallel_group) + else: + input_mp = x + + if MC2RowSeqParallelCoreLinear is None: + output_parallel = self.linear(input_mp, self.weight, name=self._name) + output_ = ReduceScatterOp.apply(output_parallel) + result_mp = output_ + self.bias if self.bias is not None else output_ + else: + output_ = MC2RowSeqParallelCoreLinear.apply(input_mp, self.weight, self.model_parallel_group) + result_mp = output_ + self.bias if self.bias is not None else output_ + + if not self.merged: + input_mp = self.lora_dropout(input_mp) + if MC2RowSeqParallelCoreLinear is None: + input_mp = input_mp @ self.lora_A + input_mp = ReduceScatterOp.apply(input_mp) + else: + input_mp = MC2RowSeqParallelCoreLinear.apply(input_mp, self.lora_A, self.model_parallel_group) + delta_mp = (input_mp @ self.lora_B) * self.scaling + result_mp += delta_mp + return result_mp + + def extra_repr(self): + name = f", name={self.name}" if self.name else "" + return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}" + + class ColumnParallelLoRALinear(ColumnParallelLinear): def __init__( self, @@ -400,21 +525,21 @@ def forward(self, input: paddle.Tensor): world_size=self.world_size, ) else: - if "npu" in paddle.device.get_all_custom_device_type() and int(os.getenv("MC2", "0")): - res_mp = MC2LoRaColumnParallelLinear.apply(input, self.weight, self.model_parallel_group) - result_mp = res_mp + self.bias - else: + if MC2ColumnParallelCoreLinear is None: input_mp = mp_ops._c_identity(input, group=self.model_parallel_group) result_mp = F.linear(x=input_mp, weight=self.weight, bias=self.bias, name=self.name) + else: + res_mp = MC2ColumnParallelCoreLinear.apply(input, self.weight, self.model_parallel_group) + result_mp = res_mp + self.bias if not self.merged: input_a = self.lora_dropout(input) @ self.lora_A - if "npu" in paddle.device.get_all_custom_device_type() and int(os.getenv("MC2", "0")): - tmp = MC2LoRaColumnParallelLinear.apply(input_a, self.lora_B, self.model_parallel_group) - delta_mp = tmp * self.scaling - else: + if MC2ColumnParallelCoreLinear is None: input_a_mp = mp_ops._c_identity(input_a, group=self.model_parallel_group) delta_mp = (input_a_mp @ self.lora_B) * self.scaling + else: + tmp = MC2ColumnParallelCoreLinear.apply(input_a, self.lora_B, self.model_parallel_group) + delta_mp = tmp * self.scaling result_mp += delta_mp if self.gather_output and self.is_mp: @@ -428,6 +553,123 @@ def extra_repr(self): return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}" +class ColumnSequenceParallelLoRALinear(ColumnSequenceParallelLinear): + def __init__( + self, + in_features: int, + out_features: int, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + rslora: bool = False, + lora_plus_scale: float = 1.0, + merge_weights: bool = True, + 
lora_A_weight_attr: Optional[paddle.ParamAttr] = None, + use_quick_lora: bool = False, + **kwargs + ): + ColumnSequenceParallelLinear.__init__(self, in_features, out_features, **kwargs) + if not isinstance(r, int) or r <= 0: + raise ValueError("Lora rank r should be a positive integer") + self.r = r + self.lora_alpha = lora_alpha + # Optional dropout + if lora_dropout > 0.0: + self.lora_dropout = nn.Dropout(p=lora_dropout) + else: + self.lora_dropout = lambda x: x + # Mark the weight as unmerged + self.merged = False + self.merge_weights = merge_weights + + # compatible + self.name = self._name + + # Actual trainable parameters + self.lora_A = self.create_parameter( + shape=[in_features, r], + dtype=self._dtype, + is_bias=False, + attr=lora_A_weight_attr, + ) + self.lora_A.is_distributed = False + mark_as_sequence_parallel_parameter(self.lora_A) + + self.lora_B = self.create_parameter( + shape=[r, self.output_size_per_partition], + dtype=self._dtype, + is_bias=False, + attr=paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(value=0.0), + learning_rate=lora_plus_scale, + ), + ) + + self.lora_B.is_distributed = True + self.lora_B.split_axis = 1 + if not rslora: + self.scaling = self.lora_alpha / self.r + else: + self.scaling = self.lora_alpha / math.sqrt(self.r) + + # Freezing the pre-trained weight matrix + self.weight.stop_gradient = True + self._use_quick_lora = use_quick_lora and lora_dropout == 0.0 + + @property + def use_quick_lora(self): + # TODO(@gexiao): support qlora + return False # self._use_quick_lora and self.training and not self.merged + + def train(self): + super().train() + if self.merge_weights and self.merged: + # Make sure that the weights are not merged + new_weight = self.weight - self.lora_A @ self.lora_B * self.scaling + self.weight.set_value(new_weight) + self.merged = False + + def eval(self): + super().eval() + if self.merge_weights and not self.merged: + # Merge the weights and mark it + new_weight = self.weight + self.lora_A @ self.lora_B * self.scaling + self.weight.set_value(new_weight) + self.merged = True + + def forward(self, x: paddle.Tensor): + if MC2ColumnSeqParallelCoreLinear is None: + if self.is_mp: + input_parallel = AllGatherOp.apply(x) + else: + input_parallel = x + result_mp = self.linear(input_parallel, self.weight, self.bias, name=self._name) + else: + result_mp = MC2ColumnSeqParallelCoreLinear.apply(x, self.weight, self.model_parallel_group) + if self.bias is not None: + result_mp += self.bias + + if not self.merged: + input_a = self.lora_dropout(x) @ self.lora_A + if MC2ColumnSeqParallelCoreLinear is None: + input_a = AllGatherOp.apply(input_a) + delta_mp = (input_a @ self.lora_B) * self.scaling + else: + input_a = MC2ColumnSeqParallelCoreLinear.apply(input_a, self.lora_B, self.model_parallel_group) + delta_mp = input_a * self.scaling + result_mp += delta_mp + + if self.gather_output and self.is_mp: + result = mp_ops._c_concat(result_mp, group=self.model_parallel_group) + else: + result = result_mp + return result + + def extra_repr(self): + name = f", name={self.name}" if self.name else "" + return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}" + + class LoRAMergedLinear(nn.Linear): # LoRA implemented in a dense layer with merged linear weights for q, k, v def __init__( diff --git a/paddlenlp/peft/lora/lora_model.py b/paddlenlp/peft/lora/lora_model.py index 1bbd0284823c..41ab1e681e24 100644 --- a/paddlenlp/peft/lora/lora_model.py +++ b/paddlenlp/peft/lora/lora_model.py @@ -45,14 
+45,25 @@ from ...utils.env import LORA_WEIGHTS_NAME, SAFE_PEFT_WEIGHTS_INDEX_NAME from ...utils.log import logger from .lora_config import LoRAConfig -from .lora_layers import ( - ColumnParallelLoRALinear, - ColumnParallelLoRAMergedLinear, - LoRAConv2D, - LoRALinear, - LoRAMergedLinear, - RowParallelLoRALinear, -) + +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + ColumnSequenceParallelLinear, + RowSequenceParallelLinear, + ) + + from .lora_layers import ( + ColumnParallelLoRALinear, + ColumnParallelLoRAMergedLinear, + ColumnSequenceParallelLoRALinear, + LoRAConv2D, + LoRALinear, + LoRAMergedLinear, + RowParallelLoRALinear, + RowSequenceParallelLoRALinear, + ) +except: + pass try: from ...quantization.quantization_linear import ( @@ -454,6 +465,58 @@ def _find_and_replace_module(self, model, module_name, lora_config, enable_lora) # Lora column parallel will spilt lora A matrix self.add_lora_split_mapping(module_name + ".lora_A", is_column=False) + # for lora qat + if self.lora_config.do_qat: + self.add_lora_split_mapping(module_name + ".weight_quanter._scale", is_column=False) + self.add_lora_split_mapping(module_name + ".activation_quanter._scale", is_column=False) + self.add_lora_split_mapping(module_name + ".activation_quanter.quanter._scale", is_column=False) + elif isinstance(module, ColumnSequenceParallelLinear): + # recover the original output_features + output_features = module.weight.shape[1] * module.world_size + lora_module = ColumnSequenceParallelLoRALinear( + in_features=module.weight.shape[0], + out_features=output_features, + gather_output=module.gather_output, + has_bias=module.bias is not None, + r=lora_config.r, + lora_alpha=lora_config.lora_alpha, + lora_dropout=lora_config.lora_dropout, + rslora=lora_config.rslora, + lora_plus_scale=lora_config.lora_plus_scale, + merge_weights=lora_config.merge_weights, + lora_A_weight_attr=paddle.ParamAttr( + initializer=nn.initializer.KaimingUniform( + negative_slope=math.sqrt(5), nonlinearity="leaky_relu" + ) + ), + use_quick_lora=lora_config.use_quick_lora, + ) + # Lora column parallel will spilt lora B matrix + self.add_lora_split_mapping(module_name + ".lora_B", is_column=True) + + # for lora qat + if self.lora_config.do_qat: + self.add_lora_split_mapping(module_name + ".weight_quanter._scale", is_column=True) + self.add_lora_split_mapping(module_name + ".activation_quanter._scale", is_column=False) + self.add_lora_split_mapping(module_name + ".activation_quanter.quanter._scale", is_column=False) + elif isinstance(module, RowSequenceParallelLinear): + # recover the original output_features + lora_module = RowSequenceParallelLoRALinear( + in_features=module.weight.shape[0] * module.world_size, + out_features=module.weight.shape[1], + has_bias=module.bias is not None, + input_is_parallel=module.input_is_parallel, + r=lora_config.r, + lora_alpha=lora_config.lora_alpha, + lora_dropout=lora_config.lora_dropout, + rslora=lora_config.rslora, + lora_plus_scale=lora_config.lora_plus_scale, + merge_weights=lora_config.merge_weights, + use_quick_lora=lora_config.use_quick_lora, + ) + # Lora column parallel will spilt lora A matrix + self.add_lora_split_mapping(module_name + ".lora_A", is_column=False) + # for lora qat if self.lora_config.do_qat: self.add_lora_split_mapping(module_name + ".weight_quanter._scale", is_column=False) @@ -539,7 +602,7 @@ def _find_and_replace_module(self, model, module_name, lora_config, enable_lora) ) if lora_module is None: raise ValueError( - f"LoRA strategy only supports 
paddle.nn.Linear or paddle.distributed.fleet.meta_parallel.ColumnParallelLinear. {module}({module_name}) is not supported。" + f"LoRA strategy only supports paddle.nn.Linear or paddle.distributed.fleet.meta_parallel.ColumnParallelLinear or paddlenlp.transformers.sequence_utils. {module}({module_name} {type(module).__name__}) is not supported。" ) if getattr(lora_module, "quant_weight", None) is not None: lora_module.quant_weight = module.quant_weight @@ -597,6 +660,8 @@ def mark_only_lora_as_trainable(self) -> None: or isinstance(layer, LoRAConv2D) or isinstance(layer, ColumnParallelLoRALinear) or isinstance(layer, RowParallelLoRALinear) + or isinstance(layer, ColumnSequenceParallelLoRALinear) + or isinstance(layer, RowSequenceParallelLoRALinear) or isinstance(layer, LoRAMergedLinear) or isinstance(layer, ColumnParallelLoRAMergedLinear) or (QuantizationLoRALinear is not None and isinstance(layer, QuantizationLoRALinear)) @@ -684,9 +749,11 @@ def restore_original_model(self): self._find_and_restore_module(layer_name) elif ( isinstance(layer, ColumnParallelLoRALinear) + or isinstance(layer, ColumnSequenceParallelLoRALinear) or isinstance(layer, LoRAConv2D) or isinstance(layer, ColumnParallelLoRAMergedLinear) or isinstance(layer, RowParallelLoRALinear) + or isinstance(layer, RowSequenceParallelLoRALinear) or (QuantizationLoRALinear is not None and isinstance(layer, QuantizationLoRALinear)) or ( ColumnParallelQuantizationLoRALinear is not None diff --git a/paddlenlp/peft/lora/mc2_lora_npu.py b/paddlenlp/peft/lora/mc2_lora_npu.py deleted file mode 100644 index 7ae47b1496f7..000000000000 --- a/paddlenlp/peft/lora/mc2_lora_npu.py +++ /dev/null @@ -1,80 +0,0 @@ -# !/usr/bin/env python3 - -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
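
`mark_only_lora_as_trainable` above freezes everything except the injected adapter layers, now including the two sequence-parallel variants. The effect, reduced to a name-based sketch (an illustration of the idea, not the LoRAModel implementation):

import paddle.nn as nn

def freeze_all_but_lora(model: nn.Layer) -> None:
    for name, param in model.named_parameters():
        # only the injected lora_A / lora_B matrices keep gradients
        param.stop_gradient = not ("lora_A" in name or "lora_B" in name)

# freeze_all_but_lora(lora_model)  # afterwards only adapter weights are updated
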
- -""" mc2(tp overlap) """ - -import paddle -import paddle_custom_device -from paddle.autograd import PyLayer - - -class MC2LoRaRowParallelLinear(PyLayer): - @staticmethod - def forward(ctx, input_, weight, group): - ctx.save_for_backward(input_, weight) - rank = paddle.distributed.get_rank() - hcom_name = group.process_group.get_comm_name(rank) - x = input_.reshape([-1, input_.shape[-1]]) - out = paddle_custom_device.npu.fused_mm_allreduce( - x, weight, bias=None, hcom=hcom_name, reduce_op="sum", comm_turn=0 - ) - output = out.reshape([input_.shape[0], input_.shape[1], weight.shape[1]]) - ctx.ring_id = group.id - return output - - @staticmethod - def backward(ctx, dy): - input_, weight = ctx.saved_tensor() - out_grad = dy - sub_grad = out_grad.reshape([-1, out_grad.shape[-1]]) - input_grad = paddle.matmul(sub_grad, weight, transpose_y=True) - if weight.stop_gradient: - return input_grad.reshape(input_.shape) - else: - input_reshape = input_.reshape([-1, input_.shape[-1]]) - weight_grad = paddle.matmul(input_reshape, sub_grad, transpose_x=True) - return input_grad.reshape(input_.shape), weight_grad - - -class MC2LoRaColumnParallelLinear(PyLayer): - @staticmethod - def forward(ctx, input_, weight, group): - ctx.save_for_backward(input_, weight) - ctx.group = group - input_mp = input_ - result_mp = paddle.matmul(input_mp, weight) - return result_mp - - @staticmethod - def backward(ctx, dy): - input_, weight = ctx.saved_tensor() - sub_grad = dy.reshape([-1, dy.shape[-1]]) - rank = paddle.distributed.get_rank() - hcom_name = ctx.group.process_group.get_comm_name(rank) - - d_weight = ( - paddle.matmul(input_.reshape([-1, input_.shape[-1]]), sub_grad, transpose_x=True) - if not weight.stop_gradient - else None - ) - d_input = paddle_custom_device.npu.fused_mm_allreduce( - sub_grad, weight.t(), bias=None, hcom=hcom_name, reduce_op="sum", comm_turn=0 - ) - - if d_weight is not None: - return d_input.reshape(input_.shape), d_weight - else: - return d_input.reshape(input_.shape) diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index 5cb13f7aa61a..38f1d244bdf2 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -62,6 +62,10 @@ def swiglu(x, y=None): init_name_mappings, ) from paddlenlp.transformers.long_sequence_strategies import LongSequenceStrategies +from paddlenlp.transformers.mc2_parallel_linear import ( + MC2ColumnSeqParallelLinear, + MC2RowSeqParallelLinear, +) from paddlenlp.transformers.model_outputs import ( BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions, @@ -96,13 +100,6 @@ def swiglu(x, y=None): ] -def is_mc2_valid(): - current_device = get_env_device() - if current_device == "npu": - return True - return False - - def _get_interleave(n): def _get_interleave_power_of_2(n): start = 2 ** (-(2 ** -(math.log2(n) - 3))) @@ -574,12 +571,7 @@ def __init__(self, config): self.fuse_attention_ffn = config.fuse_attention_ffn if config.sequence_parallel: - if is_mc2_valid and int(os.getenv("FLAGS_NPU_MC2", 0)): - from paddlenlp.transformers.mc2_seqence_parallel_linear import ( - MC2ColumnSeqParallelLinear, - MC2RowSeqParallelLinear, - ) - + if MC2ColumnSeqParallelLinear is not None and MC2RowSeqParallelLinear is not None: ColumnParallelLinear = MC2ColumnSeqParallelLinear RowParallelLinear = MC2RowSeqParallelLinear else: @@ -697,12 +689,7 @@ def __init__(self, config: LlamaConfig, layerwise_recompute: bool = False): self.use_fused_rope = False if config.sequence_parallel: - 
if is_mc2_valid and int(os.getenv("FLAGS_NPU_MC2", 0)): - from paddlenlp.transformers.mc2_seqence_parallel_linear import ( - MC2ColumnSeqParallelLinear, - MC2RowSeqParallelLinear, - ) - + if MC2ColumnSeqParallelLinear is not None and MC2RowSeqParallelLinear is not None: ColumnParallelLinear = MC2ColumnSeqParallelLinear RowParallelLinear = MC2RowSeqParallelLinear else: diff --git a/paddlenlp/transformers/mc2_parallel_linear.py b/paddlenlp/transformers/mc2_parallel_linear.py new file mode 100644 index 000000000000..066e8074e21f --- /dev/null +++ b/paddlenlp/transformers/mc2_parallel_linear.py @@ -0,0 +1,230 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import paddle + +try: + import paddle_custom_device +except ImportError: + pass + +from paddle import distributed as dist +from paddle.autograd import PyLayer + +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + ColumnSequenceParallelLinear, + RowSequenceParallelLinear, + ) +except: + pass +from paddlenlp.utils.tools import get_env_device + +__all_gather_recomputation__ = False +if int(os.getenv("MC2_Recompute", 0)): + __all_gather_recomputation__ = True + + +def is_mc2_valid(): + current_device = get_env_device() + if current_device == "npu": + return int(os.getenv("MC2", 0)) + return 0 + + +if is_mc2_valid(): + + class MC2ColumnParallelCoreLinear(PyLayer): + @staticmethod + def forward(ctx, input_, weight, group): + ctx.save_for_backward(input_, weight) + ctx.group = group + input_mp = input_ + result_mp = paddle.matmul(input_mp, weight) + return result_mp + + @staticmethod + def backward(ctx, dy): + input_, weight = ctx.saved_tensor() + sub_grad = dy.reshape([-1, dy.shape[-1]]) + rank = paddle.distributed.get_rank() + hcom_name = ctx.group.process_group.get_comm_name(rank) + + d_weight = ( + paddle.matmul(input_.reshape([-1, input_.shape[-1]]), sub_grad, transpose_x=True) + if not weight.stop_gradient + else None + ) + d_input = paddle_custom_device.npu.fused_mm_allreduce( + sub_grad, weight.t(), bias=None, hcom=hcom_name, reduce_op="sum", comm_turn=0 + ) + + if d_weight is not None: + return d_input.reshape(input_.shape), d_weight + else: + return d_input.reshape(input_.shape), None + + class MC2RowParallelCoreLinear(PyLayer): + @staticmethod + def forward(ctx, input_, weight, group): + ctx.save_for_backward(input_, weight) + rank = paddle.distributed.get_rank() + hcom_name = group.process_group.get_comm_name(rank) + x = input_.reshape([-1, input_.shape[-1]]) + out = paddle_custom_device.npu.fused_mm_allreduce( + x, weight, bias=None, hcom=hcom_name, reduce_op="sum", comm_turn=0 + ) + output = out.reshape([input_.shape[0], input_.shape[1], weight.shape[1]]) + ctx.ring_id = group.id + return output + + @staticmethod + def backward(ctx, dy): + input_, weight = ctx.saved_tensor() + out_grad = dy + sub_grad = out_grad.reshape([-1, out_grad.shape[-1]]) + input_grad = paddle.matmul(sub_grad, weight, transpose_y=True) + if 
weight.stop_gradient: + return input_grad.reshape(input_.shape), None + else: + input_reshape = input_.reshape([-1, input_.shape[-1]]) + weight_grad = paddle.matmul(input_reshape, sub_grad, transpose_x=True) + return input_grad.reshape(input_.shape), weight_grad + + class MC2ColumnSeqParallelCoreLinear(PyLayer): + @staticmethod + def forward(ctx, input_, weight, group): + ctx.weight_stop_gradient = weight.stop_gradient + ctx.save_for_backward(input_, weight) + + rank = dist.get_rank() + hcomm_info = group.process_group.get_comm_name(rank) + + world_size = group.nranks + output, gather_out = paddle_custom_device.npu.fused_allgather_mm( + input_, + weight, + bias=None, + hcom=hcomm_info, + world_size=world_size, + gather_index=0, + gather_output=(not __all_gather_recomputation__), + comm_turn=0, + ) + + ctx.all_gather_output = gather_out + ctx.world_size = world_size + ctx.group = group + return output + + @staticmethod + def backward(ctx, grad_output): + input_, weight = ctx.saved_tensor() + + if __all_gather_recomputation__: + dim_size = input_.shape + dim_size[0] = dim_size[0] * ctx.world_size + all_gather_output = paddle.empty(dim_size, dtype=input_.dtype) + all_gather_output.stop_gradient = True + all_gather_work = dist.stream.all_gather(all_gather_output, input_, group=ctx.group, sync_op=False) + else: + all_gather_output = ctx.all_gather_output + + grad_input = paddle.matmul(grad_output, weight, transpose_y=True) + sub_grad_input = paddle.empty(input_.shape, dtype=input_.dtype) + reduce_scatter_work = dist.stream.reduce_scatter( + sub_grad_input, grad_input, group=ctx.group, sync_op=False + ) + + if __all_gather_recomputation__: + all_gather_work.wait() + + grad_weight = ( + paddle.matmul(all_gather_output, grad_output, transpose_x=True) + if not ctx.weight_stop_gradient + else None + ) + reduce_scatter_work.wait() + + return sub_grad_input, grad_weight + + class MC2RowSeqParallelCoreLinear(PyLayer): + @staticmethod + def forward(ctx, input_, weight, group): + ctx.weight_stop_gradient = weight.stop_gradient + ctx.save_for_backward(input_, weight) + + rank = dist.get_rank() + hcomm_info = group.process_group.get_comm_name(rank) + world_size = group.nranks + + output = paddle_custom_device.npu.fused_mm_reduce_scatter( + input_, + weight, + bias=None, + hcom=hcomm_info, + world_size=world_size, + reduce_op="sum", + comm_turn=0, + ) + + ctx.hcomm_info = hcomm_info + ctx.world_size = world_size + return output + + @staticmethod + def backward(ctx, grad_output): + input_, weight = ctx.saved_tensor() + hcomm_info = ctx.hcomm_info + world_size = ctx.world_size + + grad_input, all_gather_grad_output = paddle_custom_device.npu.fused_allgather_mm( + grad_output, + weight.t(), + bias=None, + hcom=hcomm_info, + world_size=world_size, + gather_index=0, + gather_output=True, + comm_turn=0, + ) + grad_weight = ( + paddle.matmul(input_, all_gather_grad_output, transpose_x=True) + if not ctx.weight_stop_gradient + else None + ) + + return grad_input, grad_weight + + class MC2ColumnSeqParallelLinear(ColumnSequenceParallelLinear): + def forward(self, x): + output = MC2ColumnSeqParallelCoreLinear.apply(x, self.weight, self.model_parallel_group) + output = output + self.bias if self.bias is not None else output + return output + + class MC2RowSeqParallelLinear(RowSequenceParallelLinear): + def forward(self, x): + output = MC2RowSeqParallelCoreLinear.apply(x, self.weight, self.model_parallel_group) + output = output + self.bias if self.bias is not None else output + return output + +else: + 
MC2ColumnSeqParallelCoreLinear = None + MC2RowSeqParallelCoreLinear = None + MC2ColumnSeqParallelLinear = None + MC2RowSeqParallelLinear = None + MC2ColumnParallelCoreLinear = None + MC2RowParallelCoreLinear = None diff --git a/paddlenlp/transformers/mc2_seqence_parallel_linear.py b/paddlenlp/transformers/mc2_seqence_parallel_linear.py deleted file mode 100644 index c39a78cc6252..000000000000 --- a/paddlenlp/transformers/mc2_seqence_parallel_linear.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import paddle - -try: - import paddle_custom_device -except ImportError: - raise ImportError("Current device does not support MC2!") - -from paddle import distributed as dist -from paddle.autograd import PyLayer - -try: - from paddle.distributed.fleet.utils.sequence_parallel_utils import ( - ColumnSequenceParallelLinear, - RowSequenceParallelLinear, - ) -except: - pass - -__all_gather_recomputation__ = False -if int(os.getenv("MC2_Recompute", 0)): - __all_gather_recomputation__ = True - - -class MC2Column(PyLayer): - @staticmethod - def forward(ctx, input_, weight, group): - ctx.save_for_backward(input_, weight) - - rank = dist.get_rank() - hcomm_info = group.process_group.get_comm_name(rank) - - world_size = group.nranks - output, gather_out = paddle_custom_device.npu.fused_allgather_mm( - input_, - weight, - bias=None, - hcom=hcomm_info, - world_size=world_size, - gather_index=0, - gather_output=(not __all_gather_recomputation__), - comm_turn=0, - ) - - ctx.all_gather_output = gather_out - ctx.world_size = world_size - ctx.group = group - return output - - @staticmethod - def backward(ctx, grad_output): - input_, weight = ctx.saved_tensor() - - if __all_gather_recomputation__: - dim_size = input_.shape - dim_size[0] = dim_size[0] * ctx.world_size - all_gather_output = paddle.empty(dim_size, dtype=input_.dtype) - all_gather_output.stop_gradient = True - all_gather_work = dist.stream.all_gather(all_gather_output, input_, group=ctx.group, sync_op=False) - else: - all_gather_output = ctx.all_gather_output - - grad_input = paddle.matmul(grad_output, weight, transpose_y=True) - sub_grad_input = paddle.empty(input_.shape, dtype=input_.dtype) - reduce_scatter_work = dist.stream.reduce_scatter(sub_grad_input, grad_input, group=ctx.group, sync_op=False) - - if __all_gather_recomputation__: - all_gather_work.wait() - - grad_weight = paddle.matmul(all_gather_output, grad_output, transpose_x=True) - reduce_scatter_work.wait() - - return sub_grad_input, grad_weight - - -class MC2Row(PyLayer): - @staticmethod - def forward(ctx, input_, weight, group): - ctx.save_for_backward(input_, weight) - - rank = dist.get_rank() - hcomm_info = group.process_group.get_comm_name(rank) - world_size = group.nranks - - output = paddle_custom_device.npu.fused_mm_reduce_scatter( - input_, - weight, - bias=None, - hcom=hcomm_info, - world_size=world_size, - reduce_op="sum", - comm_turn=0, - ) - - ctx.hcomm_info = 
hcomm_info - ctx.world_size = world_size - return output - - @staticmethod - def backward(ctx, grad_output): - input_, weight = ctx.saved_tensor() - hcomm_info = ctx.hcomm_info - world_size = ctx.world_size - - grad_input, all_gather_grad_output = paddle_custom_device.npu.fused_allgather_mm( - grad_output, - weight.t(), - bias=None, - hcom=hcomm_info, - world_size=world_size, - gather_index=0, - gather_output=True, - comm_turn=0, - ) - grad_weight = paddle.matmul(input_, all_gather_grad_output, transpose_x=True) - - return grad_input, grad_weight - - -class MC2ColumnSeqParallelLinear(ColumnSequenceParallelLinear): - def forward(self, x): - output = MC2Column.apply(x, self.weight, self.model_parallel_group) - output = output + self.bias if self.bias is not None else output - return output - - -class MC2RowSeqParallelLinear(RowSequenceParallelLinear): - def forward(self, x): - output = MC2Row.apply(x, self.weight, self.model_parallel_group) - output = output + self.bias if self.bias is not None else output - return output From 3105c18b013e1cdcbf860af1c6c54f4e33c88ee7 Mon Sep 17 00:00:00 2001 From: yujun <573009727@qq.com> Date: Wed, 24 Apr 2024 15:10:53 +0800 Subject: [PATCH 10/27] fix 0f428bbe47daed3cd861f7047c3e9acbec4ea0b1 try import --- paddlenlp/peft/lora/lora_layers.py | 11 ++++++++++- paddlenlp/peft/lora/lora_model.py | 29 ++++++++++++++++++----------- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/paddlenlp/peft/lora/lora_layers.py b/paddlenlp/peft/lora/lora_layers.py index a31f7c3a33b1..7ac40ed0ba66 100644 --- a/paddlenlp/peft/lora/lora_layers.py +++ b/paddlenlp/peft/lora/lora_layers.py @@ -33,7 +33,16 @@ mark_as_sequence_parallel_parameter, ) except: - pass + AllGatherOp = None + ReduceScatterOp = None + mark_as_sequence_parallel_parameter = None + + class ColumnSequenceParallelLinear: + pass + + class RowSequenceParallelLinear: + pass + from paddlenlp.transformers.mc2_parallel_linear import ( MC2ColumnParallelCoreLinear, diff --git a/paddlenlp/peft/lora/lora_model.py b/paddlenlp/peft/lora/lora_model.py index 41ab1e681e24..ebadf39a6a55 100644 --- a/paddlenlp/peft/lora/lora_model.py +++ b/paddlenlp/peft/lora/lora_model.py @@ -52,18 +52,25 @@ RowSequenceParallelLinear, ) - from .lora_layers import ( - ColumnParallelLoRALinear, - ColumnParallelLoRAMergedLinear, - ColumnSequenceParallelLoRALinear, - LoRAConv2D, - LoRALinear, - LoRAMergedLinear, - RowParallelLoRALinear, - RowSequenceParallelLoRALinear, - ) except: - pass + + class ColumnSequenceParallelLinear: + pass + + class RowSequenceParallelLinear: + pass + + +from .lora_layers import ( + ColumnParallelLoRALinear, + ColumnParallelLoRAMergedLinear, + ColumnSequenceParallelLoRALinear, + LoRAConv2D, + LoRALinear, + LoRAMergedLinear, + RowParallelLoRALinear, + RowSequenceParallelLoRALinear, +) try: from ...quantization.quantization_linear import ( From 89daaa31776dcf22d34dcf2d830d82916452da20 Mon Sep 17 00:00:00 2001 From: Siming Dai <908660116@qq.com> Date: Fri, 26 Apr 2024 17:59:25 +0800 Subject: [PATCH 11/27] [Trainer] Fix sharding overlap bug (#8334) --- paddlenlp/trainer/training_args.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/paddlenlp/trainer/training_args.py b/paddlenlp/trainer/training_args.py index 8ebd218447fc..2ed9d343ceaa 100644 --- a/paddlenlp/trainer/training_args.py +++ b/paddlenlp/trainer/training_args.py @@ -1020,6 +1020,11 @@ def __post_init__(self): enable_dp_comm_overlap and enable_sharding_comm_overlap ), "dp_comm_overlap and sharding_comm_overlap cannot be enabled at the same time" 
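
The TrainingArguments change above adds a dependency check on top of the existing mutual-exclusion assert. Roughly, in isolation (flag names follow the patch, the standalone helper is illustrative):

def check_pp_comm_overlap(pipeline_parallel_config: str, amp_master_grad: bool) -> None:
    enable_dp_comm_overlap = "enable_dp_comm_overlap" in pipeline_parallel_config
    enable_sharding_comm_overlap = "enable_sharding_comm_overlap" in pipeline_parallel_config
    assert not (
        enable_dp_comm_overlap and enable_sharding_comm_overlap
    ), "dp_comm_overlap and sharding_comm_overlap cannot be enabled at the same time"
    if enable_sharding_comm_overlap and not amp_master_grad:
        raise ValueError(
            "If `enable_sharding_comm_overlap` in pipeline_parallel_configs, "
            "`amp_master_grad` must be True."
        )
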
+ if enable_sharding_comm_overlap and not self.amp_master_grad: + raise ValueError( + "If `enable_sharding_comm_overlap` in pipeline_parallel_configs, `amp_master_grad` must be True." + ) + dygraph_pp_configs = { "delay_scale_loss": True if "enable_delay_scale_loss" in pipeline_parallel_config else False, "dp_comm_overlap": enable_dp_comm_overlap, From 27d0e60cf7bcabce547b80b34c586bdf46d972a9 Mon Sep 17 00:00:00 2001 From: Kunbo Ding Date: Tue, 7 May 2024 17:52:04 +0800 Subject: [PATCH 12/27] Remove truncate (#8375) Remove truncate --- paddlenlp/trainer/trainer.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py index 3e8fc333fe95..419349e02d21 100644 --- a/paddlenlp/trainer/trainer.py +++ b/paddlenlp/trainer/trainer.py @@ -2721,11 +2721,15 @@ def evaluation_loop( # Number of losses has been rounded to a multiple of batch_size and in a distributed training, the number of # samplers has been rounded to a multiple of batch_size, so we truncate. if all_losses is not None: - all_losses = all_losses[:num_samples] + all_losses = all_losses[: num_samples * int(self.args.world_size / self.args.dataset_world_size)] if all_preds is not None: - all_preds = nested_truncate(all_preds, num_samples) + all_preds = nested_truncate( + all_preds, num_samples * int(self.args.world_size / self.args.dataset_world_size) + ) if all_labels is not None: - all_labels = nested_truncate(all_labels, num_samples) + all_labels = nested_truncate( + all_labels, num_samples * int(self.args.world_size / self.args.dataset_world_size) + ) model.train() From 9e4a4f473322f53e647b43b9031568a705e48080 Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Thu, 9 May 2024 11:22:23 +0800 Subject: [PATCH 13/27] Fix llama3 eot id. 
(#8373) --- paddlenlp/transformers/llama/tokenizer.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/paddlenlp/transformers/llama/tokenizer.py b/paddlenlp/transformers/llama/tokenizer.py index 6f19530c05cb..46c16c58b427 100644 --- a/paddlenlp/transformers/llama/tokenizer.py +++ b/paddlenlp/transformers/llama/tokenizer.py @@ -295,11 +295,12 @@ def _pad( ENDOFTEXT = "<|end_of_text|>" IMSTART = "<|start_header_id|>" IMEND = "<|end_header_id|>" +EOTID = "<|eot_id|>" # as the default behavior is changed to allow special tokens in # regular texts, the surface forms of special tokens need to be # as different as possible to minimize the impact -EXTRAS = tuple((f"<|reserved_special_token_{i}|>" for i in range(250))) -SPECIAL_TOKENS = (BEGINOFTEXT, ENDOFTEXT) + EXTRAS[0:4] + (IMSTART, IMEND) + EXTRAS[4:] +EXTRAS = tuple((f"<|reserved_special_token_{i}|>" for i in range(251))) +SPECIAL_TOKENS = (BEGINOFTEXT, ENDOFTEXT) + EXTRAS[0:4] + (IMSTART, IMEND, EXTRAS[4], EOTID) + EXTRAS[5:] tiktoken = None @@ -354,9 +355,11 @@ def __init__( self.tokenizer = enc # type: tiktoken.Encoding + self.bod_id = self.special_tokens[BEGINOFTEXT] self.eod_id = self.special_tokens[ENDOFTEXT] self.start_header_id = self.special_tokens[IMSTART] self.end_header_id = self.special_tokens[IMEND] + self.eot_id = self.special_tokens[EOTID] if "pad_token_id" in kwargs: self.pad_token_id = kwargs["pad_token_id"] From debb2ad92d08825f553818e88b971245b50d2433 Mon Sep 17 00:00:00 2001 From: Siming Dai <908660116@qq.com> Date: Mon, 13 May 2024 11:19:28 +0800 Subject: [PATCH 14/27] [Trainer] update distributed dataloader (#8426) * [DistDataloader] Update implementation, add nested.py (#8380) * fix distdataloader, fix eval with dp group (#8420) --- paddlenlp/data/dist_dataloader.py | 193 ++++++------------ .../trainer/plugins/unified_checkpoint.py | 21 +- paddlenlp/trainer/trainer.py | 88 +++++--- paddlenlp/trainer/utils/helper.py | 53 +---- paddlenlp/utils/nested.py | 83 ++++++++ 5 files changed, 216 insertions(+), 222 deletions(-) create mode 100644 paddlenlp/utils/nested.py diff --git a/paddlenlp/data/dist_dataloader.py b/paddlenlp/data/dist_dataloader.py index e97cc60c84a8..5d5c6cc7512c 100644 --- a/paddlenlp/data/dist_dataloader.py +++ b/paddlenlp/data/dist_dataloader.py @@ -12,13 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np import paddle from paddle.distributed import fleet from paddlenlp.utils.log import logger - -_MAX_DATA_DIM = 64 +from paddlenlp.utils.nested import ( + nested_broadcast_tensor, + nested_copy_place, + nested_empty_tensor, + nested_reduce_tensor, +) class DummyDataset(paddle.io.Dataset): @@ -53,6 +56,7 @@ def __init__( timeout=0, worker_init_fn=None, persistent_workers=False, + eval=False, ): if dataset is None: @@ -62,12 +66,15 @@ def __init__( super().__init__(dataset=dataset, batch_sampler=batch_sampler, collate_fn=collate_fn, num_workers=num_workers) self._hcg = fleet.get_hybrid_communicate_group() + self.eval = eval # Init pp data comm group. 
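
With `<|eot_id|>` inserted right after the header tokens and reserved token 4, the special-token block lines up with the ids Meta ships for Llama 3. A quick check of that layout (the 128000 offset is the size of the BPE rank table and is assumed here, it is not part of this diff):

BEGINOFTEXT = "<|begin_of_text|>"
ENDOFTEXT = "<|end_of_text|>"
IMSTART = "<|start_header_id|>"
IMEND = "<|end_header_id|>"
EOTID = "<|eot_id|>"
EXTRAS = tuple(f"<|reserved_special_token_{i}|>" for i in range(251))
SPECIAL_TOKENS = (BEGINOFTEXT, ENDOFTEXT) + EXTRAS[0:4] + (IMSTART, IMEND, EXTRAS[4], EOTID) + EXTRAS[5:]

special_tokens = {tok: 128000 + i for i, tok in enumerate(SPECIAL_TOKENS)}
assert special_tokens[IMSTART] == 128006  # <|start_header_id|>
assert special_tokens[IMEND] == 128007    # <|end_header_id|>
assert special_tokens[EOTID] == 128009    # <|eot_id|>
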
if self._hcg.get_pipe_parallel_world_size() > 1: self._pp_data_group = self._init_dataloader_comm_group() + self._pp_group = self._hcg.get_pipe_parallel_group() else: self._pp_data_group = None + self._pp_group = None self.mp_group = self._hcg.get_model_parallel_group() self.mp_rank = self._hcg.get_model_parallel_rank() @@ -78,10 +85,6 @@ def __init__( sharding_rank = self._hcg.get_sharding_parallel_rank() self._need_data = (self.mp_rank == 0) and (self.pp_rank == 0) - # When needed other data types, we can modify dtype_list. - self.dtype_list = [paddle.int64, paddle.float32, paddle.int32] - self._data_keys_list, self._data_keys_size = None, None - if self._need_data: self._dataloader = paddle.io.DataLoader( dataset, @@ -127,7 +130,6 @@ def _init_dataloader_comm_group(self): parallel_groups = topo.get_comm_list("pipe") for group in parallel_groups: - # only first rank and last rank ranks = [group[0], group[-1]] comm_group = paddle.distributed.new_group(ranks=ranks) if paddle.distributed.get_rank() in ranks: @@ -137,127 +139,68 @@ def _init_dataloader_comm_group(self): def __iter__(self): return self - def __next__(self): - data_keys_size = [0 for i in range(len(self.dtype_list))] - if self._need_data: - data = next(self._dataloader_iter) - data_keys = list(data.keys()) - - for key in data_keys: - if data[key].dtype not in self.dtype_list: - raise ValueError( - f"Dist dataloader requires dtype as `int64`, `float32` or `int32` currently, but got: {data[key].dtype}" + def _broadcast_data(self, data): + process_rank = paddle.distributed.get_rank() + if self.mp_group.nranks > 1: + if process_rank == self.mp_src_rank: + fake_data = [nested_reduce_tensor(data)] + else: + if data is not None: + logger.warning( + f"Your local rank {paddle.distributed.get_rank()} are forbidden to have a state_dict." ) - - data_list, data_keys_list = [], [] - for i, dtype in enumerate(self.dtype_list): - data_list.append([data[key] for key in data_keys if data[key].dtype == dtype]) - data_keys_list.append([key for key in data_keys if data[key].dtype == dtype]) - data_keys_size = [len(keys) for keys in data_keys_list] - - # Broadcast data keys size. - if self._data_keys_size is None: - if self.mp_group.nranks > 1 and self.pp_rank == 0: - paddle.distributed.broadcast_object_list(data_keys_size, src=self.mp_src_rank, group=self.mp_group) - if self._pp_data_group is not None: - paddle.distributed.broadcast_object_list( - data_keys_size, src=self._pp_data_group.ranks[0], group=self._pp_data_group - ) - self._data_keys_size = data_keys_size - - if not self._need_data: - data_keys_list = [[None for i in range(keys_size)] for keys_size in self._data_keys_size] - - # Broadcast data keys name. - if self._data_keys_list is None: - if self.mp_group.nranks > 1 and self.pp_rank == 0: - paddle.distributed.broadcast_object_list(data_keys_list, src=self.mp_src_rank, group=self.mp_group) - if self._pp_data_group is not None: - paddle.distributed.broadcast_object_list( - data_keys_list, src=self._pp_data_group.ranks[0], group=self._pp_data_group - ) - self._data_keys_list = data_keys_list - - # Broadcast data. 
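
Only the first and the last pipeline stage actually consume batches, which is why `_init_dataloader_comm_group` builds a two-rank group per pipeline. The same logic pulled out of the class (assumes fleet has already been initialized with a hybrid topology):

import paddle
from paddle.distributed import fleet

def init_pp_data_comm_group():
    hcg = fleet.get_hybrid_communicate_group()
    topo = hcg.topology()
    comm_group = None
    for group in topo.get_comm_list("pipe"):
        ranks = [group[0], group[-1]]                # first and last pp stage only
        new_group = paddle.distributed.new_group(ranks=ranks)
        if paddle.distributed.get_rank() in ranks:   # keep the group this rank belongs to
            comm_group = new_group
    return comm_group
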
- if not self._need_data: - data_list = [[None for i in range(keys_size)] for keys_size in self._data_keys_size] - - if self.mp_group.nranks > 1 and self.pp_rank == 0: - for i, dtype in enumerate(self.dtype_list): - if self._data_keys_size[i] > 0: - data_list[i] = broadcast_data_list( - data_list[i], dtype, self.mp_rank, self.mp_group, self.mp_src_rank + fake_data = [None] + if self._pp_group is not None: + if process_rank == self._pp_group.ranks[0]: + fake_data = [nested_reduce_tensor(data)] + else: + if data is not None: + logger.warning( + f"Your local rank {paddle.distributed.get_rank()} are forbidden to have a state_dict." ) + fake_data = [None] + if self.mp_group.nranks > 1 and self.pp_rank == 0: + paddle.distributed.broadcast_object_list( + fake_data, + src=self.mp_src_rank, + group=self.mp_group, + ) + if self._pp_group is not None: + paddle.distributed.broadcast_object_list( + fake_data, + src=self._pp_group.ranks[0], + group=self._pp_group, + ) - if self._pp_data_group is not None: - # Note(daisimng): In last stage of pp, we don't need input_ids. - # It will be removed in future. - for i, dtype in enumerate(self.dtype_list): - if self._data_keys_size[i] > 0: - data_list[i] = broadcast_data_list( - data_list[i], - dtype, - self.pp_rank, - self._pp_data_group, - self._pp_data_group.ranks[0], - ) - - out_data = {} - for keys, datas in zip(self._data_keys_list, data_list): - out_data.update([(k, d) for k, d in zip(keys, datas)]) - - return out_data - - -def broadcast_data_list(data_list, datatype, comm_rank=0, comm_group=None, src_rank=0): - """ - Broadcast data from src_rank to all ranks in comm_group. - """ - # Move to GPU and broadcast. - size_cpu = [] - if comm_rank == 0: - for data in data_list: - size_cpu.append(len(data.shape)) - size_cpu += data.shape - size_cpu = size_cpu + [0] * (_MAX_DATA_DIM - len(size_cpu)) - size_cuda = paddle.to_tensor(size_cpu) - paddle.distributed.broadcast(size_cuda, src_rank, group=comm_group).wait() - - size_cpu = size_cuda.tolist() - i = 0 - numel = 0 - sizes = [] - while size_cpu[i] > 0: - rank = size_cpu[i] - this_size = size_cpu[i + 1 : i + 1 + rank] - numel += int(np.prod(this_size)) - sizes.append(this_size) - i += rank + 1 - - if comm_rank == 0: - assert data.dtype == datatype, "input has data type {} which " "is different than {}".format( - data.dtype, datatype - ) - if paddle.is_compiled_with_cuda(): - data_b = paddle.concat([d.cuda().reshape([-1]) for d in data_list], 0) - else: - data_b = paddle.concat([d.reshape([-1]) for d in data_list], 0) + fake_data = fake_data[0] + if fake_data is None: + raise StopIteration - assert numel == sum([d.numel().item() for d in data_list]), (numel, [d.numel().item() for d in data_list]) - else: - if paddle.is_compiled_with_cuda(): - data_b = paddle.empty([numel], dtype=datatype).cuda() - else: - data_b = paddle.empty([numel], dtype=datatype) + dst_pp_group = self._pp_group if self.eval else self._pp_data_group + if self.mp_group.nranks > 1: + if process_rank != self.mp_src_rank: + data = nested_empty_tensor(fake_data) + if dst_pp_group is not None: + if process_rank != dst_pp_group.ranks[0]: + data = nested_empty_tensor(fake_data) - # Broadcast - paddle.distributed.broadcast(data_b, src_rank, group=comm_group).wait() + if self.mp_group.nranks > 1 and self.pp_rank == 0: + data = nested_broadcast_tensor(data, src=self.mp_src_rank, group=self.mp_group) + if dst_pp_group is not None: + data = nested_broadcast_tensor(data, src=dst_pp_group.ranks[0], group=dst_pp_group) + # for pp1 - pp_{n-1}, Paddle need to 
recevie empty dict for pipeline parallel. + if data is None: + data = {} - ret = [] - offset = 0 - for size in sizes: - numel = int(np.prod(size)) - ret.append(data_b[offset : offset + numel].reshape(size)) - offset += numel + return data - return ret + def __next__(self): + data = None + if self._need_data: + try: + data = next(self._dataloader_iter) + data = nested_copy_place(data, place=paddle.framework._current_expected_place()) + except: + pass + data = self._broadcast_data(data) + return data diff --git a/paddlenlp/trainer/plugins/unified_checkpoint.py b/paddlenlp/trainer/plugins/unified_checkpoint.py index f8b62a15b77e..9a14ebba2882 100644 --- a/paddlenlp/trainer/plugins/unified_checkpoint.py +++ b/paddlenlp/trainer/plugins/unified_checkpoint.py @@ -62,6 +62,7 @@ SAFE_WEIGHTS_NAME, ) from paddlenlp.utils.log import logger +from paddlenlp.utils.nested import nested_copy, nested_copy_place if is_safetensors_available(): from safetensors import safe_open @@ -1876,26 +1877,6 @@ def mapping_optimizer_tp_actions(tp_actions, optimizer_loaded_keys): return new_actions -def nested_copy(inputs): - if isinstance(inputs, dict): - outputs = {} - for key in list(inputs.keys()): - outputs[key] = nested_copy(inputs[key]) - return outputs - return inputs - - -def nested_copy_place(inputs, place=None, blocking=False): - if isinstance(inputs, dict): - outputs = {} - for key in list(inputs.keys()): - outputs[key] = nested_copy_place(inputs[key], place, blocking) - return outputs - if isinstance(inputs, paddle.Tensor): - inputs = inputs if inputs.place == place else inputs._copy_to(place, blocking) - return inputs - - def flatten_list(nested_list): flattened_list = [] for item in nested_list: diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py index 419349e02d21..bf83420acf85 100644 --- a/paddlenlp/trainer/trainer.py +++ b/paddlenlp/trainer/trainer.py @@ -1419,8 +1419,6 @@ def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoa if is_datasets_available() and eval_dataset is not None and isinstance(eval_dataset, datasets.Dataset): eval_dataset = self._remove_unused_columns(eval_dataset, description="evaluation") - _DataLoader = DistDataLoader if self.args.distributed_dataloader else DataLoader - if self._is_iterable_dataset(eval_dataset): if self.args.dataset_world_size > 1: eval_dataset = IterableDatasetShard( @@ -1431,24 +1429,41 @@ def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoa process_index=self.args.dataset_rank, ) - return _DataLoader( - eval_dataset, - batch_size=self.args.per_device_eval_batch_size, - collate_fn=self.data_collator, - num_workers=self.args.dataloader_num_workers, - ) + if self.args.distributed_dataloader: + return DistDataLoader( + eval_dataset, + batch_size=self.args.per_device_eval_batch_size, + collate_fn=self.data_collator, + num_workers=0, + eval=True, + ) + else: + return DataLoader( + eval_dataset, + batch_size=self.args.per_device_eval_batch_size, + collate_fn=self.data_collator, + num_workers=0, + ) eval_sampler = self._get_eval_sampler(eval_dataset) if self.args.distributed_dataloader: logger.info("Eval using DistDataLoader.") - return _DataLoader( - eval_dataset, - batch_sampler=eval_sampler, - collate_fn=self.data_collator, - num_workers=self.args.dataloader_num_workers, - ) + return DistDataLoader( + eval_dataset, + batch_sampler=eval_sampler, + collate_fn=self.data_collator, + num_workers=self.args.dataloader_num_workers, + eval=True, + ) + else: + return DataLoader( + eval_dataset, + 
batch_sampler=eval_sampler, + collate_fn=self.data_collator, + num_workers=self.args.dataloader_num_workers, + ) def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: """ @@ -1469,8 +1484,6 @@ def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: if is_datasets_available() and test_dataset is not None and isinstance(test_dataset, datasets.Dataset): test_dataset = self._remove_unused_columns(test_dataset, description="test") - _DataLoader = DistDataLoader if self.args.distributed_dataloader else DataLoader - if self._is_iterable_dataset(test_dataset): if self.args.dataset_world_size > 1: test_dataset = IterableDatasetShard( @@ -1481,25 +1494,42 @@ def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: process_index=self.args.dataset_rank, ) - return _DataLoader( - test_dataset, - batch_size=self.args.per_device_eval_batch_size * self.world_size, - collate_fn=self.data_collator, # _get_collator_with_removed_columns - num_workers=self.args.dataloader_num_workers, - ) + if self.args.distributed_dataloader: + return DistDataLoader( + test_dataset, + batch_size=self.args.per_device_eval_batch_size * self.world_size, + collate_fn=self.data_collator, # _get_collator_with_removed_columns + num_workers=0, + eval=True, + ) + else: + return DataLoader( + test_dataset, + batch_size=self.args.per_device_eval_batch_size * self.world_size, + collate_fn=self.data_collator, # _get_collator_with_removed_columns + num_workers=0, + ) test_sampler = self._get_eval_sampler(test_dataset) if self.args.distributed_dataloader: logger.info("Test using DistDataLoader.") - # We use the same batch_size as for eval. - return _DataLoader( - test_dataset, - batch_sampler=test_sampler, - collate_fn=self.data_collator, - drop_last=self.args.dataloader_drop_last, - ) + # We use the same batch_size as for eval. 
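
Both loaders above now branch the same way: `DistDataLoader(..., eval=True)` when `distributed_dataloader` is enabled, a plain `paddle.io.DataLoader` otherwise. Condensed into one helper (import path and argument plumbing are assumptions for the sketch):

from paddle.io import DataLoader

from paddlenlp.data import DistDataLoader  # assumed export path

def build_eval_dataloader(args, dataset, sampler, collate_fn):
    kwargs = dict(
        batch_sampler=sampler,
        collate_fn=collate_fn,
        num_workers=args.dataloader_num_workers,
    )
    if args.distributed_dataloader:
        # eval=True makes the loader broadcast batches to every pipeline stage
        return DistDataLoader(dataset, eval=True, **kwargs)
    return DataLoader(dataset, **kwargs)
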
+ return DistDataLoader( + test_dataset, + batch_sampler=test_sampler, + collate_fn=self.data_collator, + drop_last=self.args.dataloader_drop_last, + eval=True, + ) + else: + return DataLoader( + test_dataset, + batch_sampler=test_sampler, + collate_fn=self.data_collator, + drop_last=self.args.dataloader_drop_last, + ) def create_optimizer_and_scheduler(self, num_training_steps: int): """ diff --git a/paddlenlp/trainer/utils/helper.py b/paddlenlp/trainer/utils/helper.py index ff68e51f127b..25f593f71e35 100644 --- a/paddlenlp/trainer/utils/helper.py +++ b/paddlenlp/trainer/utils/helper.py @@ -16,8 +16,6 @@ # This file is modified from # https://github.com/huggingface/transformers/blob/main/src/transformers -import collections -import copy import os from typing import Any, Optional @@ -27,6 +25,11 @@ from paddle.distributed import fleet from paddlenlp.utils.log import logger +from paddlenlp.utils.nested import ( + nested_broadcast_tensor, + nested_empty_tensor, + nested_reduce_tensor, +) __all__ = [ "distributed_concat", @@ -180,52 +183,6 @@ def distributed_file(filename): return filename -TensorHolder = collections.namedtuple("TensorHolder", ["shape", "dtype", "name"]) - - -def nested_reduce_tensor(tensor): - if isinstance(tensor, dict): - # copy tensor since it will be inplace modified dict - tensor = copy.copy(tensor) - for key in list(tensor.keys()): - tensor[key] = nested_reduce_tensor(tensor[key]) - if isinstance(tensor, (tuple, list)): - return type(tensor)(nested_reduce_tensor(t) for t in tensor) - - if isinstance(tensor, paddle.Tensor): - return TensorHolder(tensor.shape, tensor.dtype, tensor.name) - - return tensor - - -def nested_empty_tensor(tensor): - if isinstance(tensor, dict): - for key in list(tensor.keys()): - tensor[key] = nested_empty_tensor(tensor[key]) - if isinstance(tensor, list): - return type(tensor)(nested_empty_tensor(t) for t in tensor) - - # TensorHolder is tuple - if isinstance(tensor, TensorHolder): - t = paddle.empty(tensor.shape, dtype=tensor.dtype, name=tensor.name) - t.name = tensor.name - return t - - return tensor - - -def nested_broadcast_tensor(tensor, src=0, group=None): - if isinstance(tensor, dict): - for key in list(tensor.keys()): - tensor[key] = nested_broadcast_tensor(tensor[key], src=src, group=group) - if isinstance(tensor, list): - return type(tensor)(nested_broadcast_tensor(t, src=src, group=group) for t in tensor) - - if isinstance(tensor, paddle.Tensor): - paddle.distributed.broadcast(tensor, src=src, group=group, sync_op=True) - return tensor - - def broadcast_dp_optimizer(state_dict): if paddle.distributed.get_world_size() <= 1: return state_dict diff --git a/paddlenlp/utils/nested.py b/paddlenlp/utils/nested.py new file mode 100644 index 000000000000..27942b8cb256 --- /dev/null +++ b/paddlenlp/utils/nested.py @@ -0,0 +1,83 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import collections +import copy + +import paddle + +TensorHolder = collections.namedtuple("TensorHolder", ["shape", "dtype", "name"]) + + +def nested_reduce_tensor(tensor): + if isinstance(tensor, dict): + # copy tensor since it will be inplace modified dict + tensor = copy.copy(tensor) + for key in list(tensor.keys()): + tensor[key] = nested_reduce_tensor(tensor[key]) + if isinstance(tensor, (tuple, list)): + return type(tensor)(nested_reduce_tensor(t) for t in tensor) + + if isinstance(tensor, paddle.Tensor): + return TensorHolder(tensor.shape, tensor.dtype, tensor.name) + + return tensor + + +def nested_empty_tensor(tensor): + if isinstance(tensor, dict): + for key in list(tensor.keys()): + tensor[key] = nested_empty_tensor(tensor[key]) + if isinstance(tensor, list): + return type(tensor)(nested_empty_tensor(t) for t in tensor) + + # TensorHolder is tuple + if isinstance(tensor, TensorHolder): + t = paddle.empty(tensor.shape, dtype=tensor.dtype, name=tensor.name) + t.name = tensor.name + return t + + return tensor + + +def nested_broadcast_tensor(tensor, src=0, group=None): + if isinstance(tensor, dict): + for key in list(tensor.keys()): + tensor[key] = nested_broadcast_tensor(tensor[key], src=src, group=group) + if isinstance(tensor, list): + return type(tensor)(nested_broadcast_tensor(t, src=src, group=group) for t in tensor) + + if isinstance(tensor, paddle.Tensor): + paddle.distributed.broadcast(tensor, src=src, group=group, sync_op=True) + return tensor + + +def nested_copy(inputs): + if isinstance(inputs, dict): + outputs = {} + for key in list(inputs.keys()): + outputs[key] = nested_copy(inputs[key]) + return outputs + return inputs + + +def nested_copy_place(inputs, place=None, blocking=False): + if isinstance(inputs, dict): + outputs = {} + for key in list(inputs.keys()): + outputs[key] = nested_copy_place(inputs[key], place, blocking) + return outputs + if isinstance(inputs, paddle.Tensor): + inputs = inputs if inputs.place == place else inputs._copy_to(place, blocking) + return inputs From fc860a3289804fbaf197d12c6d858d0d79e741af Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Thu, 16 May 2024 17:09:22 +0800 Subject: [PATCH 15/27] Fix load RNG compatibility. (#8451) --- paddlenlp/trainer/trainer.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py index bf83420acf85..e1d59e4cb747 100644 --- a/paddlenlp/trainer/trainer.py +++ b/paddlenlp/trainer/trainer.py @@ -1591,16 +1591,13 @@ def _load_rng_state(self, checkpoint): if os.path.isfile(rng_file): rng_file_list = paddle.load(rng_file, return_numpy=True) paddle.distributed.broadcast_object_list(rng_file_list, src=0) - # if rng_file_list still empty, then use old style rng_state + # if rng_file_list still empty, not log rng state. if rng_file_list[0] is None: - rng_file = os.path.join(checkpoint, f"rng_state_{process_index}.pth") - if not os.path.isfile(rng_file): - logger.info( - f"Didn't find an RNG file for process {process_index}, if you are resuming a training that " - "wasn't launched in a distributed fashion, reproducibility is not guaranteed." - ) - return - checkpoint_rng_state = paddle.load(rng_file, return_numpy=True) + logger.info( + f"Didn't find an RNG file for process {process_index}, if you are resuming a training that " + "wasn't launched in a distributed fashion, reproducibility is not guaranteed." 
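
The helpers above move the metadata-then-tensor broadcast pattern into `paddlenlp.utils.nested`. A sketch of how they compose (assumes an initialized distributed environment; the wrapper function is illustrative):

import paddle.distributed as dist

from paddlenlp.utils.nested import (
    nested_broadcast_tensor,
    nested_empty_tensor,
    nested_reduce_tensor,
)

def broadcast_batch(batch, src=0, group=None):
    # rank `src` turns its dict of tensors into lightweight TensorHolder
    # metadata; every other rank receives that metadata as a Python object.
    holder = [nested_reduce_tensor(batch)] if dist.get_rank() == src else [None]
    dist.broadcast_object_list(holder, src=src, group=group)
    if dist.get_rank() != src:
        batch = nested_empty_tensor(holder[0])  # allocate tensors from shape/dtype
    # finally broadcast the actual tensor values into the allocated buffers
    return nested_broadcast_tensor(batch, src=src, group=group)
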
+ ) + return else: checkpoint_rng_state = rng_file_list[process_index] else: From 08898bf1e0429db3da6d0b3e8a95e8b7d8c817d7 Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Mon, 20 May 2024 13:05:30 +0800 Subject: [PATCH 16/27] Cherry-Pick fast_safe_open (#8458) * [Performance] Optimize unified checkpoint save/load speed. (#8204) * opt unified checkpoint save/load speed. --- .../trainer/plugins/unified_checkpoint.py | 51 +-- paddlenlp/trainer/trainer.py | 1 + paddlenlp/transformers/conversion_utils.py | 16 +- paddlenlp/transformers/model_utils.py | 51 ++- paddlenlp/utils/safetensors.py | 312 ++++++++++++++++++ tests/trainer/test_unified_checkpoint.py | 40 +++ .../clap/test_feature_extraction.py | 1 + .../ernie_vil/test_image_processing.py | 1 + .../speecht5/test_feature_extraction.py | 1 + tests/transformers/test_safetensors.py | 57 ++++ 10 files changed, 490 insertions(+), 41 deletions(-) create mode 100644 paddlenlp/utils/safetensors.py create mode 100644 tests/transformers/test_safetensors.py diff --git a/paddlenlp/trainer/plugins/unified_checkpoint.py b/paddlenlp/trainer/plugins/unified_checkpoint.py index 9a14ebba2882..a8e1199a59b8 100644 --- a/paddlenlp/trainer/plugins/unified_checkpoint.py +++ b/paddlenlp/trainer/plugins/unified_checkpoint.py @@ -30,6 +30,7 @@ from paddlenlp.transformers.model_utils import ( PretrainedModel, _load_state_dict_into_model, + faster_set_state_dict, get_parameter_dtype, load_state_dict, unwrap_model, @@ -65,9 +66,10 @@ from paddlenlp.utils.nested import nested_copy, nested_copy_place if is_safetensors_available(): - from safetensors import safe_open + # from safetensors import safe_open from safetensors.numpy import save_file as safe_save_file + from paddlenlp.utils.safetensors import fast_safe_open as safe_open FP32_MASTER = "fp32_master_0" optimizer_scalar_name = [ @@ -91,6 +93,11 @@ async_save_queue = [] +DEST_PLACE = paddle.CPUPlace() +if paddle.device.is_compiled_with_cuda(): + DEST_PLACE = paddle.CUDAPinnedPlace() + + class UnifiedCheckpointOption(ExplicitEnum): """ "- skip_save_model_weight: do not save model weights when the masters weight exist\n" @@ -196,7 +203,6 @@ def load_unified_checkpoint(args, model, optimizer, resume_from_checkpoint: str, Returns: None """ - if paddle.distributed.get_world_size() <= 1: load_single_card_checkpoint(args, model, resume_from_checkpoint) return @@ -222,7 +228,6 @@ def load_unified_checkpoint_locally(args, model, resume_from_checkpoint: str, sa pretrained_model_name_or_path=resume_from_checkpoint, index_filename=os.path.join(resume_from_checkpoint, index_filename), ) - loaded_keys = sharded_metadata["all_checkpoint_keys"] model_state_dict = get_expected_state_dict(model) @@ -266,7 +271,9 @@ def _remove_unused_keys( else: tp_actions = model.get_tensor_parallel_convert_actions(model.config, loaded_keys, ignore_error=True) # Here we use expected_keys to optimize weights loading for pipeline model. 
Only works for safetensors - state_dict = load_state_dict(shard_file, tp_actions if pre_tensor_parallel_split else None, expected_keys) + state_dict = load_state_dict( + shard_file, tp_actions if pre_tensor_parallel_split else None, expected_keys, device="expected" + ) if not pre_tensor_parallel_split: # Since we load all keys but we only need one of pipeline stages @@ -279,11 +286,12 @@ def _remove_unused_keys( None, model.config, state_dict=state_dict, ignore_error=len(resolved_archive_file) > 1 ) - error_msgs += _load_state_dict_into_model(model, state_dict, "") + # error_msgs += _load_state_dict_into_model(model, state_dict, "") + error_msgs += faster_set_state_dict(model, state_dict, strict_dtype=False) # force memory release del state_dict - gc.collect() + # gc.collect() if len(error_msgs) > 0: error_msg = "\n\t".join(error_msgs) @@ -337,6 +345,7 @@ def unified_checkpoint_into_shards( tp_actions = model_to_save.get_tensor_parallel_convert_actions( model_to_save.config, state_dict.keys(), is_split=False, ignore_error=True ) + logger.info("Unified model tensor parallel weights in shards") state_dict = merge_tensor_parallel_with_shard(state_dict, tp_actions, all_filter_keys) # build index json file @@ -490,6 +499,7 @@ def load_unified_optimizer_locally(args, model, optimizer, resume_from_checkpoin # This should always be a list but, just to be sure. if not isinstance(resolved_archive_file, list): resolved_archive_file = [resolved_archive_file] + if len(resolved_archive_file) > 1: resolved_archive_file = tqdm(resolved_archive_file, desc="Loading optimizer shards") @@ -537,10 +547,10 @@ def load_resolved_archive_file(resolved_archive_file, sharded_metadata, expected tp_actions = mapping_optimizer_tp_actions(tp_actions, expected_keys) # Here we use expected_keys to optimize weights loading for pipeline model. 
Only works for safetensors - state_dict = load_state_dict(shard_file, tp_actions, expected_keys) + state_dict = load_state_dict(shard_file, tp_actions, expected_keys, device="expected") else: # for pipeline model, we don't need to use tp_actions - state_dict = load_state_dict(shard_file, None, expected_keys) + state_dict = load_state_dict(shard_file, None, expected_keys, device="expected") returned_state_dict.update(state_dict) # force memory release @@ -553,7 +563,6 @@ def load_resolved_archive_file(resolved_archive_file, sharded_metadata, expected state_dict_master_weight = load_resolved_archive_file( resolved_archive_file_mw, sharded_metadata_mw, expected_keys_mw, is_master_weights=True ) - # rename optimizer param for key in list(state_dict_optim.keys()): key_name = key.split("/") @@ -562,13 +571,13 @@ def load_resolved_archive_file(resolved_archive_file, sharded_metadata, expected key_name = "_".join([static_name, FP32_MASTER, key_name[1]]) else: key_name = "_".join([static_name, key_name[1]]) - returned_optim_state_dict[key_name] = state_dict_optim[key] + returned_optim_state_dict[key_name] = state_dict_optim.pop(key) returned_optim_state_dict[key_name].name = key_name if has_master_weights: for key in list(state_dict_master_weight.keys()): static_name = struct2static_name_mappings[key] - returned_optim_state_dict["master_weights"][static_name] = state_dict_master_weight[key] + returned_optim_state_dict["master_weights"][static_name] = state_dict_master_weight.pop(key) returned_optim_state_dict["master_weights"][static_name].name = "_".join([static_name, FP32_MASTER]) returned_optim_state_dict = nested_copy_place( @@ -640,6 +649,7 @@ def unified_optimizer_into_shards( tp_actions = model.get_tensor_parallel_convert_actions( model.config, model_keys, is_split=False, ignore_error=True ) + logger.info("Unified optimizer tensor parallel in shards") optim_state_dict = merge_tensor_parallel_for_optimizer( optim_state_dict, tp_actions, @@ -648,6 +658,7 @@ def unified_optimizer_into_shards( paddle.device.cuda.empty_cache() if master_weights is not None: + logger.info("Unified master weight tensor parallel in shards") master_weights = merge_tensor_parallel_for_optimizer( master_weights, tp_actions, @@ -703,7 +714,6 @@ def unified_optimizer_into_shards( def check_unified_checkpoint(args, model, resume_from_checkpoint, safe_serialization=False): index_filename = select_model_weight_index(args, model, resume_from_checkpoint, safe_serialization, local=False) index_filename = os.path.join(resume_from_checkpoint, index_filename) - # Find index json file and distribute this file in global group. if distributed_isfile(index_filename): distributed_file(index_filename) @@ -1605,7 +1615,9 @@ def gather_sharded_object(index_file, total_size, is_optimizer=False): tp_group = hcg.get_model_parallel_group() pp_group = hcg.get_pipe_parallel_group() - logger.info("Unified checkpoint generating sharded_index json files.") + logger.info( + f"Unified checkpoint: generating sharded_index json files for {'optimizer or master weight' if is_optimizer else 'model weight'}." 
+ ) if tp_group.nranks > 1: dist.all_gather_object(index_file_list, index_file, tp_group) @@ -1714,8 +1726,6 @@ def filter_params(model_to_save, state_dict, is_optimizer=False): def merge_tensor_parallel_with_shard(state_dict, tp_actions, all_filter_keys): - logger.info("Unified checkpoint merge tensor parallel in shards") - hcg = fleet.get_hybrid_communicate_group() tp_group = hcg.get_model_parallel_group() tp_rank = tp_group.rank @@ -1741,7 +1751,7 @@ def merge_tensor_parallel_with_shard(state_dict, tp_actions, all_filter_keys): action = tp_actions.pop(key) tensor = action(ret) if is_dst else None else: - tensor = tensor._copy_to(paddle.CPUPlace(), False) if is_dst else None + tensor = tensor._copy_to(DEST_PLACE, False) if is_dst else None if is_dst: state_dict_to_save[key] = tensor @@ -1754,8 +1764,7 @@ def merge_tensor_parallel_with_shard(state_dict, tp_actions, all_filter_keys): def merge_tensor_parallel_for_optimizer(state_dict, tp_actions, all_filter_keys): - logger.info("Unified optimizer tensor parallel in shards") - + # Core function for UC hcg = fleet.get_hybrid_communicate_group() tp_group = hcg.get_model_parallel_group() tp_rank = tp_group.rank @@ -1773,15 +1782,13 @@ def merge_tensor_parallel_for_optimizer(state_dict, tp_actions, all_filter_keys) if model_key in tp_actions: # for example: beta1, beta2 if tensor.numel().item() == 1: - tensor = ( - tensor._copy_to(paddle.CPUPlace(), False) if is_dst else None - ) # Need broadcast when loaded + tensor = tensor._copy_to(DEST_PLACE, False) if is_dst else None # Need broadcast when loaded else: ret = distributed_gather(tensor, dst=j, group=tp_group, offload=False) action = tp_actions[model_key] tensor = action(ret) if is_dst else None else: - tensor = tensor._copy_to(paddle.CPUPlace(), False) if is_dst else None + tensor = tensor._copy_to(DEST_PLACE, False) if is_dst else None if is_dst: state_dict_to_save[filter_keys[i]] = tensor diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py index e1d59e4cb747..746b7e252516 100644 --- a/paddlenlp/trainer/trainer.py +++ b/paddlenlp/trainer/trainer.py @@ -2419,6 +2419,7 @@ def _load_optimizer_and_scheduler(self, checkpoint): self.runtime_timer.stop() return + logger.info("Loading optimizer and scheduler...") if (not self.args.should_load_sharding_stage1_model) and self.args.ignore_load_lr_and_optim: self.runtime_timer.stop() return diff --git a/paddlenlp/transformers/conversion_utils.py b/paddlenlp/transformers/conversion_utils.py index 660e79f6a3e5..ba5169454d0b 100644 --- a/paddlenlp/transformers/conversion_utils.py +++ b/paddlenlp/transformers/conversion_utils.py @@ -285,8 +285,12 @@ def naive_fuse_merge_tp(weight_list, is_column=True, fuse_tensor_parts=2): if isinstance(weight_list[0], np.ndarray): return np.concatenate([reorder[i] for i in index], axis=axis) + else: + tensor = paddle.concat([reorder[i] for i in index], axis=axis) - return paddle.concat([reorder[i] for i in index], axis=axis)._copy_to(paddle.CPUPlace(), False) + if tensor.place.is_gpu_place(): + tensor = tensor._copy_to(paddle.CUDAPinnedPlace(), False) + return tensor def naive_fuse_split_tp( @@ -361,12 +365,18 @@ def normal_fuse_merge_tp(weight_list, is_column=True): if isinstance(weight_list[0], np.ndarray): return np.concatenate(weight_list, axis=-1) else: - return paddle.concat(weight_list, axis=-1)._copy_to(paddle.CPUPlace(), False) + tensor = paddle.concat(weight_list, axis=-1) + if tensor.place.is_gpu_place(): + tensor = tensor._copy_to(paddle.CUDAPinnedPlace(), False) + return tensor else: if 
isinstance(weight_list[0], np.ndarray): return np.concatenate(weight_list, axis=0) else: - return paddle.concat(weight_list, axis=0)._copy_to(paddle.CPUPlace(), False) + tensor = paddle.concat(weight_list, axis=0) + if tensor.place.is_gpu_place(): + tensor = tensor._copy_to(paddle.CUDAPinnedPlace(), False) + return tensor def normal_fuse_split_tp(weight, tensor_parallel_degree, tensor_parallel_rank=None, is_column=True): diff --git a/paddlenlp/transformers/model_utils.py b/paddlenlp/transformers/model_utils.py index 1ddd7e1c2913..dc1c753206c4 100644 --- a/paddlenlp/transformers/model_utils.py +++ b/paddlenlp/transformers/model_utils.py @@ -109,10 +109,13 @@ def unwrap_optimizer(optimizer, optimizer_instances=()): if is_safetensors_available(): - from safetensors import safe_open - from safetensors.numpy import load_file as safe_load_file + # from safetensors import safe_open + # from safetensors.numpy import load_file as safe_load_file from safetensors.numpy import save_file as safe_save_file + from paddlenlp.utils.safetensors import fast_load_file as safe_load_file + from paddlenlp.utils.safetensors import fast_safe_open as safe_open + def prune_linear_layer(layer: nn.Linear, index: paddle.Tensor, dim: int = 0) -> nn.Linear: """ @@ -313,7 +316,7 @@ def get_parameter_dtype(parameter: nn.Layer) -> paddle.dtype: def load_state_dict( - checkpoint_file: Union[str, os.PathLike], tensor_parallel_split_mapping=None, fliter_dict_keys=None + checkpoint_file: Union[str, os.PathLike], tensor_parallel_split_mapping=None, fliter_dict_keys=None, device="cpu" ): """ Reads a PaddlePaddle checkpoint file, returning properly formatted errors if they arise. @@ -346,11 +349,16 @@ def load_state_dict( weight = tensor_parallel_split_mapping[key](py_safe_slice_) else: weight = py_safe_slice_[:] + if device == "expected": + with device_guard(): + weight = paddle.Tensor(weight, zero_copy=True) + weight = weight._copy_to(paddle.framework._current_expected_place(), False) state_dict[key] = weight - for k in list(state_dict.keys()): - with device_guard(): - state_dict[k] = paddle.Tensor(state_dict.pop(k), zero_copy=True) + if device == "cpu": + for k in list(state_dict.keys()): + with device_guard(): + state_dict[k] = paddle.Tensor(state_dict.pop(k), zero_copy=True) return state_dict @@ -672,8 +680,10 @@ def load_sharded_checkpoint(model, folder, variant=None, strict=True, prefer_saf return missing_keys, unexpected_keys -def faster_set_state_dict(model, state_dict): +def faster_set_state_dict(model, state_dict, strict_dtype=True): # the state_dict will be destroied. + unused_keys = set(state_dict.keys()) + unset_keys = set(model.state_dict().keys()) with paddle.no_grad(): for k, v in model.state_dict().items(): if k in state_dict: @@ -683,8 +693,10 @@ def faster_set_state_dict(model, state_dict): f"faster_set_state_dict need state dict with paddle.Tensor, but got {type(v_new)}" ) # 2. cast param / Tensor to dtype + # if v.dtype != v_new.dtype: - raise ValueError(f"for key: {k}, expect dtype {v.dtype}, but got {v_new.dtype}") + if strict_dtype or (not v.is_floating_point() or not v_new.is_floating_point()): + raise ValueError(f"for key: {k}, expect dtype {v.dtype}, but got {v_new.dtype}") # check shape if list(v.shape) != list(v_new.shape): raise ValueError(f"for key: {k}, expect shape {v.shape}, but got {v_new.shape}") @@ -700,9 +712,22 @@ def faster_set_state_dict(model, state_dict): else: new_t = v_new + if not strict_dtype and v.dtype != new_t.dtype: + new_t = new_t.astype(v.dtype) + # 4. 
share Tensor to origin param / Tensor src_tensor = new_t.value().get_tensor() dst_tensor._share_data_with(src_tensor) + unset_keys.remove(k) + unused_keys.remove(k) + + error_msgs = [] + # if len(unset_keys) > 0: + # error_msgs.append(f"Those weight of model is not initialized: {list(unset_keys)}") + if len(unused_keys) > 0: + error_msgs.append(f"Those state dict keys are not using in model: {list(unused_keys)}") + + return error_msgs def _load_state_dict_into_model(model_to_load, state_dict, start_prefix): @@ -734,9 +759,8 @@ def _convert_state_dict_dtype_and_shape(state_dict, model_to_load): def is_0d_or_1d(tensor): return len(tensor.shape) == 0 or list(tensor.shape) == [1] - expected_place = paddle.framework._current_expected_place() for key, value in model_to_load.state_dict().items(): - if key in state_dict: + if key in list(state_dict.keys()): if isinstance(state_dict[key], np.ndarray): raise ValueError( "convert_state_dict_dtype expected paddle.Tensor not numpy.ndarray, plase convert numpy.ndarray to paddle.Tensor" @@ -744,12 +768,7 @@ def is_0d_or_1d(tensor): # confirm parameter cast is executed on the same device as model # TODO: cast(FP32 -> FP16) has diff on different devices, need to fix it if state_dict[key].is_floating_point() and state_dict[key].dtype != value.dtype: - value_pop = state_dict.pop(key) - value_new_place = ( - value_pop if value_pop.place == expected_place else value_pop._copy_to(expected_place, False) - ) - state_dict[key] = paddle.cast(value_new_place, value.dtype)._copy_to(value_pop.place, False) - del value_new_place + state_dict[key] = paddle.cast(state_dict.pop(key), value.dtype) # unified 0d and 1d tensor if is_0d_or_1d(value) and is_0d_or_1d(state_dict[key]): if list(value.shape) != list(state_dict[key].shape): diff --git a/paddlenlp/utils/safetensors.py b/paddlenlp/utils/safetensors.py new file mode 100644 index 000000000000..422a7d09961c --- /dev/null +++ b/paddlenlp/utils/safetensors.py @@ -0,0 +1,312 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
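+
+# Usage sketch (illustrative; the file name is a placeholder). fast_safe_open
+# mirrors the read-side interface of safetensors.safe_open, but parses the
+# header in pure Python and returns numpy arrays:
+#
+#     with fast_safe_open("model.safetensors", framework="np") as f:
+#         for name in f.keys():
+#             lazy = f.get_slice(name)      # reads only the bytes the slice needs
+#             head = lazy[:2, ...]
+#
+#     state = fast_load_file("model.safetensors")  # {name: np.ndarray} for the whole file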
+ +import copy +import json +import mmap +from collections import OrderedDict + +import numpy as np + +__all__ = [ + "fast_safe_open", + "fast_load_file", +] + + +MAX_HEADER_SIZE = 100 * 1000 * 1000 + +dtype_size = { + "BOOL": 1, + "U8": 1, + "I8": 1, + "F8_E5M2": 1, + "F8_E4M3": 1, + "I16": 2, + "U16": 2, + "I32": 4, + "U32": 4, + "I64": 8, + "U64": 8, + "F16": 2, + "BF16": 2, + "F32": 4, + "F64": 8, +} + +numpy_dtype = { + "BOOL": np.bool_, + "U8": np.uint8, + "I8": np.int8, + "F8_E5M2": 1, # no fp8 + "F8_E4M3": 1, # no fp8 + "I16": np.int16, + "U16": np.uint16, + "I32": np.int32, + "U32": np.uint32, + "I64": np.int64, + "U64": np.uint64, + "F16": np.float16, + "BF16": 2, # no bf16 + "F32": np.float32, + "F64": np.float64, +} + + +def getSize(fileobject): + fileobject.seek(0, 2) # move the cursor to the end of the file + size = fileobject.tell() + fileobject.seek(0) # move the cursor to the start of the file + return size + + +def metadata_validate(metadata): + start = 0 + for key, info in metadata.items(): + s, e = info["data_offsets"] + if s != start or e < s: + raise ValueError(f"SafeTensorError::InvalidOffset({key})") + start = e + nelements = np.prod(info["shape"]) + nbytes = nelements * dtype_size[info["dtype"]] + if (e - s) != nbytes: + raise ValueError("SafeTensorError::TensorInvalidInfo") + return start + + +def read_metadata(buffer): + buffer_len = getSize(buffer) + if buffer_len < 8: + raise ValueError("SafeTensorError::HeaderTooSmall") + + n = np.frombuffer(buffer.read(8), dtype=np.uint64).item() + if n > MAX_HEADER_SIZE: + raise ValueError("SafeTensorError::HeaderTooLarge") + + stop = n + 8 + if stop > buffer_len: + raise ValueError("SafeTensorError::InvalidHeaderLength") + + tensors = json.loads(buffer.read(n), object_pairs_hook=OrderedDict) + metadata = tensors.pop("__metadata__", None) + buffer_end = metadata_validate(tensors) + + if buffer_end + 8 + n != buffer_len: + raise ValueError("SafeTensorError::MetadataIncompleteBuffer") + + return stop, tensors, metadata + + +def readinto_numpy(meta, buffer, base_ptr): + def create_empty(info): + return np.empty(shape=info["shape"], dtype=numpy_dtype[info["dtype"]]) + + ret = {} + for k, v in meta.items(): + t = create_empty(v) + buffer.seek(base_ptr + v["data_offsets"][0]) + buffer.readinto(memoryview(t)) + ret[k] = t + return ret + + +class PySafeSlice: + def __init__(self, info, bufferfile, base_ptr, buffermmap): + self.info = info + self.bufferfile = bufferfile + self.buffermmap = buffermmap + self.base_ptr = base_ptr + + self.start = [0 for dim in self.shape] + self.stop = [dim for dim in self.shape] + self.step = [1 for dim in self.shape] + + @property + def ndim(self): + return len(self.shape) + + def __getitem__(self, index): + # https://github.com/numpy/numpy/blob/4d652465cea38e9504f954ac708d91e4954bd13a/numpy/lib/_arrayterator_impl.py#L96-L126 + # Fix index, handling ellipsis and incomplete slices. 
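+        # After the index is normalized below, the requested slice is translated
+        # into contiguous byte ranges within this tensor's region of the file;
+        # adjacent ranges are merged, read into a flat buffer (via mmap when there
+        # are many small ranges, sequential file reads otherwise), then reshaped.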
+ if not isinstance(index, tuple): + index = (index,) + fixed = [] + length, dims = len(index), self.ndim + for slice_ in index: + if slice_ is Ellipsis: + fixed.extend([slice(None)] * (dims - length + 1)) + length = len(fixed) + elif isinstance(slice_, int): + fixed.append(slice(slice_, slice_ + 1, 1)) + else: + fixed.append(slice_) + index = tuple(fixed) + if len(index) < dims: + index += (slice(None),) * (dims - len(index)) + + out_start, out_stop, out_step = copy.deepcopy((self.start, self.stop, self.step)) + for i, (start, stop, step, slice_) in enumerate(zip(self.start, self.stop, self.step, index)): + out_start[i] = slice_.start or 0 + out_step[i] = slice_.step or 1 + out_stop[i] = slice_.stop or stop - start + out_stop[i] = min(stop, out_stop[i]) + + target_shape = [] + for x, y, z in zip(out_start, out_stop, out_step): + assert z == 1, "only support step = 1" + if y - x > 1: + target_shape.append(int(y - x)) + + if len(target_shape) == 0: + if self.shape == [1]: + target_shape = self.shape + + # https://github.com/huggingface/safetensors/blob/b947b59079a6197d7930dfb535818ac4896113e8/safetensors/src/slice.rs#L297-L315 + indices = [] + span = self.bits + for i, (start, stop, step) in enumerate(zip(out_start[::-1], out_stop[::-1], out_step[::-1])): + if len(indices) == 0: + if start == 0 and stop == self.shape[i]: + pass + # We haven't started to slice yet, just increase the span + else: + offset = start * span + small_span = stop * span - offset + indices.append((offset, offset + small_span)) + + else: + capacity = (stop - start) * len(indices) + newindices = [] + for n in range(start, stop): + offset = n * span + for (old_start, old_stop) in indices: + newindices.append((old_start + offset, old_stop + offset)) + indices = newindices + assert len(indices) == capacity, f"error {capacity} {len(indices)}" + span *= self.shape[-(i + 1)] + + if len(indices) == 0: + indices.append((0, self.nbytes)) + + merge_indices = [] + last_end = -1 + last_start = -1 + for start, end in indices: + if start == last_end: + last_end = end + continue + else: + if last_start != -1: + merge_indices.append((last_start, last_end)) + last_start = start + last_end = end + if last_start != -1: + merge_indices.append((last_start, last_end)) + tensor = np.empty(shape=[1] if len(target_shape) == 0 else np.prod(target_shape), dtype=self.dtype) + + tensor_view = memoryview(tensor.view(np.uint8).reshape(-1)) + curr_data_ptr = 0 + # if to many slice and each slice < 1M + if len(merge_indices) > 128 and (merge_indices[0][1] - merge_indices[0][0] < 1024 * 1024): + # Use mmap for random access + for start, end in merge_indices: + data_len = end - start + tensor_view[curr_data_ptr : curr_data_ptr + data_len] = self.buffermmap[ + self.start_offset + start : self.start_offset + end + ] + curr_data_ptr += data_len + else: + # Use file read for sequence access + for start, end in merge_indices: + data_len = end - start + self.bufferfile.seek(self.start_offset + start) + view = tensor_view[curr_data_ptr : curr_data_ptr + data_len] + self.bufferfile.readinto(view) + curr_data_ptr += data_len + + return tensor.reshape(target_shape) + + def get(self, *args, **kwargs): + tensor = np.empty(shape=self.shape, dtype=self.dtype) + self.bufferfile.seek(self.start_offset) + self.bufferfile.readinto(memoryview(tensor)) + return tensor + + @property + def start_offset(self): + return self.base_ptr + self.info["data_offsets"][0] + + def get_shape(self): + return self.shape + + @property + def shape(self): + return self.info["shape"] + + 
@property + def dtype(self): + return numpy_dtype[self.info["dtype"]] + + @property + def nelements(self): + return np.prod(self.info["shape"]) + + @property + def bits(self): + return dtype_size[self.info["dtype"]] + + @property + def nbytes(self): + return self.nelements * dtype_size[self.info["dtype"]] + + +# a simple file writer object +class fast_safe_open: + def __init__(self, filename, framework=None, device="cpu"): + self.filename = filename + self.framework = framework + self.file = open(self.filename, "rb") + self.file_mmap = mmap.mmap(self.file.fileno(), 0, flags=mmap.MAP_PRIVATE) + self.base, self.tensors_decs, self.__metadata__ = read_metadata(self.file) + self.tensors = OrderedDict() + for key, info in self.tensors_decs.items(): + self.tensors[key] = PySafeSlice(info, self.file, self.base, self.file_mmap) + self.tensors[key].key = key + + def __enter__(self): + return self + + def __exit__(self, *args): + self.file_mmap.close() + self.file.close() + + def metadata(self): + return self.__metadata__ + + def keys(self): + return list(self.tensors.keys()) + + def get_tensor(self, name): + return self.tensors[name].get() + + def get_slice(self, name): + return self.tensors[name] + + +def fast_load_file(filename): + result = {} + with fast_safe_open(filename, framework="np") as f: + for k in f.keys(): + result[k] = f.get_tensor(k) + return result diff --git a/tests/trainer/test_unified_checkpoint.py b/tests/trainer/test_unified_checkpoint.py index 9b91905841af..f8cc0ed7bfac 100644 --- a/tests/trainer/test_unified_checkpoint.py +++ b/tests/trainer/test_unified_checkpoint.py @@ -48,6 +48,7 @@ "Flags_skip_mp_c_identity": "1", "FLAGS_shard_norm_align_dp": "0", "FLAGS_shard_use_reduce": "1", + "FLAGS_eager_communication_connection": "1", # no lazy init comm group "test_ci_no_save_model": "1", } @@ -1137,3 +1138,42 @@ def runfrist(self, train_args): def rerun(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) + + +@pytest.mark.skipif(True, reason="Skip for None CE") +class TestUnifiedCheckpointOnN1C8SaveLoadSpeed(TestUnifiedCheckpointFull): + def setUp(self): + super().setUp() + for config_key in self.configs: + self.configs[config_key]["skip_profile_timer"] = 0 + self.configs[config_key]["unified_checkpoint"] = 1 + self.configs[config_key]["save_steps"] = 6 + self.configs[config_key]["unified_checkpoint_config"] = "skip_save_model_weight master_weight_compatible" + + self.need_allclose = False + self.rtol = 1e-7 + + def runfrist(self, train_args): + self.run_n1c8(self.run_pretrain_file, log_dir="log_uc", **train_args) + + def rerun(self, train_args): + self.run_n1c8(self.run_pretrain_file, log_dir="log_uc", **train_args) + + +@pytest.mark.skipif(True, reason="Skip for None CE") +class TestPaddleCheckpointOnN1C8SaveLoadSpeed(TestUnifiedCheckpointFull): + def setUp(self): + super().setUp() + for config_key in self.configs: + self.configs[config_key]["skip_profile_timer"] = 0 + self.configs[config_key]["unified_checkpoint"] = 0 + self.configs[config_key]["save_steps"] = 6 + + self.need_allclose = False + self.rtol = 1e-7 + + def runfrist(self, train_args): + self.run_n1c8(self.run_pretrain_file, log_dir="log_pd", **train_args) + + def rerun(self, train_args): + self.run_n1c8(self.run_pretrain_file, log_dir="log_pd", **train_args) diff --git a/tests/transformers/clap/test_feature_extraction.py b/tests/transformers/clap/test_feature_extraction.py index 413f69276e0d..d78e476d14b5 100644 --- a/tests/transformers/clap/test_feature_extraction.py +++ 
b/tests/transformers/clap/test_feature_extraction.py @@ -68,6 +68,7 @@ def __init__( self.feature_size = feature_size self.chunk_length = chunk_length self.hop_length = hop_length + super().__init__() def prepare_feat_extract_dict(self): return { diff --git a/tests/transformers/ernie_vil/test_image_processing.py b/tests/transformers/ernie_vil/test_image_processing.py index 0f224ec951b7..d95217505902 100644 --- a/tests/transformers/ernie_vil/test_image_processing.py +++ b/tests/transformers/ernie_vil/test_image_processing.py @@ -58,6 +58,7 @@ def __init__( self.image_mean = image_mean self.image_std = image_std self.do_convert_rgb = do_convert_rgb + super().__init__() def prepare_image_processor_dict(self): return { diff --git a/tests/transformers/speecht5/test_feature_extraction.py b/tests/transformers/speecht5/test_feature_extraction.py index 067108b9c948..b2f63b87a972 100644 --- a/tests/transformers/speecht5/test_feature_extraction.py +++ b/tests/transformers/speecht5/test_feature_extraction.py @@ -81,6 +81,7 @@ def __init__( self.fmax = fmax self.mel_floor = mel_floor self.return_attention_mask = return_attention_mask + super().__init__() def prepare_feat_extract_dict(self): return { diff --git a/tests/transformers/test_safetensors.py b/tests/transformers/test_safetensors.py new file mode 100644 index 000000000000..3c143e26a0b5 --- /dev/null +++ b/tests/transformers/test_safetensors.py @@ -0,0 +1,57 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
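+
+# These tests cross-check the pure-numpy fast loader against the reference
+# safetensors implementation: fast_load_file() must match safetensors
+# load_file() exactly, and slices taken through fast_safe_open.get_slice()
+# must match plain numpy indexing on the original arrays.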
+ +import os +import tempfile +import unittest + +import numpy as np + +# from safetensors import safe_open +from safetensors.numpy import load_file, save_file + +from paddlenlp.utils.safetensors import fast_load_file, fast_safe_open + + +class FastSafetensors(unittest.TestCase): + def setUp(self): + super().setUp() + self.weigth_map = {} + tensors = [([10, 10], "float32"), ([8], "float16"), ([5, 5, 5], "int32")] + count = 0 + for shape, dtype in tensors: + self.weigth_map[f"weight_{count}"] = (np.random.random(shape) * 100).astype(dtype) + count += 1 + print(self.weigth_map) + + def test_load_file(self): + with tempfile.TemporaryDirectory() as tmpdirname: + path = os.path.join(tmpdirname, "test.safetensors") + save_file(self.weigth_map, path, metadata={"format": "np"}) + sf_load = load_file(path) + fs_sf_load = fast_load_file(path) + for k, v in self.weigth_map.items(): + np.testing.assert_equal(v, sf_load[k]) + np.testing.assert_equal(v, fs_sf_load[k]) + + def test_safe_open(self): + with tempfile.TemporaryDirectory() as tmpdirname: + path = os.path.join(tmpdirname, "test.safetensors") + save_file(self.weigth_map, path, metadata={"format": "np"}) + + with fast_safe_open(path, framework="np") as f: + for key in f.keys(): + safe_slice = f.get_slice(key) + np.testing.assert_equal(self.weigth_map[key][:2, ...], safe_slice[:2, ...]) + np.testing.assert_equal(self.weigth_map[key][..., :4], safe_slice[..., :4]) From 7a24bccfd15348e818036c56335fccf984fd95d5 Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Tue, 21 May 2024 11:24:11 +0800 Subject: [PATCH 17/27] Cherry pick type promotion fix. (#8463) --- paddlenlp/generation/utils.py | 4 +++- paddlenlp/layers/crf.py | 2 +- paddlenlp/metrics/perplexity.py | 2 +- paddlenlp/prompt/verbalizer.py | 2 +- paddlenlp/transformers/convbert/modeling.py | 4 +++- paddlenlp/transformers/electra/modeling.py | 8 ++++++-- paddlenlp/transformers/funnel/modeling.py | 4 ++-- paddlenlp/transformers/gptj/modeling.py | 2 +- paddlenlp/transformers/mbart/modeling.py | 2 +- paddlenlp/transformers/megatronbert/modeling.py | 2 +- paddlenlp/transformers/prophetnet/modeling.py | 16 ++++++---------- paddlenlp/transformers/rembert/modeling.py | 2 +- 12 files changed, 27 insertions(+), 23 deletions(-) diff --git a/paddlenlp/generation/utils.py b/paddlenlp/generation/utils.py index 625b81d765ff..f5abb5e25604 100644 --- a/paddlenlp/generation/utils.py +++ b/paddlenlp/generation/utils.py @@ -511,7 +511,9 @@ def update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder def update_scores_for_generation(scores, next_scores, length, unfinished_flag): # update scores - unfinished_scores = (scores * length + next_scores) / (length + 1) + unfinished_scores = (scores * paddle.to_tensor(length, dtype=scores.dtype) + next_scores) / ( + paddle.to_tensor(length, dtype=scores.dtype) + 1 + ) scores = paddle.where(unfinished_flag, unfinished_scores, scores) return scores diff --git a/paddlenlp/layers/crf.py b/paddlenlp/layers/crf.py index aaaec528ca5f..fb562653426f 100644 --- a/paddlenlp/layers/crf.py +++ b/paddlenlp/layers/crf.py @@ -165,7 +165,7 @@ def _point_score(self, inputs, labels, lengths): flattened_inputs = inputs.reshape([-1]) offsets = paddle.unsqueeze(self._get_batch_index(batch_size) * seq_len * n_labels, 1) offsets += paddle.unsqueeze(self._get_seq_index(seq_len) * n_labels, 0) - flattened_tag_indices = paddle.reshape(offsets + labels, [-1]) + flattened_tag_indices = paddle.reshape(offsets + labels.astype(offsets.dtype), [-1]) scores = 
paddle.gather(flattened_inputs, flattened_tag_indices).reshape([batch_size, seq_len]) diff --git a/paddlenlp/metrics/perplexity.py b/paddlenlp/metrics/perplexity.py index 905518f36db9..a785d3780561 100644 --- a/paddlenlp/metrics/perplexity.py +++ b/paddlenlp/metrics/perplexity.py @@ -92,7 +92,7 @@ def compute(self, pred, label, seq_mask=None): ce = F.cross_entropy(input=pred, label=label, reduction="none", soft_label=False) ce = paddle.squeeze(ce, axis=[2]) if seq_mask is not None: - ce = ce * seq_mask + ce = ce * seq_mask.astype(ce.dtype) word_num = paddle.sum(seq_mask) return ce, word_num return ce diff --git a/paddlenlp/prompt/verbalizer.py b/paddlenlp/prompt/verbalizer.py index 637a37001559..174a863808b6 100644 --- a/paddlenlp/prompt/verbalizer.py +++ b/paddlenlp/prompt/verbalizer.py @@ -162,7 +162,7 @@ def aggregate(self, outputs: Tensor, mask: Tensor, atype: str): Aggregate multiple tokens/words for each word/label. """ if atype == "mean": - outputs = outputs * mask + outputs = outputs * mask.astype(outputs.dtype) outputs = outputs.sum(axis=-1) / (mask.sum(axis=-1) + 1e-15) elif atype == "max": outputs = (outputs - 1e4 * (1 - mask)).max(axis=-1) diff --git a/paddlenlp/transformers/convbert/modeling.py b/paddlenlp/transformers/convbert/modeling.py index d5ec8e843c2a..c9884e5a7383 100644 --- a/paddlenlp/transformers/convbert/modeling.py +++ b/paddlenlp/transformers/convbert/modeling.py @@ -1137,7 +1137,9 @@ def update_inputs(self, sequence, updates, positions): N = positions.shape[1] assert N == L, "the dimension of inputs and mask should be same as [batch_size, sequence_length]" - updated_sequence = ((paddle.ones_like(sequence) - positions) * sequence) + (positions * updates) + updated_sequence = ((paddle.ones_like(sequence) - positions) * sequence) + ( + positions * updates.astype(positions.dtype) + ) return updated_sequence diff --git a/paddlenlp/transformers/electra/modeling.py b/paddlenlp/transformers/electra/modeling.py index b3b0b67c1a3d..03412fd7e39e 100644 --- a/paddlenlp/transformers/electra/modeling.py +++ b/paddlenlp/transformers/electra/modeling.py @@ -1051,7 +1051,9 @@ def get_discriminator_inputs(self, inputs, raw_inputs, generator_logits, generat mask_positions = paddle.where(generator_labels == -100, umask_positions, mask_positions) updated_inputs = self.update_inputs(inputs, sampled_tokids, mask_positions) # use inputs and updated_input to get discriminator labels - labels = mask_positions * (paddle.ones_like(inputs) - paddle.equal(updated_inputs, raw_inputs).astype("int64")) + labels = mask_positions * ( + paddle.ones_like(inputs) - paddle.equal(updated_inputs, raw_inputs).astype(raw_inputs.dtype) + ) return updated_inputs, labels, sampled_tokids def sample_from_softmax(self, logits, use_softmax_sample=True): @@ -1073,7 +1075,9 @@ def update_inputs(self, sequence, updates, positions): N = positions.shape[1] assert N == L, "the dimension of inputs and mask should be same as [B, L]" - updated_sequence = ((paddle.ones_like(sequence) - positions) * sequence) + (positions * updates) + updated_sequence = ((paddle.ones_like(sequence) - positions) * sequence) + ( + positions * updates.astype(positions.dtype) + ) return updated_sequence diff --git a/paddlenlp/transformers/funnel/modeling.py b/paddlenlp/transformers/funnel/modeling.py index 5952363a44b1..7dc097ef68e0 100644 --- a/paddlenlp/transformers/funnel/modeling.py +++ b/paddlenlp/transformers/funnel/modeling.py @@ -519,7 +519,7 @@ def relative_positional_attention(self, position_embeds, q_head, context_len, cl 
positional_attn = _relative_shift_gather(positional_attn, context_len, shift) if cls_mask is not None: - positional_attn *= cls_mask + positional_attn *= cls_mask.astype(positional_attn.dtype) return positional_attn def relative_token_type_attention(self, token_type_mat, q_head, cls_mask=None): @@ -547,7 +547,7 @@ def relative_token_type_attention(self, token_type_mat, q_head, cls_mask=None): ) if cls_mask is not None: - token_type_attn *= cls_mask + token_type_attn *= cls_mask.astype(token_type_attn.dtype) return token_type_attn def forward(self, query, key, value, attention_inputs, output_attentions=False): diff --git a/paddlenlp/transformers/gptj/modeling.py b/paddlenlp/transformers/gptj/modeling.py index 86207866a5dd..df8ea5e7f1e2 100644 --- a/paddlenlp/transformers/gptj/modeling.py +++ b/paddlenlp/transformers/gptj/modeling.py @@ -158,7 +158,7 @@ def _attn( if attention_mask is not None: # Apply the attention mask - attn_weights = attn_weights + attention_mask + attn_weights = attn_weights + attention_mask.astype(attn_weights.dtype) attn_weights = paddle.nn.functional.softmax(attn_weights, axis=-1) attn_weights = attn_weights.astype(value.dtype) diff --git a/paddlenlp/transformers/mbart/modeling.py b/paddlenlp/transformers/mbart/modeling.py index d401554fde3d..28c4d577ebd7 100644 --- a/paddlenlp/transformers/mbart/modeling.py +++ b/paddlenlp/transformers/mbart/modeling.py @@ -63,7 +63,7 @@ def shift_tokens_right(input_ids, pad_token_id): batch_size, seq_length = paddle.shape(shifted_input_ids) index = paddle.arange(0, batch_size, 1, dtype="int32") * seq_length index_of_eos = paddle.cast(shifted_input_ids != pad_token_id, dtype="int32").sum(axis=-1) - 1 - decoder_start_tokens = paddle.gather(input_flat, index + index_of_eos) + decoder_start_tokens = paddle.gather(input_flat, index + index_of_eos.astype(index.dtype)) shifted_input_ids[:, 1:] = shifted_input_ids[:, :-1].clone() shifted_input_ids[:, 0] = decoder_start_tokens return shifted_input_ids diff --git a/paddlenlp/transformers/megatronbert/modeling.py b/paddlenlp/transformers/megatronbert/modeling.py index 6536080cd982..85f002d84b2a 100644 --- a/paddlenlp/transformers/megatronbert/modeling.py +++ b/paddlenlp/transformers/megatronbert/modeling.py @@ -171,7 +171,7 @@ def forward(self, hidden_states, attention_mask=None): attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in MegatronBertModel forward() function) - attention_scores = attention_scores + attention_mask + attention_scores = attention_scores + attention_mask.astype(attention_scores.dtype) # Normalize the attention scores to probabilities. 
attention_probs = nn.functional.softmax(attention_scores, axis=-1) diff --git a/paddlenlp/transformers/prophetnet/modeling.py b/paddlenlp/transformers/prophetnet/modeling.py index 9c251078f8c4..0baf3a7b36c5 100644 --- a/paddlenlp/transformers/prophetnet/modeling.py +++ b/paddlenlp/transformers/prophetnet/modeling.py @@ -71,12 +71,9 @@ def compute_relative_buckets(num_buckets, max_distance, relative_positions, is_b ) inv_relative_positions = paddle.abs(inv_relative_positions) else: - inv_relative_positions = ( - paddle.cast( - paddle.less_than(paddle.zeros_like(inv_relative_positions), inv_relative_positions), dtype=paddle.int32 - ) - * inv_relative_positions - ) + inv_relative_positions = paddle.cast( + paddle.less_than(paddle.zeros_like(inv_relative_positions), inv_relative_positions), dtype=paddle.int32 + ) * inv_relative_positions.astype(paddle.int32) max_exact = num_buckets // 2 is_small = paddle.less_than(inv_relative_positions, paddle.to_tensor(max_exact).cast(dtype=paddle.int32)) @@ -85,10 +82,9 @@ def compute_relative_buckets(num_buckets, max_distance, relative_positions, is_b ) / math.log(max_distance / max_exact) * (num_buckets - max_exact) val_if_large_num_buckets = paddle.ones_like(val_if_large) * (num_buckets - 1) val_if_large_lt = paddle.cast(paddle.less_than(val_if_large, val_if_large_num_buckets), dtype=paddle.int32) - val_if_large = ( - paddle.cast(val_if_large_lt * val_if_large, dtype=paddle.int32) - + (1 - val_if_large_lt) * val_if_large_num_buckets - ) + val_if_large = val_if_large_lt * val_if_large.astype(val_if_large_lt.dtype) + ( + 1 - val_if_large_lt + ) * val_if_large_num_buckets.astype(val_if_large_lt.dtype) rel_positions_bucket = rel_positions_bucket + paddle.where( is_small, paddle.cast(inv_relative_positions, dtype=paddle.int32), val_if_large ) diff --git a/paddlenlp/transformers/rembert/modeling.py b/paddlenlp/transformers/rembert/modeling.py index 7fa30229e316..c4697253e7ff 100644 --- a/paddlenlp/transformers/rembert/modeling.py +++ b/paddlenlp/transformers/rembert/modeling.py @@ -150,7 +150,7 @@ def forward(self, hidden_states, attention_mask=None): attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in RemBertModel forward() function) - attention_scores = attention_scores + attention_mask + attention_scores = attention_scores + attention_mask.astype(attention_scores.dtype) # Normalize the attention scores to probabilities. attention_probs = F.softmax(attention_scores, axis=-1) From 8879f79f9857dc7831403064631ae32b0a0def23 Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Thu, 23 May 2024 21:09:14 +0800 Subject: [PATCH 18/27] quick fix from pretrained. (#8487) --- paddlenlp/transformers/model_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddlenlp/transformers/model_utils.py b/paddlenlp/transformers/model_utils.py index dc1c753206c4..9c9af9bbc694 100644 --- a/paddlenlp/transformers/model_utils.py +++ b/paddlenlp/transformers/model_utils.py @@ -798,7 +798,7 @@ def _load_state_dict_into_meta_model( dtype = convert_np_dtype_to_dtype_(dtype) error_msgs = [] - + model_state_dict = model.state_dict() for param_name, param in state_dict.items(): # First part of the test is always true as loaded_state_dict_keys always contains state_dict keys. 
        if param_name not in loaded_state_dict_keys or param_name not in expected_keys:
@@ -833,7 +833,7 @@
             if old_param is not None:
                 param = param.astype(dtype=old_param.dtype)
             with paddle.no_grad():
-                model.state_dict()[param_name].get_tensor()._share_data_with(param.value().get_tensor())
+                model_state_dict[param_name].get_tensor()._share_data_with(param.value().get_tensor())
                 param.value().get_tensor()._clear()
     return error_msgs
 
@@ -1890,7 +1890,7 @@ def _find_mismatched_keys(
                 if (
                     shard_file.endswith(".safetensors")
                     and config.tensor_parallel_degree > 1
-                    and "tp" not in shard_file
+                    and "tp" not in os.path.split(shard_file)[-1]
                 ):
                     pre_tensor_parallel_split = True
                     assert loaded_keys is not None, "loaded_keys is not None."

From bbf945b64ab611e491c429ef86887cf84f43d3a5 Mon Sep 17 00:00:00 2001
From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com>
Date: Fri, 24 May 2024 13:43:46 +0800
Subject: [PATCH 19/27] Release/2.8 (#8437)

* [XPU] llama add xpu support (#8282)
* [XPU] llama add xpu support
* fix
* use try import
* fix
* refine
* refine
* refine
* refine
* update (#8399)
* [LLM] Support fuse attention q, k, v weights (#8202)

1. add use-interface & fuse action
1.1. modify 1., code order
2. switch to name_mapping
3. solve tp branch
3.2 follow hui, handle qkv separately
3.3 handle pdparams
3.4 from torch
3.5 abandon low_cpu_mem_usage
3.6 solve shard branch

* 3.6.1 solve shard branch after rebase develop
* code clean
* remove debug comment
* Redefine fuse and split functions
* Redefine fuse and split functions
* comment and fix
* update method
* update QKV fuse and split
* support fuse weights in multi-files
* add precision compare
* simplify function call
* support use_fast_ffn
* clean modeling and configuration
* add test for gpt and opt
* fix tp_actions get
* add fast_ffn test
* add Qwen2Moe
* Revert "add Qwen2Moe"

This reverts commit 113b8838a7c53f1d131928c30bf1071dfa583445.
* add test for split * update doc * update filter_dict_keys --------- Co-authored-by: Zii * [LLM] Fix fuse or split with same key (#8378) * fix fuse or split with same key * fix * fix eps * update format * [LLM] add decay steps option for finetuning (#8251) * [LLM] add memory stats to logger of trainer (#8269) * [Distributed] fix lora (#8325) * [LLM] fix lora target modules on llama (#8372) * [Distributed] metric calculation supports tp logits (#8370) * Update model_utils.py * Update model_utils.py * Update model_utils.py --------- Co-authored-by: Jianbang Yang Co-authored-by: DrownFish19 Co-authored-by: Zii Co-authored-by: Tian <121000916+SylarTiaNII@users.noreply.github.com> --- llm/finetune_generation.py | 6 +- llm/run_pretrain.py | 11 + llm/utils.py | 9 + paddlenlp/peft/lora/lora_layers.py | 2 +- paddlenlp/trainer/trainer.py | 18 +- paddlenlp/trainer/training_args.py | 4 + paddlenlp/transformers/conversion_utils.py | 247 ++++++++++++++++++ paddlenlp/transformers/gpt/modeling.py | 43 ++++ paddlenlp/transformers/gpt/modeling_pp.py | 1 + paddlenlp/transformers/linear_utils.py | 59 +++++ paddlenlp/transformers/llama/modeling.py | 155 ++++++++--- paddlenlp/transformers/llama/modeling_pp.py | 1 + paddlenlp/transformers/model_utils.py | 71 ++++- paddlenlp/transformers/opt/configuration.py | 5 + paddlenlp/transformers/opt/modeling.py | 43 ++++ tests/transformers/test_conversion_common.py | 258 +++++++++++++++++++ 16 files changed, 886 insertions(+), 47 deletions(-) create mode 100644 paddlenlp/transformers/linear_utils.py create mode 100644 tests/transformers/test_conversion_common.py diff --git a/llm/finetune_generation.py b/llm/finetune_generation.py index df7a22a0cb95..c8fed17165af 100644 --- a/llm/finetune_generation.py +++ b/llm/finetune_generation.py @@ -140,7 +140,7 @@ def main(): if not training_args.autotuner_benchmark: model = AutoModelForCausalLMPipe.from_pretrained( model_args.model_name_or_path, - tensor_parallel_output=False, + tensor_parallel_output=training_args.tensor_parallel_output, tensor_parallel_degree=training_args.tensor_parallel_degree, tensor_parallel_rank=training_args.tensor_parallel_rank, use_flash_attention=model_args.use_flash_attention, @@ -152,7 +152,7 @@ def main(): # NOTE(gongenlei): new add autotuner_benchmark model_config = AutoConfig.from_pretrained( model_args.model_name_or_path, - tensor_parallel_output=False, + tensor_parallel_output=training_args.tensor_parallel_output, tensor_parallel_degree=training_args.tensor_parallel_degree, tensor_parallel_rank=training_args.tensor_parallel_rank, dtype=dtype, @@ -163,7 +163,7 @@ def main(): else: model_config = AutoConfig.from_pretrained( model_args.model_name_or_path, - tensor_parallel_output=False, + tensor_parallel_output=training_args.tensor_parallel_output, tensor_parallel_degree=training_args.tensor_parallel_degree, tensor_parallel_rank=training_args.tensor_parallel_rank, dtype=dtype, diff --git a/llm/run_pretrain.py b/llm/run_pretrain.py index d0df32321e18..7196f52eea6d 100644 --- a/llm/run_pretrain.py +++ b/llm/run_pretrain.py @@ -46,6 +46,7 @@ ) from paddlenlp.utils.batch_sampler import DistributedBatchSampler from paddlenlp.utils.log import logger +from paddlenlp.utils.tools import get_env_device def add_start_docstrings(*docstr): @@ -483,6 +484,16 @@ def main(): config.num_attention_heads % config.sep_parallel_degree == 0 ), f"num_attention_heads:{config.num_attention_heads} must be divisible by sep_parallel_degree {config.sep_parallel_degree}" + if get_env_device() == "xpu" and 
training_args.gradient_accumulation_steps > 1: + try: + from paddle_xpu.layers.nn.linear import LinearConfig # noqa: F401 + + LinearConfig.enable_accumulate_steps_opt() + LinearConfig.set_accumulate_steps(training_args.gradient_accumulation_steps) + except ImportError: + # It's OK, not use accumulate_steps optimization + pass + print("Final pre-training config:", config) # Set the dtype for loading model diff --git a/llm/utils.py b/llm/utils.py index 8bcc52ae33ab..6688357bd67b 100644 --- a/llm/utils.py +++ b/llm/utils.py @@ -125,9 +125,11 @@ def get_lora_target_modules(model): ".*v_proj.*", ".*k_proj.*", ".*o_proj.*", + ".*qkv_proj.*", ".*gate_proj.*", ".*down_proj.*", ".*up_proj.*", + ".*gate_up_fused_proj.*", ] elif model.base_model_prefix == "opt": target_modules = [ @@ -209,6 +211,13 @@ def prediction_step( # keepdim in order to maintain the same shape as logits if isinstance(logits, (list, tuple)): logits = logits[0] + # all gather logits when enabling tensor_parallel_output + if self.args.tensor_parallel_degree > 1 and self.args.tensor_parallel_output: + hcg = fleet.get_hybrid_communicate_group() + model_parallel_group = hcg.get_model_parallel_group() + gathered_logits = [] + dist.all_gather(gathered_logits, logits, group=model_parallel_group) + logits = paddle.concat(gathered_logits, axis=-1) return (loss, logits.argmax(axis=-1, keepdim=True), labels) loss = None diff --git a/paddlenlp/peft/lora/lora_layers.py b/paddlenlp/peft/lora/lora_layers.py index 7ac40ed0ba66..73120060fe87 100644 --- a/paddlenlp/peft/lora/lora_layers.py +++ b/paddlenlp/peft/lora/lora_layers.py @@ -539,7 +539,7 @@ def forward(self, input: paddle.Tensor): result_mp = F.linear(x=input_mp, weight=self.weight, bias=self.bias, name=self.name) else: res_mp = MC2ColumnParallelCoreLinear.apply(input, self.weight, self.model_parallel_group) - result_mp = res_mp + self.bias + result_mp = (res_mp + self.bias) if self.bias is not None else res_mp if not self.merged: input_a = self.lora_dropout(input) @ self.lora_A diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py index 746b7e252516..f507b5c8b92f 100644 --- a/paddlenlp/trainer/trainer.py +++ b/paddlenlp/trainer/trainer.py @@ -39,6 +39,8 @@ import paddle.distributed as dist import paddle.nn as nn from packaging import version +from paddle import framework +from paddle.base import core from paddle.distributed import fleet from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.hybrid_parallel_optimizer import ( HybridParallelOptimizer, @@ -1257,6 +1259,20 @@ def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_for_eval, logs["learning_rate"] = float("{0:.3e}".format(self._get_learning_rate())) logs["global_step"] = int(self.state.global_step) + divisor = 2**30 + # TODO(@gexiao): replace these codes with unified APIs in Paddle + current_device = framework._current_expected_place_() + if str(current_device) != "Place(cpu)": + device_id = current_device.get_device_id() + current_memory_allocated = core.device_memory_stat_current_value("Allocated", device_id) + current_memory_reserved = core.device_memory_stat_current_value("Reserved", device_id) + max_memory_allocated = core.device_memory_stat_peak_value("Allocated", device_id) + max_memory_reserved = core.device_memory_stat_peak_value("Reserved", device_id) + logs["current_memory_allocated"] = current_memory_allocated / divisor + logs["current_memory_reserved"] = current_memory_reserved / divisor + logs["max_memory_allocated"] = max_memory_allocated / divisor + 
logs["max_memory_reserved"] = max_memory_reserved / divisor + total_train_batch_size = ( self.args.train_batch_size * self.args.gradient_accumulation_steps * self.args.dataset_world_size ) @@ -1614,8 +1630,6 @@ def _load_rng_state(self, checkpoint): random.setstate(checkpoint_rng_state["python"]) np.random.set_state(checkpoint_rng_state["numpy"]) - core = paddle.framework.core - core.default_cpu_generator().set_state(checkpoint_rng_state["cpu"]) if core.is_compiled_with_cuda(): if not len(checkpoint_rng_state["cuda"]) == core.get_cuda_device_count(): diff --git a/paddlenlp/trainer/training_args.py b/paddlenlp/trainer/training_args.py index 2ed9d343ceaa..3118178608d2 100644 --- a/paddlenlp/trainer/training_args.py +++ b/paddlenlp/trainer/training_args.py @@ -787,6 +787,10 @@ class TrainingArguments: default=False, metadata={"help": "whether to run distributed training in auto parallel mode"}, ) + tensor_parallel_output: Optional[bool] = field( + default=False, + metadata={"help": "whether to output logits in distributed status"}, + ) def __post_init__(self): env_local_rank = int(os.environ.get("PADDLE_RANK_IN_NODE", -1)) diff --git a/paddlenlp/transformers/conversion_utils.py b/paddlenlp/transformers/conversion_utils.py index ba5169454d0b..6ea6afaad80a 100644 --- a/paddlenlp/transformers/conversion_utils.py +++ b/paddlenlp/transformers/conversion_utils.py @@ -499,6 +499,118 @@ def splited_qkv_to_tensor_parallel_qkv(weight_list, num_attention_heads): return naive_merged_qkv_to_tensor_parallel_qkv(weight) +def fuse_param_func(): + def fn(fuse_params, is_qkv=False, num_heads=None, num_key_value_heads=None): + """fuse function for fusing weights + + (1) fuse_attention_qkv + q => [q1,q2,q3,q4] + k => [k1,k2,k3,k4] or [k1,k2] for GQA + v => [v1,v2,v3,v4] or [v1,v2] for GQA + fused weight => [q1,k1,v1,q2,k2,v2,q3,k3,v3,q4,k4,v4] + or for GQA [q1,q2,k1,v1,q3,q4,k2,v2] + (2) fuse_attention_ffn + directly fuse weights to 1 parts + [gate_weight], [up_weight] => [gate_weight, up_weight] + + Args: + fuse_params (_type_): to be fused weights + is_qkv (bool, optional): for attention qkv weights. Defaults to False. + num_heads (_type_, optional): query heads. Defaults to None. + num_key_value_heads (_type_, optional): key and value heads. Defaults to None. + + Returns: + _type_: fused weights + """ + concat_fn = np.concatenate + split_fn = np.split + if isinstance(fuse_params[0], paddle.Tensor): + concat_fn = paddle.concat + split_fn = paddle.split + + if is_qkv: + # fuse_attention_qkv + assert num_heads, f"num_heads should be number of heads for Q, but got {num_heads}" + assert ( + num_key_value_heads + ), f"num_key_value_heads should be number of key_value_heads for K and V, but got {num_key_value_heads}" + assert ( + len(fuse_params) == 3 + ), f"fuse_params length is not equal 3, it should be Q K V list. 
but got length {len(fuse_params)}" + num_query_groups = num_heads // num_key_value_heads + q_list = split_fn(fuse_params[0], num_heads, axis=-1) + k_list = split_fn(fuse_params[1], num_key_value_heads, axis=-1) + v_list = split_fn(fuse_params[2], num_key_value_heads, axis=-1) + + qkv_pairs = [] + for i in range(num_key_value_heads): + qkv_pairs += q_list[i * num_query_groups : (i + 1) * num_query_groups] + qkv_pairs.append(k_list[i]) + qkv_pairs.append(v_list[i]) + return concat_fn(qkv_pairs, axis=-1) + else: + # fuse_attention_ffn + return concat_fn(fuse_params, axis=-1) + + return fn + + +def split_param_func(): + def fn(fused_param, split_nums=2, is_qkv=False, num_heads=None, num_key_value_heads=None): + """split function for splitting weights + + (1) fuse_attention_qkv + fused weight => [q1,k1,v1,q2,k2,v2,q3,k3,v3,q4,k4,v4] + or for GQA [q1,q2,k1,v1,q3,q4,k2,v2] + after split + q => [q1,q2,q3,q4] + k => [k1,k2,k3,k4] or [k1,k2] for GQA + v => [v1,v2,v3,v4] or [v1,v2] for GQA + (2) fuse_attention_ffn + directly split weight to 2 parts + [gate_weight, up_weight] => [gate_weight], [up_weight] + + Args: + fused_param (_type_): len(fused_param)=1, only one weight to be splitted + split_nums (int, optional): split_nums. Defaults to 2. + is_qkv (bool, optional): for attention qkv weights. Defaults to False. + num_heads (_type_, optional): query heads. Defaults to None. + num_key_value_heads (_type_, optional): key and value heads. Defaults to None. + + Returns: + _type_: splitted weights + """ + concat_fn = np.concatenate + split_fn = np.split + if isinstance(fused_param, paddle.Tensor): + concat_fn = paddle.concat + split_fn = paddle.split + + if is_qkv: + # fuse_attention_qkv + assert num_heads, f"num_heads should be number of heads for Q, but got {num_heads}" + assert ( + num_key_value_heads + ), f"num_key_value_heads should be number of key_value_heads for K and V, but got {num_key_value_heads}" + num_query_groups = num_heads // num_key_value_heads + q_list, k_list, v_list = [], [], [] + split_heads = split_fn(fused_param, num_heads + 2 * num_key_value_heads, axis=-1) + for i in range(num_key_value_heads): + q_list += split_heads[i * (num_query_groups + 2) : (i + 1) * (num_query_groups + 2) - 2] + k_list.append(split_heads[(i + 1) * (num_query_groups + 2) - 2]) + v_list.append(split_heads[(i + 1) * (num_query_groups + 2) - 1]) + return concat_fn(q_list, axis=-1), concat_fn(k_list, axis=-1), concat_fn(v_list, axis=-1) + else: + # fuse_attention_ffn + return split_fn(fused_param, split_nums, axis=-1) + + return fn + + +def split_or_fuse_func(is_fuse=True): + return fuse_param_func() if is_fuse else split_param_func() + + def get_tensor_parallel_merge_func(tensor_parallel_degree, tensor_parallel_rank, num_attention_heads=None): def fn( x, @@ -1110,6 +1222,7 @@ def convert_tensor_parallel( weight_file (str | None): the weight file path of `model_state.pdparams` file config (PretrainedConfig): the PretrainedConfig instance of model """ + name_action_mappings = cls._get_tensor_parallel_mappings(config) if state_dict is None: with device_guard("cpu"): @@ -1211,6 +1324,140 @@ def _resolve_prefix_keys(state_keys_base, state_keys_real, ignore_error=False): return state_keys_map + @classmethod + def convert_fuse_and_split(cls, config: PretrainedConfig, state_dict, tp_actions=None): + loaded_keys = state_dict.keys() + # collect and convert fuse/split action + fused_and_split_keys = [] + convert_with_same_keys = [] + fuse_actions, resume_keys = cls.get_fuse_or_split_param_convert_actions(config, 
loaded_keys, is_fuse=True) + for keys, action in fuse_actions.items(): + if keys[-1] in keys[:-1]: + assert len(keys) == 2, "only 2 keys can be converted with the same name" + convert_with_same_keys.append(keys[-1]) + origin_states = [state_dict.pop(key) for key in keys[:-1]] + state_dict[keys[-1]] = action(origin_states) + fused_and_split_keys.append(keys[-1]) + logger.debug(f"Fusing parameter: {keys[:-1]} into {keys[-1]}") + + split_actions, _ = cls.get_fuse_or_split_param_convert_actions(config, loaded_keys, is_fuse=False) + for keys, action in split_actions.items(): + if keys[-1] in keys[:-1]: + assert len(keys) == 2, "only 2 keys can be converted with the same name" + convert_with_same_keys.append(keys[-1]) + origin_state = state_dict.pop(keys[-1]) + split_states = action(origin_state) + for key_idx, key in enumerate(keys[:-1]): + state_dict[key] = split_states[key_idx] + fused_and_split_keys.append(key) + logger.debug(f"Splitting parameter: {keys[-1]} into {keys[:-1]}") + + if tp_actions is not None: + for key in fused_and_split_keys: + if key in convert_with_same_keys: + continue + + for name in tp_actions.keys(): + if key.endswith(name): + with device_guard(): + state_dict[key] = paddle.Tensor(tp_actions[name](state_dict.pop(key)), zero_copy=True) + break + + # when shard file split the weight as follows, some weights need to be resumed for next shard file + # shard-001-file: q_weight, k_weight + # shard_002-file: v_weight + resume_state_dict = {k: state_dict[k] for k in resume_keys if k in state_dict} + return state_dict, resume_state_dict + + @classmethod + def get_fuse_or_split_param_convert_actions( + cls, + config: PretrainedConfig, + loaded_state_dict_keys, + is_fuse=True, + ignore_error=False, + ): + name_action_mappings = cls._get_fuse_or_split_param_mappings(config, is_fuse) + state_keys_map = cls._resolve_prefix_keys_for_fuse_and_split( + name_action_mappings.keys(), loaded_state_dict_keys, ignore_error, is_fuse + ) + for k, v in state_keys_map.items(): + name_action_mappings[v] = name_action_mappings.pop(k) + + # filter name_action_mappings with corresponding weights + # fusing: verify all of the keys in name_action_mappings are in loaded_state_dict_keys + # splitting: verify the last key in name_action_mappings is in loaded_state_dict_keys + filter_name_action = {} + resume_keys = [] + if is_fuse: + for k, v in name_action_mappings.items(): + cond = True + if not all(item in loaded_state_dict_keys for item in k[:-1]): + # resume keys for next fuse + resume_keys += k[:-1] + cond = False + if cond: + filter_name_action[k] = v + else: + for k, v in name_action_mappings.items(): + if k[-1] in loaded_state_dict_keys: + filter_name_action[k] = v + + return filter_name_action, resume_keys + + @classmethod + def _get_fuse_or_split_param_mappings(cls, config: PretrainedConfig, is_fuse=True) -> List[StateDictNameMapping]: + """get fused parameter mapping of PretrainedModel + + Args: + config (PretrainedConfig): the configuration of name-mapping + + Raises: + NotImplementedError: + + Returns: + List[StateDictNameMapping]: the name-mappings for tensor_parallel + """ + # raise NotImplementedError( + # f"`_get_fuse_or_split_param_mappings` is not implemented for {cls.__name__}`. 
To implement it, you should " + # f"overwrite this method in the class {cls.__name__} in `{cls.__module__}.py`" + # ) + return {} + + @staticmethod + def _resolve_prefix_keys_for_fuse_and_split(state_keys_base, state_keys_real, ignore_error=False, is_fuse=True): + state_keys_map = {} + + # use the tuple (x1,x2,x3,x4) as one key, and the prefix of x1,x2,x3 is used as a new key x4 or + # the last key x4 is used as new keys x1,x2,x3. And, the tuple also could be (a) (x1, x1) -> convert x1 to x1; + # (b) (x1,x2,x3) -> fuse x1 and x2 to x3; (c) (x1,x2,x3,x4) -> fuse x1, x2 and x3 to x4. + + # is_fuse: True -> fuse, False -> split + # True: (x1,x2,x3,x4) -> [x1,x2,x3] are exist in state_keys_real, x4 is not exist in state_keys_real + # False: (x1,x2,x3,x4) -> [x1,x2,x3] are not exist in state_keys_real, x4 is exist in state_keys_real + + for keys in state_keys_base: + prefix = "" + if is_fuse: + for x in state_keys_real: + for base_key in keys[:-1]: + if x.endswith(base_key): + prefix = x.replace(base_key, "") + break + if prefix != "": + break + else: + base_key = keys[-1] + for x in state_keys_real: + if x.endswith(base_key): + prefix = x.replace(base_key, "") + break + + new_keys = tuple([prefix + key for key in keys]) + state_keys_map[keys] = new_keys + + return state_keys_map + class Converter(ConversionMixin, LogitComparer): """some converters are implemented in ppdiffusers, so if remove it directly, it will make ppdiffusers down. diff --git a/paddlenlp/transformers/gpt/modeling.py b/paddlenlp/transformers/gpt/modeling.py index 8c066431979f..4a018347f80c 100644 --- a/paddlenlp/transformers/gpt/modeling.py +++ b/paddlenlp/transformers/gpt/modeling.py @@ -844,6 +844,49 @@ def get_tensor_parallel_split_mappings(num_layers): return mappings + @classmethod + def _get_fuse_or_split_param_mappings(cls, config: GPTConfig, is_fuse=False): + # return parameter fuse utils + from paddlenlp.transformers.conversion_utils import split_or_fuse_func + + fn = split_or_fuse_func(is_fuse=is_fuse) + + # last key is fused key, other keys are to be fused. 
+ fuse_qkv_keys = ( + "decoder.layers.0.self_attn.q_proj.weight", + "decoder.layers.0.self_attn.k_proj.weight", + "decoder.layers.0.self_attn.v_proj.weight", + "decoder.layers.0.self_attn.qkv_proj.weight", + ) + fuse_qkv_bias_keys = ( + "decoder.layers.0.self_attn.q_proj.bias", + "decoder.layers.0.self_attn.k_proj.bias", + "decoder.layers.0.self_attn.v_proj.bias", + "decoder.layers.0.self_attn.qkv_proj.bias", + ) + num_heads = config.num_attention_heads + num_key_value_heads = getattr(config, "num_key_value_heads", num_heads) + fuse_attention_qkv = getattr(config, "fuse_attention_qkv", False) + + final_actions = {} + if is_fuse: + if fuse_attention_qkv: + for i in range(config.num_hidden_layers): + for keys in [fuse_qkv_keys, fuse_qkv_bias_keys]: + new_keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in keys]) + final_actions[new_keys] = partial( + fn, is_qkv=True, num_heads=num_heads, num_key_value_heads=num_key_value_heads + ) + else: + if not fuse_attention_qkv: + for i in range(config.num_hidden_layers): + for keys in [fuse_qkv_keys, fuse_qkv_bias_keys]: + new_keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in keys]) + final_actions[new_keys] = partial( + fn, split_nums=3, is_qkv=True, num_heads=num_heads, num_key_value_heads=num_key_value_heads + ) + return final_actions + @classmethod def _get_name_mappings(cls, config: GPTConfig) -> list[StateDictNameMapping]: mappings: list[StateDictNameMapping] = [] diff --git a/paddlenlp/transformers/gpt/modeling_pp.py b/paddlenlp/transformers/gpt/modeling_pp.py index cd3dce018378..8b350e6556df 100644 --- a/paddlenlp/transformers/gpt/modeling_pp.py +++ b/paddlenlp/transformers/gpt/modeling_pp.py @@ -161,6 +161,7 @@ class GPTForCausalLMPipe(PipelinePretrainedModel, PipelineLayer): config_class = GPTConfig _get_tensor_parallel_mappings = GPTPretrainedModel._get_tensor_parallel_mappings + _get_fuse_or_split_param_mappings = GPTPretrainedModel._get_fuse_or_split_param_mappings _init_weights = GPTPretrainedModel._init_weights pretrained_init_configuration = GPTPretrainedModel.pretrained_init_configuration diff --git a/paddlenlp/transformers/linear_utils.py b/paddlenlp/transformers/linear_utils.py new file mode 100644 index 000000000000..de1a0f886b79 --- /dev/null +++ b/paddlenlp/transformers/linear_utils.py @@ -0,0 +1,59 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +This file is used for replacing Paddle's native Linear implementations with vendors' customized implementations +""" + +import paddle.distributed.fleet.meta_parallel as mpu +from paddle import nn +from paddle.distributed.fleet.utils import sequence_parallel_utils + +from paddlenlp.transformers.mc2_parallel_linear import ( + MC2ColumnSeqParallelLinear, + MC2RowSeqParallelLinear, +) +from paddlenlp.utils.tools import get_env_device + +Linear = nn.Linear +ColumnParallelLinear = mpu.ColumnParallelLinear +RowParallelLinear = mpu.RowParallelLinear +ColumnSequenceParallelLinear = sequence_parallel_utils.ColumnSequenceParallelLinear +RowSequenceParallelLinear = sequence_parallel_utils.RowSequenceParallelLinear + +if get_env_device() == "npu": + if MC2ColumnSeqParallelLinear is not None and MC2RowSeqParallelLinear is not None: + ColumnSequenceParallelLinear = MC2ColumnSeqParallelLinear + RowSequenceParallelLinear = MC2RowSeqParallelLinear +elif get_env_device() == "xpu": + try: + from paddle_xpu.layers.nn import ColumnParallelLinear as XPUColumnParallelLinear + from paddle_xpu.layers.nn import Linear as XPULinear + from paddle_xpu.layers.nn import RowParallelLinear as XPURowParallelLinear + from paddle_xpu.layers.nn.sequence_parallel import ( + XPUColumnSequenceParallelLinear, + XPURowSequenceParallelLinear, + ) + + Linear = XPULinear + ColumnParallelLinear = XPUColumnParallelLinear + RowParallelLinear = XPURowParallelLinear + ColumnSequenceParallelLinear = XPUColumnSequenceParallelLinear + RowSequenceParallelLinear = XPURowSequenceParallelLinear + except ImportError: + # If paddle_xpu is not installed, just use Paddle's native Linear implementations + pass +else: + # By default, use Paddle's native Linear implementations + pass diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index 38f1d244bdf2..97cf780e2447 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -62,10 +62,6 @@ def swiglu(x, y=None): init_name_mappings, ) from paddlenlp.transformers.long_sequence_strategies import LongSequenceStrategies -from paddlenlp.transformers.mc2_parallel_linear import ( - MC2ColumnSeqParallelLinear, - MC2RowSeqParallelLinear, -) from paddlenlp.transformers.model_outputs import ( BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions, @@ -74,6 +70,8 @@ def swiglu(x, y=None): from paddlenlp.utils.log import logger from paddlenlp.utils.tools import get_env_device +from .. import linear_utils +from ..linear_utils import Linear from ..segment_parallel_utils import ReshardLayer from .configuration import ( LLAMA_PRETRAINED_INIT_CONFIGURATION, @@ -211,6 +209,7 @@ def scaled_dot_product_attention( alibi=None, sequence_parallel=False, reshard_layer=None, + npu_is_casual=False, ): bsz, q_len, num_heads, head_dim = query_states.shape _, kv_seq_len, _, _ = value_states.shape @@ -410,6 +409,15 @@ def forward(self, hidden_states): if self.config.use_fused_rms_norm: if get_env_device() == "npu": return core.eager._run_custom_op("rms_norm_npu", hidden_states, self.weight, self.variance_epsilon)[0] + elif get_env_device() == "xpu": + try: + import paddle_xpu_nn # noqa: F821 + + return paddle_xpu_nn.xpu_rms_norm(hidden_states, self.weight, self.variance_epsilon)[0] + except ImportError: + raise NotImplementedError( + f"Implementation of fused_rms_norm is not available on {get_env_device()}. 
Please install paddle_xpu to use this feature" + ) return rms_norm_fused(hidden_states, self.weight, self.variance_epsilon) if paddle.in_dynamic_mode(): @@ -571,15 +579,11 @@ def __init__(self, config): self.fuse_attention_ffn = config.fuse_attention_ffn if config.sequence_parallel: - if MC2ColumnSeqParallelLinear is not None and MC2RowSeqParallelLinear is not None: - ColumnParallelLinear = MC2ColumnSeqParallelLinear - RowParallelLinear = MC2RowSeqParallelLinear - else: - ColumnParallelLinear = ColumnSequenceParallelLinear - RowParallelLinear = RowSequenceParallelLinear + ColumnParallelLinear = linear_utils.ColumnSequenceParallelLinear + RowParallelLinear = linear_utils.RowSequenceParallelLinear else: - ColumnParallelLinear = fleet.meta_parallel.ColumnParallelLinear - RowParallelLinear = fleet.meta_parallel.RowParallelLinear + ColumnParallelLinear = linear_utils.ColumnParallelLinear + RowParallelLinear = linear_utils.RowParallelLinear if config.tensor_parallel_degree > 1: if config.fuse_attention_ffn: @@ -611,15 +615,29 @@ def __init__(self, config): ) else: if config.fuse_attention_ffn: - self.gate_up_fused_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias_attr=False) + self.gate_up_fused_proj = Linear(self.hidden_size, self.intermediate_size * 2, bias_attr=False) else: - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) + self.gate_proj = Linear(self.hidden_size, self.intermediate_size, bias_attr=False) + self.up_proj = Linear(self.hidden_size, self.intermediate_size, bias_attr=False) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias_attr=False) + self.down_proj = Linear(self.intermediate_size, self.hidden_size, bias_attr=False) def forward(self, x): if self.fuse_attention_ffn: + # FIXME(yangjianbang): use paddle's native swiglu + if get_env_device() == "xpu": + try: + import paddle_xpu_nn # noqa: F821 + + out = self.gate_up_fused_proj(x) + out = paddle_xpu_nn.xpu_swiglu(out, axis=-1, turn=True) + out = self.down_proj(out) + return out + except ImportError: + gate_out, up_out = paddle.chunk(self.gate_up_fused_proj(x), chunks=2, axis=-1) + out = self.down_proj(F.silu(gate_out) * up_out) + return out + x = swiglu(self.gate_up_fused_proj(x)) else: x = swiglu(self.gate_proj(x), self.up_proj(x)) @@ -680,7 +698,7 @@ def __init__(self, config: LlamaConfig, layerwise_recompute: bool = False): ) self.use_fused_rope = config.use_fused_rope - if self.use_fused_rope and get_env_device() != "npu": + if self.use_fused_rope and get_env_device() not in ["npu", "xpu"]: if "gpu" not in paddle.device.get_device() or fused_rotary_position_embedding is None: warnings.warn( "Enable fuse rope in the config, but fuse rope is not available. 
" @@ -689,15 +707,11 @@ def __init__(self, config: LlamaConfig, layerwise_recompute: bool = False): self.use_fused_rope = False if config.sequence_parallel: - if MC2ColumnSeqParallelLinear is not None and MC2RowSeqParallelLinear is not None: - ColumnParallelLinear = MC2ColumnSeqParallelLinear - RowParallelLinear = MC2RowSeqParallelLinear - else: - ColumnParallelLinear = ColumnSequenceParallelLinear - RowParallelLinear = RowSequenceParallelLinear + ColumnParallelLinear = linear_utils.ColumnSequenceParallelLinear + RowParallelLinear = linear_utils.RowSequenceParallelLinear else: - ColumnParallelLinear = fleet.meta_parallel.ColumnParallelLinear - RowParallelLinear = fleet.meta_parallel.RowParallelLinear + ColumnParallelLinear = linear_utils.ColumnParallelLinear + RowParallelLinear = linear_utils.RowParallelLinear if config.tensor_parallel_degree > 1: if self.fuse_attention_qkv: @@ -728,12 +742,12 @@ def __init__(self, config: LlamaConfig, layerwise_recompute: bool = False): gather_output=False, ) else: - self.k_proj = nn.Linear( + self.k_proj = Linear( self.hidden_size, self.config.num_key_value_heads * self.head_dim, bias_attr=False, ) - self.v_proj = nn.Linear( + self.v_proj = Linear( self.hidden_size, self.config.num_key_value_heads * self.head_dim, bias_attr=False, @@ -741,23 +755,23 @@ def __init__(self, config: LlamaConfig, layerwise_recompute: bool = False): else: if self.fuse_attention_qkv: - self.qkv_proj = nn.Linear( + self.qkv_proj = Linear( self.hidden_size, self.hidden_size + 2 * self.config.num_key_value_heads * self.head_dim, bias_attr=False, ) else: - self.q_proj = nn.Linear( + self.q_proj = Linear( self.hidden_size, self.hidden_size, bias_attr=False, ) - self.k_proj = nn.Linear( + self.k_proj = Linear( self.hidden_size, self.config.num_key_value_heads * self.head_dim, bias_attr=False, ) - self.v_proj = nn.Linear( + self.v_proj = Linear( self.hidden_size, self.config.num_key_value_heads * self.head_dim, bias_attr=False, @@ -771,7 +785,7 @@ def __init__(self, config: LlamaConfig, layerwise_recompute: bool = False): input_is_parallel=True, ) else: - self.o_proj = nn.Linear( + self.o_proj = Linear( self.hidden_size, self.hidden_size, bias_attr=False, @@ -835,6 +849,7 @@ def forward( output_attentions: bool = False, use_cache: bool = False, alibi: Optional[paddle.Tensor] = None, + npu_is_casual: bool = False, ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: """Input shape: Batch x Time x Channel""" # [bs, seq_len, num_head * head_dim] -> [seq_len / n, bs, num_head * head_dim] (n is model parallelism) @@ -1062,6 +1077,7 @@ def forward( alibi, self.sequence_parallel, reshard_layer=self.reshard_layer, + npu_is_casual=npu_is_casual, ) if output_attentions: attn_output, attn_weights = outputs @@ -1114,6 +1130,7 @@ def forward( past_key_value: Optional[Tuple[paddle.Tensor]] = None, use_cache: Optional[bool] = False, alibi: Optional[paddle.Tensor] = None, + npu_is_casual: bool = False, ) -> Tuple[paddle.Tensor, Optional[Tuple[paddle.Tensor, paddle.Tensor]]]: """ Args: @@ -1161,6 +1178,7 @@ def forward( output_attentions, use_cache, alibi, + npu_is_casual=npu_is_casual, ) if type(outputs) is tuple: @@ -1293,6 +1311,56 @@ def get_tensor_parallel_split_mappings(num_layers): return mappings + @classmethod + def _get_fuse_or_split_param_mappings(cls, config: LlamaConfig, is_fuse=False): + # return parameter fuse utils + from paddlenlp.transformers.conversion_utils import split_or_fuse_func + + fn = split_or_fuse_func(is_fuse=is_fuse) + + # last key is fused 
key, other keys are to be fused. + fuse_qkv_keys = ( + "layers.0.self_attn.q_proj.weight", + "layers.0.self_attn.k_proj.weight", + "layers.0.self_attn.v_proj.weight", + "layers.0.self_attn.qkv_proj.weight", + ) + + fuse_gate_up_keys = ( + "layers.0.mlp.gate_proj.weight", + "layers.0.mlp.up_proj.weight", + "layers.0.mlp.gate_up_fused_proj.weight", + ) + num_heads = config.num_attention_heads + num_key_value_heads = getattr(config, "num_key_value_heads", num_heads) + fuse_attention_qkv = getattr(config, "fuse_attention_qkv", False) + fuse_attention_ffn = getattr(config, "fuse_attention_ffn", False) + + final_actions = {} + if is_fuse: + if fuse_attention_qkv: + for i in range(config.num_hidden_layers): + keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in fuse_qkv_keys]) + final_actions[keys] = partial( + fn, is_qkv=True, num_heads=num_heads, num_key_value_heads=num_key_value_heads + ) + if fuse_attention_ffn: + for i in range(config.num_hidden_layers): + keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in fuse_gate_up_keys]) + final_actions[keys] = fn + else: + if not fuse_attention_qkv: + for i in range(config.num_hidden_layers): + keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in fuse_qkv_keys]) + final_actions[keys] = partial( + fn, split_nums=3, is_qkv=True, num_heads=num_heads, num_key_value_heads=num_key_value_heads + ) + if not fuse_attention_ffn: + for i in range(config.num_hidden_layers): + keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in fuse_gate_up_keys]) + final_actions[keys] = partial(fn, split_nums=2) + return final_actions + def _init_weights(self, layer): """Initialization hook""" if self.config.tensor_parallel_degree > 1: @@ -1419,6 +1487,11 @@ def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values y = paddle.to_tensor(paddle.finfo(dtype).min, dtype="float16") expanded_attn_mask = expanded_attn_mask.astype("float16") expanded_attn_mask = paddle.where(expanded_attn_mask, x, y).astype(dtype) + elif get_env_device() == "xpu": + x = paddle.to_tensor(0.0, dtype=dtype) + y = paddle.to_tensor(paddle.finfo(dtype).min, dtype=dtype) + expanded_attn_mask = expanded_attn_mask.astype(dtype) + expanded_attn_mask = paddle.where(expanded_attn_mask, x, y).astype(dtype) else: expanded_attn_mask = paddle.where(expanded_attn_mask, 0.0, paddle.finfo(dtype).min).astype(dtype) return expanded_attn_mask @@ -1543,6 +1616,7 @@ def forward( attention_mask = self._prepare_decoder_attention_mask( attention_mask, (batch_size, seq_length), cache_length, inputs_embeds.dtype ) # [bs, 1, seq_len, seq_len] + is_casual = False if self.config.use_flash_attention: if get_env_device() != "npu": is_casual = is_casual_mask(attention_mask) @@ -1587,6 +1661,7 @@ def forward( past_key_value, use_cache, alibi=alibi, + npu_is_casual=is_casual, ) # NOTE: clear outdate cache after it has been used for memory saving @@ -1698,6 +1773,15 @@ def __init__(self, config: LlamaConfig): self.weight.is_distributed = True if (vocab_size != config.vocab_size) else False if self.weight.is_distributed: self.weight.split_axis = 1 + if get_env_device() == "xpu": + try: + from paddle_xpu.layers.nn import ( # noqa: F401 + parallel_matmul as xpu_parallel_matmul, + ) + + self.xpu_parallel_matmul = xpu_parallel_matmul() + except ImportError: + self.xpu_parallel_matmul = None def forward(self, hidden_states, tensor_parallel_output=None): if self.config.sequence_parallel: @@ -1711,7 +1795,12 @@ def forward(self, hidden_states, tensor_parallel_output=None): 
if tensor_parallel_output is None: tensor_parallel_output = self.config.tensor_parallel_output - logits = parallel_matmul(hidden_states, self.weight, tensor_parallel_output=tensor_parallel_output) + if get_env_device() == "xpu" and self.xpu_parallel_matmul is not None: + logits = self.xpu_parallel_matmul( + hidden_states, self.weight, tensor_parallel_output=tensor_parallel_output, training=self.training + ) + else: + logits = parallel_matmul(hidden_states, self.weight, tensor_parallel_output=tensor_parallel_output) return logits diff --git a/paddlenlp/transformers/llama/modeling_pp.py b/paddlenlp/transformers/llama/modeling_pp.py index 73600aa6b420..dd2a91814231 100644 --- a/paddlenlp/transformers/llama/modeling_pp.py +++ b/paddlenlp/transformers/llama/modeling_pp.py @@ -210,6 +210,7 @@ class LlamaForCausalLMPipe(PipelinePretrainedModel, PipelineLayer): config_class = LlamaConfig _get_tensor_parallel_mappings = LlamaPretrainedModel._get_tensor_parallel_mappings + _get_fuse_or_split_param_mappings = LlamaPretrainedModel._get_fuse_or_split_param_mappings _init_weights = LlamaPretrainedModel._init_weights _keys_to_ignore_on_load_unexpected = LlamaPretrainedModel._keys_to_ignore_on_load_unexpected diff --git a/paddlenlp/transformers/model_utils.py b/paddlenlp/transformers/model_utils.py index 9c9af9bbc694..722bde20ee70 100644 --- a/paddlenlp/transformers/model_utils.py +++ b/paddlenlp/transformers/model_utils.py @@ -108,9 +108,6 @@ def unwrap_optimizer(optimizer, optimizer_instances=()): if is_safetensors_available(): - - # from safetensors import safe_open - # from safetensors.numpy import load_file as safe_load_file from safetensors.numpy import save_file as safe_save_file from paddlenlp.utils.safetensors import fast_load_file as safe_load_file @@ -1841,6 +1838,25 @@ def _find_mismatched_keys( del state_dict[checkpoint_key] return mismatched_keys + def _fuse_or_split_keys( + state_dict, config, loaded_keys, pre_tensor_parallel_split=False, resume_state_dict=None + ): + if resume_state_dict is not None: + state_dict.update(resume_state_dict) + + before_fuse_keys = list(state_dict.keys()) + if pre_tensor_parallel_split: + tp_actions = cls.get_tensor_parallel_convert_actions(config, loaded_keys, ignore_error=True) + else: + tp_actions = None + state_dict, resume_state_dict = cls.convert_fuse_and_split(config, state_dict, tp_actions) + after_fuse_keys = list(state_dict.keys()) + + fused_keys = list(set(before_fuse_keys) - set(after_fuse_keys)) + new_keys = list(set(after_fuse_keys) - set(before_fuse_keys)) + + return state_dict, resume_state_dict, fused_keys, new_keys + if state_dict is not None: # DONT Hold tensor parallel here, only hold afer load state dict. 
# Whole checkpoint @@ -1850,6 +1866,16 @@ def _find_mismatched_keys( state_dict = ft_decoding.get_ft_para_conf().fit_partial_model(model_to_load, state_dict) + # have loaded all state_dict, no resume state_dict + state_dict, _, fused_keys, new_keys = _fuse_or_split_keys( + state_dict, + config, + loaded_keys, + pre_tensor_parallel_split=True if config.tensor_parallel_degree > 1 else False, + ) + missing_keys = list(set(missing_keys) - set(new_keys)) + unexpected_keys = list(set(unexpected_keys) - set(fused_keys)) + mismatched_keys = _find_mismatched_keys( state_dict, model_state_dict, @@ -1881,7 +1907,7 @@ def _find_mismatched_keys( error_msgs = [] mismatched_keys = [] - + resume_state_dict = {} if len(resolved_archive_file) > 1: resolved_archive_file = tqdm(resolved_archive_file, desc="Loading checkpoint shards") @@ -1894,13 +1920,42 @@ def _find_mismatched_keys( ): pre_tensor_parallel_split = True assert loaded_keys is not None, "loaded_keys is not None." - tp_actions = cls.get_tensor_parallel_convert_actions(config, loaded_keys) + tp_actions = cls.get_tensor_parallel_convert_actions(config, loaded_keys, ignore_error=True) # Here we use expected_keys to optimize weights loading for pipeline model. Only works for safetensors + filter_dict_keys = set(expected_keys) + fuse_actions, _ = cls.get_fuse_or_split_param_convert_actions(config, loaded_keys, is_fuse=True) + split_actions, _ = cls.get_fuse_or_split_param_convert_actions(config, loaded_keys, is_fuse=False) + for k in list(fuse_actions.keys()): + need_add_except_key = k[-1] in expected_keys + if need_add_except_key: + filter_dict_keys |= set(k[:-1]) + for k in list(split_actions.keys()): + need_add_except_key = False + for item in k[:-1]: + if item in expected_keys: + need_add_except_key = True + break + if need_add_except_key: + filter_dict_keys.add(k[-1]) + + if config.quantization_config.is_weight_quantize(): + filter_dict_keys = None + state_dict = load_state_dict( - shard_file, - tp_actions if pre_tensor_parallel_split else None, - None if config.quantization_config.is_weight_quantize() else set(expected_keys), + shard_file, tp_actions if pre_tensor_parallel_split else None, filter_dict_keys + ) + + # convert for fusing or splitting weights + state_dict, resume_state_dict, fused_keys, new_keys = _fuse_or_split_keys( + state_dict, + config, + loaded_keys, + pre_tensor_parallel_split=pre_tensor_parallel_split, + resume_state_dict=resume_state_dict, ) + missing_keys = list(set(missing_keys) - set(new_keys)) + unexpected_keys = list(set(unexpected_keys) - set(fused_keys)) + if config.quantization_config.is_weight_quantize(): state_dict = convert_to_quantize_state_dict( state_dict, diff --git a/paddlenlp/transformers/opt/configuration.py b/paddlenlp/transformers/opt/configuration.py index 866da043198e..3f6f23c1c65d 100644 --- a/paddlenlp/transformers/opt/configuration.py +++ b/paddlenlp/transformers/opt/configuration.py @@ -146,6 +146,8 @@ def __init__( eos_token_id=2, enable_bias: bool = True, mp_degree: int = 1, + fuse_attention_qkv=False, + fuse_attention_ffn=False, **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) @@ -165,3 +167,6 @@ def __init__( self.enable_bias = enable_bias self.mp_degree = mp_degree + + self.fuse_attention_qkv = fuse_attention_qkv + self.fuse_attention_ffn = fuse_attention_ffn diff --git a/paddlenlp/transformers/opt/modeling.py b/paddlenlp/transformers/opt/modeling.py index c9217f316415..bf1cec55eb16 100644 --- 
a/paddlenlp/transformers/opt/modeling.py +++ b/paddlenlp/transformers/opt/modeling.py @@ -649,6 +649,49 @@ def _get_tensor_parallel_mappings(cls, config: OPTConfig, is_split=True): return actions + @classmethod + def _get_fuse_or_split_param_mappings(cls, config: OPTConfig, is_fuse=False): + # return parameter fuse utils + from paddlenlp.transformers.conversion_utils import split_or_fuse_func + + fn = split_or_fuse_func(is_fuse=is_fuse) + + # last key is fused key, other keys are to be fused. + fuse_qkv_keys = ( + "decoder.layers.0.self_attn.q_proj.weight", + "decoder.layers.0.self_attn.k_proj.weight", + "decoder.layers.0.self_attn.v_proj.weight", + "decoder.layers.0.self_attn.qkv_proj.weight", + ) + fuse_qkv_bias_keys = ( + "decoder.layers.0.self_attn.q_proj.bias", + "decoder.layers.0.self_attn.k_proj.bias", + "decoder.layers.0.self_attn.v_proj.bias", + "decoder.layers.0.self_attn.qkv_proj.bias", + ) + num_heads = config.num_attention_heads + num_key_value_heads = getattr(config, "num_key_value_heads", num_heads) + fuse_attention_qkv = getattr(config, "fuse_attention_qkv", False) + + final_actions = {} + if is_fuse: + if fuse_attention_qkv: + for i in range(config.num_hidden_layers): + for keys in [fuse_qkv_keys, fuse_qkv_bias_keys]: + new_keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in keys]) + final_actions[new_keys] = partial( + fn, is_qkv=True, num_heads=num_heads, num_key_value_heads=num_key_value_heads + ) + else: + if not fuse_attention_qkv: + for i in range(config.num_hidden_layers): + for keys in [fuse_qkv_keys, fuse_qkv_bias_keys]: + new_keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in keys]) + final_actions[new_keys] = partial( + fn, split_nums=3, is_qkv=True, num_heads=num_heads, num_key_value_heads=num_key_value_heads + ) + return final_actions + @classmethod def _get_name_mappings(cls, config: OPTConfig) -> list[StateDictNameMapping]: mappings: list[StateDictNameMapping] = [] diff --git a/tests/transformers/test_conversion_common.py b/tests/transformers/test_conversion_common.py new file mode 100644 index 000000000000..d04929a7c7dd --- /dev/null +++ b/tests/transformers/test_conversion_common.py @@ -0,0 +1,258 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import copy +import glob +import os +import tempfile +import unittest + +import paddle + +input_ids = paddle.to_tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) + + +def prepare_default_config(config): + config = copy.deepcopy(config) + config.hidden_size = 512 + config.num_layers = 2 + config.num_hidden_layers = 2 + config.num_attention_heads = 16 + config.num_key_value_heads = 16 + config.intermediate_size = config.hidden_size + config.word_embed_proj_dim = 512 + return config + + +def prepare_split_config(config): + config = prepare_default_config(config) + config = copy.deepcopy(config) + config.fuse_attention_qkv = False + config.fuse_attention_ffn = False + return config + + +def prepare_fuse_config(config): + config = prepare_default_config(config) + config = copy.deepcopy(config) + config.fuse_attention_qkv = True + config.fuse_attention_ffn = True + return config + + +def common_test_load(model_class, model_first, config_second, tempdir): + model_first.eval() + with paddle.no_grad(): + first = model_first(input_ids)[0] + + model_second = model_class.from_pretrained(tempdir, config=config_second) + model_second.eval() + with paddle.no_grad(): + second = model_second(input_ids)[0] + + assert paddle.allclose(paddle.mean(first), paddle.mean(second), atol=1e-5) + # assert paddle.allclose(first, second, atol=1e-4) + + files = glob.glob(tempdir + "/*") + for f in files: + os.remove(f) + + +def common_test_save_and_load(config_first, config_second, model_class): + model_first = model_class.from_config(config_first) + + with tempfile.TemporaryDirectory() as tempdir: + # test load pdparams: model.pdparams + model_first.save_pretrained(save_dir=tempdir) + common_test_load(model_class, model_first, config_second, tempdir) + + # test load shard pdparams: model-001-0f-008.pdparams + model_first.save_pretrained(tempdir, max_shard_size="5MB") + common_test_load(model_class, model_first, config_second, tempdir) + + # test save safetensors: model.safetensors + model_first.save_pretrained(tempdir, safe_serialization=True) + common_test_load(model_class, model_first, config_second, tempdir) + + # test load shard safetensors: model-001-0f-008.safetensors + model_first.save_pretrained(tempdir, max_shard_size="5MB", safe_serialization=True) + common_test_load(model_class, model_first, config_second, tempdir) + + +def _test_split_to_fuse(config_class, model_class): + config = config_class() + + config_split = prepare_split_config(config) + config_fuse = prepare_fuse_config(config) + + # Test from splitted weights to fused weight + common_test_save_and_load(config_split, config_fuse, model_class) + + +def _test_fuse_to_split(config_class, model_class): + config = config_class() + + config_split = prepare_split_config(config) + config_fuse = prepare_fuse_config(config) + + # Test from fused weight to splitted weights + common_test_save_and_load(config_fuse, config_split, model_class) + + +def _test_fast_ffn(): + from functools import partial + + import paddle + from paddle import nn + + from paddlenlp.transformers import PretrainedModel + from paddlenlp.transformers.configuration_utils import PretrainedConfig + + class TestConfig(PretrainedConfig): + def __init__(self, fast_ffn_state=False, convert_fast_ffn=False): + self.fast_ffn_state = fast_ffn_state + self.convert_fast_ffn = convert_fast_ffn + super().__init__() + + class TestMLP(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.gate_up_fused_proj = 
nn.Linear(self.hidden_size, self.hidden_size * 2, bias_attr=True) + + def forward(self, hidden_state): + hidden_state = self.gate_up_fused_proj(hidden_state) + if self.config.use_fast_ffn: + x, y = paddle.chunk(hidden_state, chunks=2, axis=-1) + else: + x, y = hidden_state[..., ::2], hidden_state[..., 1::2] + + return nn.functional.silu(x) * y + + class TestPretrainedModel(PretrainedModel): + config_class = TestConfig + + @classmethod + def _get_fuse_or_split_param_mappings(cls, config: TestConfig, is_fuse=False): + + # user defined function to get convert param mappings + def convert_fast_ffn_fn(fuse_params, convert_fast_ffn=False): + import numpy as np + + concat_fn = np.concatenate + if isinstance(fuse_params[0], paddle.Tensor): + concat_fn = paddle.concat + + if convert_fast_ffn: + # fast_ffn + first = fuse_params[0][..., ::2] + second = fuse_params[0][..., 1::2] + return concat_fn([first, second], axis=-1) + + fn = convert_fast_ffn_fn + + convert_fast_ffn_keys = ( + "layers.0.gate_up_fused_proj.weight", + "layers.0.gate_up_fused_proj.weight", + ) + convert_fast_ffn_bias_keys = ( + "layers.0.gate_up_fused_proj.bias", + "layers.0.gate_up_fused_proj.bias", + ) + fast_ffn_state = getattr(config, "fast_ffn_state", False) + convert_fast_ffn = getattr(config, "convert_fast_ffn", False) + convert_fast_ffn &= not fast_ffn_state + + final_actions = {} + if is_fuse: + # for_get_fuse_or_split_param_mappings, is_fuse have two conditions, true and false, + # to fit partial fuse or split conditions, is_fuse will called twice(True and False). + # thus, for this func, we only use one condition. + + # use_fast_ffn only in one condition + # convert when use_fast_ffn is False + if convert_fast_ffn: + for i in range(config.num_hidden_layers): + for keys in [convert_fast_ffn_keys, convert_fast_ffn_bias_keys]: + keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in keys]) + final_actions[keys] = partial(fn, convert_fast_ffn=convert_fast_ffn) + return final_actions + + def _init_weights(self, layer): + if isinstance(layer, (nn.Linear, nn.Embedding)): + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value(paddle.tensor.normal(mean=0.0, std=1.0, shape=layer.weight.shape)) + if hasattr(layer, "bias") and isinstance(layer.bias, paddle.Tensor): + layer.bias.set_value(paddle.tensor.normal(mean=0.0, std=1.0, shape=layer.bias.shape)) + + class TestModel(TestPretrainedModel): + def __init__(self, config): + super().__init__(config) + self.layers = nn.LayerList([TestMLP(config=config) for i in range(config.num_hidden_layers)]) + + def forward(self, hidden_state): + for idx, (decoder_layer) in enumerate(self.layers): + hidden_state = decoder_layer(hidden_state) + return hidden_state + + class TestForCausalLM(TestPretrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + self.embedding_layer = nn.Embedding(65535, self.config.hidden_size) + self.test = TestModel(config=config) + + def forward(self, input_ids): + hidden_state = self.embedding_layer(input_ids) + return self.test(hidden_state) + + config = TestConfig() + config = prepare_default_config(config) + config_no_fast_ffn = copy.deepcopy(config) + config_fast_ffn = copy.deepcopy(config) + + config_no_fast_ffn.use_fast_ffn = False + + config_fast_ffn.use_fast_ffn = True + config_fast_ffn.fast_ffn_state = False + config_fast_ffn.convert_fast_ffn = True + + common_test_save_and_load(config_no_fast_ffn, config_fast_ffn, TestForCausalLM) + + +from paddlenlp.transformers import ( + GPTConfig, + 
GPTForCausalLM, + LlamaConfig, + LlamaForCausalLM, + OPTConfig, + OPTForCausalLM, +) + + +class TestFuseOrSplit(unittest.TestCase): + def test_model_split_to_fuse(self): + _test_split_to_fuse(LlamaConfig, LlamaForCausalLM) + _test_split_to_fuse(GPTConfig, GPTForCausalLM) + _test_split_to_fuse(OPTConfig, OPTForCausalLM) + + def test_model_fuse_to_split(self): + _test_fuse_to_split(LlamaConfig, LlamaForCausalLM) + _test_fuse_to_split(GPTConfig, GPTForCausalLM) + _test_fuse_to_split(OPTConfig, OPTForCausalLM) + + def test_model_convert_fast_ffn(self): + _test_fast_ffn() From 82a71775424043042a8c672cd1e9fc09348fd594 Mon Sep 17 00:00:00 2001 From: Siming Dai <908660116@qq.com> Date: Wed, 29 May 2024 16:50:59 +0800 Subject: [PATCH 20/27] quick fix os.path.split (#8508) --- paddlenlp/transformers/model_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlenlp/transformers/model_utils.py b/paddlenlp/transformers/model_utils.py index 722bde20ee70..f56ad381f36e 100644 --- a/paddlenlp/transformers/model_utils.py +++ b/paddlenlp/transformers/model_utils.py @@ -1916,7 +1916,7 @@ def _fuse_or_split_keys( if ( shard_file.endswith(".safetensors") and config.tensor_parallel_degree > 1 - and "tp" not in os.path.spilt(shard_file)[-1] + and "tp" not in os.path.split(shard_file)[-1] ): pre_tensor_parallel_split = True assert loaded_keys is not None, "loaded_keys is not None." From 4d33655aa064a70c878983d5f2e05dc1d30dc2fc Mon Sep 17 00:00:00 2001 From: Ferrebo Date: Mon, 3 Jun 2024 17:44:02 +0800 Subject: [PATCH 21/27] [fea] Cherry-picked MOE updates from develop (#8531) * [fea] moe support (#8498) Co-authored-by: kebo01 * [fix] Broadcast optimizer state using broadcast_dp without shard-reshard. (#8522) --- docs/trainer.md | 4 + paddlenlp/trainer/trainer.py | 104 +++++++++++++++------- paddlenlp/trainer/training_args.py | 30 ++++++- paddlenlp/trainer/utils/helper.py | 59 ++++++++++++ paddlenlp/trainer/utils/reshard/common.py | 22 ++++- paddlenlp/trainer/utils/sharding_io.py | 24 +++-- 6 files changed, 198 insertions(+), 45 deletions(-) diff --git a/docs/trainer.md b/docs/trainer.md index beab064bdf22..23139ed6102d 100644 --- a/docs/trainer.md +++ b/docs/trainer.md @@ -705,4 +705,8 @@ Trainer 是一个简单,但功能完整的 Paddle训练和评估模块,并 Whether use flatten_param_grads method in optimizer, only used on NPU devices.(default:False) + --use_expert_parallel + Whether to enable MoE (Mixture of Experts) expert parallel training. 
+ (default: False) + ``` diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py index f507b5c8b92f..116c3451f95f 100644 --- a/paddlenlp/trainer/trainer.py +++ b/paddlenlp/trainer/trainer.py @@ -143,6 +143,7 @@ from .utils import reshard as reshard_util from .utils.helper import ( # nested_truncate, broadcast_dp_optimizer, + broadcast_moe_optimizer, distributed_concat, distributed_file, distributed_isfile, @@ -565,7 +566,7 @@ def _load_from_checkpoint(self, resume_from_checkpoint=None): ) self.model.set_state_dict(state_dict) else: - if resume_from_checkpoint is not None and self.args.dataset_rank == 0: + if resume_from_checkpoint is not None and (self.args.dataset_rank == 0 or self.args.use_expert_parallel): weights_file = os.path.join( resume_from_checkpoint, _add_variant(weight_name, self.args.weight_name_suffix) @@ -581,7 +582,7 @@ def _load_from_checkpoint(self, resume_from_checkpoint=None): weights_index_file, ] ): - raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint}") + raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint} -- {weights_file}") logger.info(f"Loading model from {resume_from_checkpoint} .") @@ -930,22 +931,17 @@ def _inner_training_loop( self.control = self.callback_handler.on_step_begin(args, self.state, self.control) self.timers and self.timers("forward-backward").start() - dp_enabled = ( - self.args.data_parallel_degree > 1 if self.args.use_hybrid_parallel else args.local_rank != -1 - ) - forbidden_no_sync = False # stage2 and stage3 should not no_sync, because the is no DDP wrapper and no_sync API # hybrid_parallel (tp or pp or sharding stage 1) should not no_sync - if self.args.use_hybrid_parallel: - forbidden_no_sync = True - - availiable_no_sync = dp_enabled and not forbidden_no_sync - + availiable_no_sync = hasattr(model, "no_sync") is_no_sync = ( - ((step_control + 1) % args.gradient_accumulation_steps != 0) - and availiable_no_sync - and args._no_sync_in_gradient_accumulation - ) or (args.recompute and availiable_no_sync) + ( + ((step_control + 1) % args.gradient_accumulation_steps != 0) + and args._no_sync_in_gradient_accumulation + ) + or args.recompute + or args.use_expert_parallel + ) and availiable_no_sync # sharding # stage1. the same as ddp # stage2. manualy collect gradient on dp group @@ -965,6 +961,14 @@ def _inner_training_loop( tr_loss += tr_loss_step + def fused_allreduce_gradients_no_sync(paramlist, hcg): + paramlist = list(paramlist) + nonmoe_list = [p for p in paramlist if not getattr(p, "no_sync", False)] + moelist = [p for p in paramlist if getattr(p, "no_sync", False)] + if moelist and not self.args.use_expert_parallel: + logger.warning("found `no sync` param when `use_expert_parallel=False`") + fused_allreduce_gradients(nonmoe_list, hcg) + if (step_control + 1) % args.gradient_accumulation_steps == 0 or ( # last step in epoch but step is always smaller than gradient_accumulation_steps steps_in_epoch <= args.gradient_accumulation_steps @@ -983,12 +987,12 @@ def _inner_training_loop( # Case 1: Use recompute and dp / sharding stage1, # manualy collect gradient for dp. 
- if args.recompute and availiable_no_sync: - fused_allreduce_gradients(list(model.parameters()), None) + if (args.recompute or args.use_expert_parallel) and availiable_no_sync: + fused_allreduce_gradients_no_sync(list(model.parameters()), None) # Case 2: hack dp with master_grad - if dp_master_grad and not (args.recompute and availiable_no_sync): - fused_allreduce_gradients(list(model.parameters()), None) + elif dp_master_grad: + fused_allreduce_gradients_no_sync(list(model.parameters()), None) # Pipeline parallel mode, handle gradient reduce here to overlap pipeline_parallel_config = ( @@ -1007,8 +1011,7 @@ def _inner_training_loop( self.optimizer._inner_opt.reduce_gradients(list(parameters_list), self.optimizer._hcg) if self.optimizer._dp_enable or getattr(self.optimizer, "_sep_enable", False): - fused_allreduce_gradients(list(parameters_list), self.optimizer._hcg) - + fused_allreduce_gradients_no_sync(list(parameters_list), self.optimizer._hcg) self.timers and self.timers("all-reduce").stop() self.timers and self.timers("optimizer-step").start() @@ -1028,6 +1031,8 @@ def _inner_training_loop( ) optimizer_was_run = True if self.do_grad_scaling: + if args.pipeline_parallel_degree > 1: + assert not self.args.use_expert_parallel, "pipeline moe not work under fp16" scale_before = paddle.assign(self.scaler._scale) self.scaler.step(self.optimizer) self.scaler.update() @@ -2042,7 +2047,6 @@ def training_step(self, model: nn.Layer, inputs: Dict[str, Union[paddle.Tensor, model.train() inputs = self._prepare_inputs(inputs) - with self.autocast_smart_context_manager(): loss = self.compute_loss(model, inputs) @@ -2053,7 +2057,6 @@ def training_step(self, model: nn.Layer, inputs: Dict[str, Union[paddle.Tensor, self.scaler.scale(loss).backward() else: loss.backward() - return loss.detach() def training_pipeline_step(self, model: nn.Layer, inputs: Dict[str, Union[paddle.Tensor, Any]]) -> paddle.Tensor: @@ -2143,6 +2146,26 @@ def save_model(self, output_dir: Optional[str] = None, merge_tensor_parallel: Op # For ckpt integrity paddle.save(self.state.global_step, os.path.join(output_dir, ".model_done")) + def _filter_moe_no_sync_optimizer_params(self): + """ + filter optimizer params which should not sync + """ + state_dict = self.model.state_dict() + optimzier_state_dict = self.optimizer.state_dict() + filter_optimzier_state_dict = OrderedDict() + param_names_in_master_weights = list(optimzier_state_dict["master_weights"].keys()) if self.args.bf16 else [] + filter_optimzier_state_dict["master_weights"] = OrderedDict() + for k, v in state_dict.items(): + if getattr(v, "no_sync", False): + if v.name in param_names_in_master_weights: + filter_optimzier_state_dict["master_weights"][v.name] = optimzier_state_dict["master_weights"][ + v.name + ] + for op_k, op_v in optimzier_state_dict.items(): + if op_k.startswith(v.name): + filter_optimzier_state_dict[op_k] = op_v + return filter_optimzier_state_dict + def _save_checkpoint(self, model, metrics=None): # assert unwrap_model(model) is self.model, "internal model should be a reference to self.model" self.runtime_timer.start("checkpoint saving time") @@ -2165,7 +2188,7 @@ def _save_checkpoint(self, model, metrics=None): optimizer_name = _add_variant(OPTIMIZER_NAME, self.args.optimizer_name_suffix) if self.args.use_hybrid_parallel: - if self.dp_group.rank <= 0: + if self.dp_group.rank <= 0 or self.args.use_expert_parallel: os.makedirs(output_dir, exist_ok=True) logger.info("Saving optimizer files.") if self.args.unified_checkpoint: @@ -2177,12 +2200,18 @@ def 
_save_checkpoint(self, model, metrics=None): safe_serialization=True, ) else: - self._save_ckpt_func( - self.optimizer.state_dict(), - os.path.join(output_dir, optimizer_name), - ) + if self.dp_group.rank > 0: # this should only work for MoE saving + self._save_ckpt_func( + self._filter_moe_no_sync_optimizer_params(), + os.path.join(output_dir, optimizer_name), + ) + else: + self._save_ckpt_func( + self.optimizer.state_dict(), + os.path.join(output_dir, optimizer_name), + ) - if self.args.should_save: + if self.args.should_save or self.args.use_expert_parallel: if not self.args.use_hybrid_parallel: logger.info("Saving optimizer files.") if self.args.unified_checkpoint: @@ -2194,7 +2223,12 @@ def _save_checkpoint(self, model, metrics=None): safe_serialization=True, ) else: - self._save_ckpt_func(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME)) + if self.args.data_parallel_rank > 0 and self.args.use_expert_parallel: + self._save_ckpt_func( + self._filter_moe_no_sync_optimizer_params(), os.path.join(output_dir, OPTIMIZER_NAME) + ) + else: + self._save_ckpt_func(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME)) # FIXME: maybe only save one copy paddle.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME)) @@ -2452,7 +2486,7 @@ def _load_optimizer_and_scheduler(self, checkpoint): logger.info("Loading checkpoint, the next checkpoint will be saved as unified checkpoint") if not use_unified_checkpoint: - if self.args.data_parallel_rank == 0: + if self.args.data_parallel_rank == 0 or self.args.use_expert_parallel: optimizer_name = _add_variant(OPTIMIZER_NAME, self.args.optimizer_name_suffix) path = os.path.join(checkpoint, optimizer_name) if os.path.isfile(path): @@ -2476,7 +2510,13 @@ def _load_optimizer_and_scheduler(self, checkpoint): # broadcast optimizer state in dp group if self.args.local_rank != -1: dist.barrier() - opt_state_dict = broadcast_dp_optimizer(opt_state_dict) + if self.args.use_expert_parallel: + opt_state_dict = broadcast_moe_optimizer( + opt_state_dict, broadcast_dp=not self.args.should_load_sharding_stage1_model + ) + else: + if not self.args.should_load_sharding_stage1_model: + opt_state_dict = broadcast_dp_optimizer(opt_state_dict) if opt_state_dict is not None: # Load in optimizer and scheduler states diff --git a/paddlenlp/trainer/training_args.py b/paddlenlp/trainer/training_args.py index 3118178608d2..f825c308ebb8 100644 --- a/paddlenlp/trainer/training_args.py +++ b/paddlenlp/trainer/training_args.py @@ -791,6 +791,10 @@ class TrainingArguments: default=False, metadata={"help": "whether to output logits in distributed status"}, ) + use_expert_parallel: Optional[bool] = field( + default=False, + metadata={"help": "Enable MoE (Mixture of Experts) expert parallel training"}, + ) def __post_init__(self): env_local_rank = int(os.environ.get("PADDLE_RANK_IN_NODE", -1)) @@ -1117,6 +1121,8 @@ def is_segment_parallel_supported(): order = ["dp", "sharding", "pp", "sep", "mp"] else: order = ["dp", "sharding", "pp", "mp"] + if self.use_expert_parallel: + order = order[1:-1] + ["dp", "mp"] if is_segment_parallel_supported(): hybrid_configs = { @@ -1598,9 +1604,12 @@ def optimizer_name_suffix(self): if self.sharding_parallel_degree > 1: assert self.sharding_parallel_degree < 100, "sharding parallel degree should be less than 100." 
name.append(f"shard{self.sharding_parallel_rank:0>2d}") - + if self.use_expert_parallel: + name.append(f"moe{self.data_parallel_rank:0>2d}") return "_".join(name) else: + if self.use_expert_parallel: + return f"moe{self.data_parallel_rank:0>2d}" return None @property @@ -1613,12 +1622,16 @@ def weight_name_suffix(self): if self.pipeline_parallel_degree > 1: assert self.pipeline_parallel_degree < 100, "tensor parallel rank should be less than 100." name.append(f"pp{self.pipeline_parallel_rank:0>2d}") + if self.use_expert_parallel: + name.append(f"moe{self.data_parallel_rank:0>2d}") return "_".join(name) else: + if self.use_expert_parallel: + return f"moe{self.data_parallel_rank:0>2d}" return None - def sharded_name_suffix(self, shard_id=None, pp_id=None): + def sharded_name_suffix(self, shard_id=None, pp_id=None, moe_id=None): if self.use_hybrid_parallel: name = [] if self.tensor_parallel_degree > 1: @@ -1636,8 +1649,17 @@ def sharded_name_suffix(self, shard_id=None, pp_id=None): assert isinstance(shard_id, int) assert shard_id < 100, "shard_id should be less than 100." name.append(f"shard{shard_id:0>2d}") + if self.use_expert_parallel: + if moe_id is None: + moe_id = self.data_parallel_rank + assert isinstance(moe_id, int) + name.append(f"moe{moe_id:0>2d}") return "_".join(name) else: + if self.use_expert_parallel: + if moe_id is None: + moe_id = self.data_parallel_rank + return self._format_name("moe", moe_id, self.data_parallel_degree) return None @property @@ -1730,9 +1752,9 @@ def should_save_model_state(self): return True elif self.use_hybrid_parallel: # save on dataset rank 0 - return self.sharding_parallel_rank == 0 and self.data_parallel_rank == 0 + return self.sharding_parallel_rank == 0 and (self.data_parallel_rank == 0 or self.use_expert_parallel) else: - return self.process_index == 0 + return self.process_index == 0 or self.use_expert_parallel @property def _no_sync_in_gradient_accumulation(self): diff --git a/paddlenlp/trainer/utils/helper.py b/paddlenlp/trainer/utils/helper.py index 25f593f71e35..8e4c22e908f5 100644 --- a/paddlenlp/trainer/utils/helper.py +++ b/paddlenlp/trainer/utils/helper.py @@ -226,3 +226,62 @@ def broadcast_dp_optimizer(state_dict): state_dict = nested_broadcast_tensor(state_dict, src=src_rank, group=dp_group) return state_dict + + +def broadcast_moe_optimizer(state_dict, broadcast_dp=True): + + try: + hcg = fleet.get_hybrid_communicate_group() + dp_group = hcg.get_data_parallel_group() + src_rank = hcg.get_data_parallel_group_src_rank() + data_parallel_rank = hcg.get_data_parallel_rank() + # Don't broadcast optimizer for dp rank is 1. 
+ if dp_group.nranks <= 1: + return state_dict + except: + dp_group = None + src_rank = 0 + data_parallel_rank = 0 + + def _broadcast_moe_optimizer_state(state_dict): + # boardcast_keys + base_state_dict = {"master_weights": {}} + buf = [ + {i: j.shape for i, j in state_dict.items() if i not in ["master_weights", "LR_Scheduler"]}, + {i: j.shape for i, j in state_dict["master_weights"].items()}, + {"LR_Scheduler": state_dict.get("LR_Scheduler", {})}, + ] + + dist.broadcast_object_list(buf, src=src_rank, group=dp_group) + # logger.info(f"moe-optimizer-gather-keys{buf}") + for k, s in buf[0].items(): + v = state_dict.get(k, paddle.zeros(s, "float32")).cuda() + v.name = k + # k = k.replace("_fp32_master_0", "") + dist.broadcast(v, src=src_rank, group=dp_group) + logger.info(f"broadcast moe optimizer {k} from {src_rank}") + base_state_dict[k] = v.cpu() + for k, s in buf[1].items(): + v = state_dict["master_weights"].get(k, paddle.zeros(s, "float32")).cuda() + v.name = k + dist.broadcast(v, src=src_rank, group=dp_group) + logger.info(f"broadcast moe optimizer-master_weights {k} from {src_rank}") + base_state_dict["master_weights"][k] = v.cpu() + base_state_dict.update(buf[2]) + return base_state_dict + + if broadcast_dp: + base_state_dict = broadcast_dp_optimizer(state_dict) + else: + base_state_dict = _broadcast_moe_optimizer_state(state_dict) + if data_parallel_rank > 0: + master_weight = state_dict.pop("master_weights", {}) + base_state_dict.update(state_dict) + if master_weight: + if "master_weights" in base_state_dict: + base_state_dict["master_weights"].update(master_weight) + else: + base_state_dict["master_weights"] = master_weight + state_dict = base_state_dict + del base_state_dict + return state_dict diff --git a/paddlenlp/trainer/utils/reshard/common.py b/paddlenlp/trainer/utils/reshard/common.py index cc834862e299..66e3c3569916 100644 --- a/paddlenlp/trainer/utils/reshard/common.py +++ b/paddlenlp/trainer/utils/reshard/common.py @@ -266,6 +266,16 @@ def _opt_name_to_tname(tensor_names, opt_names): all_names.extend(opt_names) all_names.sort() pre_t_name = "" + suffix = [ + "_fp32_master_0_beta1_pow_acc_0", + "_fp32_master_0_beta2_pow_acc_0", + "_fp32_master_0_moment1_0", + "_fp32_master_0_moment2_0", + "_beta1_pow_acc_0", + "_beta2_pow_acc_0", + "_moment1_0", + "_moment2_0", + ] opt_to_t = {} for n in all_names: if n in tensor_names: @@ -274,6 +284,16 @@ def _opt_name_to_tname(tensor_names, opt_names): else: assert pre_t_name opt_to_t[n] = pre_t_name + + for t in opt_names: + _find = False + for s in suffix: + if t.endswith(s): + logger.info(f"{t}-{t[:-len(s)]}--{t[:-len(s)] in tensor_names}") + opt_to_t[t] = t[: -len(s)] + _find = True + break + assert _find return opt_to_t if structure_name_mapping is not None: @@ -291,7 +311,7 @@ def _opt_name_to_tname(tensor_names, opt_names): (self._model_weights, model_weights_tmp) = (model_weights_tmp, self._model_weights) for k in list(model_weights_tmp.keys()): t_name = structure_name_mapping[k] - self._model_weights[(k, t_name)] = model_weights_tmp[k].cpu() + self._model_weights[(k, t_name)] = paddle.to_tensor(model_weights_tmp[k]).cpu() del model_weights_tmp[k] # opt diff --git a/paddlenlp/trainer/utils/sharding_io.py b/paddlenlp/trainer/utils/sharding_io.py index 56f4c426ce0a..4fe55d175005 100644 --- a/paddlenlp/trainer/utils/sharding_io.py +++ b/paddlenlp/trainer/utils/sharding_io.py @@ -67,11 +67,14 @@ def filter_sharded_params(state_dict, optimizer, sharding_group): if reshard_util.get_sharding_strategy(optimizer) == 
reshard_util.SHARDING_STRATEGY_V1: optimizer = unwrap_optimizer(optimizer, DygraphShardingOptimizer) for (k, v) in state_dict.items(): - assert v.name in optimizer._param2rank - sharded_rank = optimizer._param2rank[v.name] - if sharded_rank != sharding_rank: - continue - filtered_state_dict[k] = v + if v.name in optimizer._param2rank: + sharded_rank = optimizer._param2rank[v.name] + if sharded_rank != sharding_rank: + continue + filtered_state_dict[k] = v + else: + if sharding_rank == 0: + filtered_state_dict[k] = v else: optimizer = unwrap_optimizer(optimizer, DygraphShardingOptimizerV2) parameters = optimizer._parameter_list @@ -352,7 +355,7 @@ def manipulate_state_dict_and_config(self, model_to_save, merge_tensor_parallel= ) logger.info( "param_names_in_master_weights len:{}, bf16 state_dict len:{}, :{}".format( - len(param_names_in_master_weights), len(state_dict), state_dict + len(param_names_in_master_weights), len(state_dict), state_dict.keys() ) ) return state_dict, config_to_save, weight_name_suffix @@ -444,12 +447,17 @@ def filter_func(name): master_weights = reshard_util.all_gather_state_dict(master_weights, filter_func, self.sharding_group) model_state_dict = self.model.state_dict() + logger.info(f"state-dict-keys: {state_dict.keys()}, nums: {len(state_dict.keys())}") logger.info("before recover, model_state_dict number: {}".format(len(model_state_dict))) for key, param in model_state_dict.items(): if param.name in master_weights: assert param.shape == master_weights[param.name].shape - paddle.assign(master_weights[param.name].cuda(), model_state_dict[key]) - + paddle.assign(paddle.cast(master_weights[param.name].cuda(), paddle.bfloat16), model_state_dict[key]) + elif key in state_dict: + logger.info(f"key: {key} is in state_dict, but not in master_weights") + paddle.assign(state_dict[key], model_state_dict[key]) + else: + logger.info(f"key: {key} is not in state_dict and master_weights") logger.info("after recover, casted model_state_dict number: {}".format(len(model_state_dict))) state_dict.update(model_state_dict) return state_dict From 6757ff9d436baa20284b411648b6029b3b377e2c Mon Sep 17 00:00:00 2001 From: Siming Dai <908660116@qq.com> Date: Mon, 3 Jun 2024 20:13:07 +0800 Subject: [PATCH 22/27] [LLM] relocate tensor_parallel_output to avoid conflict (#8419) (#8533) Co-authored-by: Tian <121000916+SylarTiaNII@users.noreply.github.com> --- llm/finetune_generation.py | 5 +++++ llm/utils.py | 2 +- paddlenlp/trainer/training_args.py | 4 ---- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/llm/finetune_generation.py b/llm/finetune_generation.py index c8fed17165af..6e4123b02df2 100644 --- a/llm/finetune_generation.py +++ b/llm/finetune_generation.py @@ -16,6 +16,7 @@ import sys from dataclasses import dataclass, field from functools import partial +from typing import Optional import paddle from argument import ( @@ -66,6 +67,10 @@ class FinetuneArguments(TrainingArguments): default=0, metadata={"help": "The steps use to control the learing rate."}, ) + tensor_parallel_output: Optional[bool] = field( + default=False, + metadata={"help": "whether to output logits in distributed status"}, + ) def read_local_dataset(path): diff --git a/llm/utils.py b/llm/utils.py index 6688357bd67b..3075943877df 100644 --- a/llm/utils.py +++ b/llm/utils.py @@ -212,7 +212,7 @@ def prediction_step( if isinstance(logits, (list, tuple)): logits = logits[0] # all gather logits when enabling tensor_parallel_output - if self.args.tensor_parallel_degree > 1 and self.args.tensor_parallel_output: 
+ if self.args.tensor_parallel_degree > 1 and getattr(self.args, "tensor_parallel_output", False): hcg = fleet.get_hybrid_communicate_group() model_parallel_group = hcg.get_model_parallel_group() gathered_logits = [] diff --git a/paddlenlp/trainer/training_args.py b/paddlenlp/trainer/training_args.py index f825c308ebb8..423d77d6f510 100644 --- a/paddlenlp/trainer/training_args.py +++ b/paddlenlp/trainer/training_args.py @@ -787,10 +787,6 @@ class TrainingArguments: default=False, metadata={"help": "whether to run distributed training in auto parallel mode"}, ) - tensor_parallel_output: Optional[bool] = field( - default=False, - metadata={"help": "whether to output logits in distributed status"}, - ) use_expert_parallel: Optional[bool] = field( default=False, metadata={"help": "Enable MoE (Mixture of Experts) expert parallel training"}, From 7c8d713de8475c807f53818eafe4c160e4fab1f0 Mon Sep 17 00:00:00 2001 From: Siming Dai <908660116@qq.com> Date: Wed, 5 Jun 2024 17:16:26 +0800 Subject: [PATCH 23/27] Update sequence_parallel for predict (#8547) --- paddlenlp/trainer/trainer.py | 6 +++++- paddlenlp/transformers/linear_utils.py | 14 +++++++++++--- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py index 116c3451f95f..dfc47354c493 100644 --- a/paddlenlp/trainer/trainer.py +++ b/paddlenlp/trainer/trainer.py @@ -40,7 +40,11 @@ import paddle.nn as nn from packaging import version from paddle import framework -from paddle.base import core + +try: + from paddle.base import core +except: + core = None from paddle.distributed import fleet from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.hybrid_parallel_optimizer import ( HybridParallelOptimizer, diff --git a/paddlenlp/transformers/linear_utils.py b/paddlenlp/transformers/linear_utils.py index de1a0f886b79..469e7c45985e 100644 --- a/paddlenlp/transformers/linear_utils.py +++ b/paddlenlp/transformers/linear_utils.py @@ -18,7 +18,11 @@ import paddle.distributed.fleet.meta_parallel as mpu from paddle import nn -from paddle.distributed.fleet.utils import sequence_parallel_utils + +try: + from paddle.distributed.fleet.utils import sequence_parallel_utils +except: + sequence_parallel_utils = None from paddlenlp.transformers.mc2_parallel_linear import ( MC2ColumnSeqParallelLinear, @@ -29,8 +33,12 @@ Linear = nn.Linear ColumnParallelLinear = mpu.ColumnParallelLinear RowParallelLinear = mpu.RowParallelLinear -ColumnSequenceParallelLinear = sequence_parallel_utils.ColumnSequenceParallelLinear -RowSequenceParallelLinear = sequence_parallel_utils.RowSequenceParallelLinear +try: + ColumnSequenceParallelLinear = sequence_parallel_utils.ColumnSequenceParallelLinear + RowSequenceParallelLinear = sequence_parallel_utils.RowSequenceParallelLinear +except: + ColumnSequenceParallelLinear = None + RowSequenceParallelLinear = None if get_env_device() == "npu": if MC2ColumnSeqParallelLinear is not None and MC2RowSeqParallelLinear is not None: From c628f129483384cf87a8d219bb5728490ae638bd Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Fri, 7 Jun 2024 17:55:54 +0800 Subject: [PATCH 24/27] Cp/fix (#8569) * [Safetensors] Fix fast safe open slice. 
(#8512) * [FIX DDP] fix ddp (#8549) --- paddlenlp/trainer/trainer.py | 12 +- paddlenlp/trainer/training_args.py | 2 +- paddlenlp/utils/safetensors.py | 12 +- pyproject.toml | 4 +- tests/trainer/test_lora_unified_checkpoint.py | 47 ++++---- tests/trainer/test_unified_checkpoint.py | 109 ++++++++++-------- tests/transformers/test_safetensors.py | 14 ++- 7 files changed, 107 insertions(+), 93 deletions(-) diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py index dfc47354c493..b42e596e97e4 100644 --- a/paddlenlp/trainer/trainer.py +++ b/paddlenlp/trainer/trainer.py @@ -1771,16 +1771,8 @@ def _wrap_model(self, model, training=True): in_sep_parallel_mode = self.args.sep_parallel_degree > 1 # Multi-gpu training - if ( - self.args.world_size > 1 - and not self.args.use_hybrid_parallel - or not ( - in_pipeline_parallel_mode - or in_sharding_parallel_mode - or in_tensor_parallel_mode - or in_sep_parallel_mode - ) - ): + if self.args.world_size > 1 and (not self.args.use_hybrid_parallel): + # MOE use DDP to broadcaset parameters. model = paddle.DataParallel(model) # Distributed training (should be after fp16 initialization) diff --git a/paddlenlp/trainer/training_args.py b/paddlenlp/trainer/training_args.py index 423d77d6f510..b31e55d7b4f0 100644 --- a/paddlenlp/trainer/training_args.py +++ b/paddlenlp/trainer/training_args.py @@ -1406,7 +1406,7 @@ def is_segment_parallel_supported(): if world_size > 1: if not paddle.distributed.parallel.parallel_helper._is_parallel_ctx_initialized(): if self.unified_checkpoint: - self.use_hybrid_parallel = True + # DP use hybrid group strategy = fleet.DistributedStrategy() fleet.init(is_collective=True, strategy=strategy) else: diff --git a/paddlenlp/utils/safetensors.py b/paddlenlp/utils/safetensors.py index 422a7d09961c..c273d0d973c2 100644 --- a/paddlenlp/utils/safetensors.py +++ b/paddlenlp/utils/safetensors.py @@ -157,16 +157,16 @@ def __getitem__(self, index): out_start, out_stop, out_step = copy.deepcopy((self.start, self.stop, self.step)) for i, (start, stop, step, slice_) in enumerate(zip(self.start, self.stop, self.step, index)): - out_start[i] = slice_.start or 0 - out_step[i] = slice_.step or 1 - out_stop[i] = slice_.stop or stop - start + out_start[i] = slice_.start if slice_.start is not None else 0 + out_step[i] = slice_.step if slice_.step is not None else 1 + out_stop[i] = slice_.stop if slice_.stop is not None else stop - start out_stop[i] = min(stop, out_stop[i]) target_shape = [] - for x, y, z in zip(out_start, out_stop, out_step): + for x, y, z, sli in zip(out_start, out_stop, out_step, index): assert z == 1, "only support step = 1" - if y - x > 1: - target_shape.append(int(y - x)) + if y - x > 1 or sli.step is None: + target_shape.append(max(int(y - x), 0)) if len(target_shape) == 0: if self.shape == [1]: diff --git a/pyproject.toml b/pyproject.toml index 715323d09e37..858508037fce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ exclude = ['.flake8'] [tool.pytest.ini_options] minversion = "6.0" -addopts = "-ra -q --ignore model_zoo/gpt-3/" +addopts = "-ra -q --dist loadgroup" pythonpath = ["."] testpaths = [ "tests/data", @@ -28,7 +28,7 @@ testpaths = [ "tests/prompt", # "tests/taskflow", TODO (paddle 2.5.1 breaks this test suite, debug later) "tests/utils", - "model_zoo", + # "model_zoo", ] python_files = [ "test.py", diff --git a/tests/trainer/test_lora_unified_checkpoint.py b/tests/trainer/test_lora_unified_checkpoint.py index 98d5516d2388..0abfc257d4f7 100644 --- 
a/tests/trainer/test_lora_unified_checkpoint.py +++ b/tests/trainer/test_lora_unified_checkpoint.py @@ -149,7 +149,7 @@ def __test__(cls): def setUp(self): """ - 1. update runfrist and rerun to run defined different config + 1. update runfirst and rerun to run defined different config 2. update need_allclose to True if you want to check the result 3. update rtol to the relative value you want to check """ @@ -169,7 +169,7 @@ def setUp(self): self.run_lora_file = "llm/finetune_generation.py" - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n1c8(self.run_lora_file, **train_args) def rerun(self, train_args): @@ -181,7 +181,7 @@ def testTP4PP2(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["TP4PP2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -196,7 +196,7 @@ def testTP2Sharding4(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["TP2Sharding4"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -213,7 +213,7 @@ def testTP8(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["TP8"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -227,7 +227,7 @@ def testTP4DP2(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["TP4DP2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -242,7 +242,7 @@ def testTP4Sharding2(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["TP4Sharding2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -257,7 +257,7 @@ def testTP2PP4(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["TP2PP4"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -272,7 +272,7 @@ def testPP8(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["PP8"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -287,7 +287,7 @@ def testPP4DP2(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["PP4DP2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -302,7 +302,7 @@ def testPP4Sharding2(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["PP4Sharding2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -317,7 +317,7 @@ def testSharding8S1(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["Sharding8S1"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -332,7 +332,7 @@ def testSharding8S2(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["Sharding8S2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -347,7 +347,7 @@ def testSharding4S1DP2(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["Sharding4S1DP2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -362,7 +362,7 @@ def testSharding4S2DP2(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["Sharding4S2DP2"] - self.runfrist(train_args) + 
self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -377,7 +377,7 @@ def testSharding2S1DP4(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["Sharding2S1DP4"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -392,7 +392,7 @@ def testSharding2S2DP4(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["Sharding2S2DP4"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -407,7 +407,7 @@ def testDP8(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["DP8"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -416,19 +416,21 @@ def testDP8(self): np.testing.assert_allclose(res[0], res[1], self.rtol) +@pytest.mark.skipif(True, reason="Skip for None CE") class TestUnifiedCheckpointOnN2C4(TestUnifiedCheckpointBase): def setUp(self): super().setUp() self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n2c4(self.run_lora_file, **train_args) def rerun(self, train_args): self.run_n2c4(self.run_lora_file, **train_args) +@pytest.mark.skipif(True, reason="Skip for None CE") class TestUnifiedCheckpointOnN1C8CheckpointCompatible(TestUnifiedCheckpointBase): def setUp(self): super().setUp() @@ -436,7 +438,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["unified_checkpoint"] = 0 self.run_n1c8(self.run_lora_file, **train_args) @@ -445,6 +447,7 @@ def rerun(self, train_args): self.run_n1c8(self.run_lora_file, **train_args) +@pytest.mark.skipif(True, reason="Skip for None CE") class TestPaddleCheckpointOnN1C8Reset(TestUnifiedCheckpointBase): def setUp(self): super().setUp() @@ -452,7 +455,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["unified_checkpoint"] = 0 self.run_n1c8(self.run_lora_file, **train_args) @@ -469,7 +472,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["unified_checkpoint"] = 0 self.run_n2c4(self.run_lora_file, **train_args) diff --git a/tests/trainer/test_unified_checkpoint.py b/tests/trainer/test_unified_checkpoint.py index f8cc0ed7bfac..5ce99b36ff19 100644 --- a/tests/trainer/test_unified_checkpoint.py +++ b/tests/trainer/test_unified_checkpoint.py @@ -175,7 +175,7 @@ def __test__(cls): def setUp(self): """ - 1. update runfrist and rerun to run defined diffrent config + 1. update runfirst and rerun to run defined diffrent config 2. update need_allclose to True if you want to check the result 3. 
update rtol to the relative value you want to check """ @@ -194,7 +194,7 @@ def setUp(self): self.run_pretrain_file = "llm/llama/run_pretrain.py" - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) def rerun(self, train_args): @@ -206,7 +206,7 @@ def testTP4PP2(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["TP4PP2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -221,7 +221,7 @@ def testTP2Sharding4(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["TP2Sharding4"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -238,7 +238,7 @@ def testTP8(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["TP8"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -252,7 +252,7 @@ def testTP4DP2(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["TP4DP2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -267,7 +267,7 @@ def testTP4Sharding2(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["TP4Sharding2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -282,7 +282,7 @@ def testTP2PP4(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["TP2PP4"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -297,7 +297,7 @@ def testPP8(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["PP8"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -312,7 +312,7 @@ def testPP4DP2(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["PP4DP2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -327,7 +327,7 @@ def testPP4Sharding2(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["PP4Sharding2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -342,7 +342,7 @@ def testSharding8S1(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["Sharding8S1"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -357,7 +357,7 @@ def testSharding8S2(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["Sharding8S2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -372,7 +372,7 @@ def testSharding4S1DP2(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["Sharding4S1DP2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -387,7 +387,7 @@ def testSharding4S2DP2(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["Sharding4S2DP2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -402,7 +402,7 @@ def testSharding2S1DP4(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["Sharding2S1DP4"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ 
-417,7 +417,7 @@ def testSharding2S2DP4(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["Sharding2S2DP4"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -432,7 +432,7 @@ def testDP8(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["DP8"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -441,13 +441,14 @@ def testDP8(self): np.testing.assert_allclose(res[0], res[1], self.rtol) +@pytest.mark.skipif(True, reason="Skip for None CE") class TestUnifiedCheckpointOnN2C4(TestUnifiedCheckpointBase): def setUp(self): super().setUp() self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n2c4(self.run_pretrain_file, **train_args) def rerun(self, train_args): @@ -463,7 +464,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) def rerun(self, train_args): @@ -485,7 +486,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n2c4(self.run_pretrain_file, **train_args) def rerun(self, train_args): @@ -507,7 +508,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) move_checkpoint_N1C8_to_N2C4() @@ -529,7 +530,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n2c4(self.run_pretrain_file, **train_args) move_checkpoint_N2C4_to_N1C8() @@ -557,7 +558,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) def rerun(self, train_args): @@ -576,7 +577,7 @@ def setUp(self): self.need_allclose = False - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["fp16_opt_level"] = "O1" self.run_n1c8(self.run_pretrain_file, **train_args) @@ -585,6 +586,7 @@ def rerun(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) +@pytest.mark.skipif(True, reason="Skip for None CE") class TestUnifiedCheckpointOnN1C8MasterWeightCompatibleO2ToO1(TestUnifiedCheckpointBase): def setUp(self): super().setUp() @@ -596,7 +598,7 @@ def setUp(self): self.need_allclose = False - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["fp16_opt_level"] = "O2" self.run_n1c8(self.run_pretrain_file, **train_args) @@ -605,6 +607,7 @@ def rerun(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) +@pytest.mark.skipif(True, reason="Skip for None CE") class TestUnifiedCheckpointOnN1C8CheckpointCompatible(TestUnifiedCheckpointBase): def setUp(self): super().setUp() @@ -612,7 +615,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["unified_checkpoint"] = 0 self.run_n1c8(self.run_pretrain_file, **train_args) @@ -621,6 +624,7 @@ def rerun(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) +@pytest.mark.skipif(True, reason="Skip for None CE") class 
TestPaddleCheckpointOnN1C8Reset(TestUnifiedCheckpointBase): def setUp(self): super().setUp() @@ -628,7 +632,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["unified_checkpoint"] = 0 self.run_n1c8(self.run_pretrain_file, **train_args) @@ -637,6 +641,7 @@ def rerun(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) +@pytest.mark.skipif(True, reason="Skip for None CE") class TestPaddleCheckpointOnN1C2Reset(TestMultipleGpus): def setUp(self): self.configs = get_pretrain_arguments(pretrain_arguments) @@ -653,7 +658,7 @@ def setUp(self): self.run_pretrain_file = "llm/llama/run_pretrain.py" - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["unified_checkpoint"] = 0 self.run_n1c2(self.run_pretrain_file, **train_args) @@ -669,7 +674,7 @@ def testTP2(self): train_args = self.configs["TP2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -678,6 +683,7 @@ def testTP2(self): np.testing.assert_allclose(res[0], res[1], self.rtol) +@pytest.mark.skipif(True, reason="Skip for None CE") class TestUnifiedCheckpointOnN1C2Reset(TestMultipleGpus): def setUp(self): self.configs = get_pretrain_arguments(pretrain_arguments) @@ -714,7 +720,7 @@ def setUp(self): "training_args.bin", ] - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["unified_checkpoint"] = 1 self.run_n1c2(self.run_pretrain_file, **train_args) @@ -730,7 +736,7 @@ def testTP2(self): train_args = self.configs["TP2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -748,7 +754,7 @@ def testFileLists(self): base_ckpt_path = os.path.join(pretrain_arguments["output_dir"], "checkpoint-%d" % save_steps) train_args = self.configs["TP2"] - self.runfrist(train_args) + self.runfirst(train_args) assert sorted(self.filelists) == sorted(os.listdir(base_ckpt_path)) self.rerun(train_args) @@ -761,7 +767,7 @@ def testFileLists(self): remove_logs() remove_ckpt(pretrain_arguments["output_dir"]) train_args["unified_checkpoint_config"] = "skip_save_model_weight" - self.runfrist(train_args) + self.runfirst(train_args) unsave_filelists = [ "master_weights-00001-of-00002.safetensors", "master_weights-00002-of-00002.safetensors", @@ -788,7 +794,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) def rerun(self, train_args): @@ -809,7 +815,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n2c4(self.run_pretrain_file, **train_args) def rerun(self, train_args): @@ -828,7 +834,7 @@ def setUp(self): self.need_allclose = False - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["fp16_opt_level"] = "O1" self.run_n2c4(self.run_pretrain_file, **train_args) @@ -849,7 +855,7 @@ def setUp(self): self.need_allclose = False - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["fp16_opt_level"] = "O2" self.run_n2c4(self.run_pretrain_file, **train_args) @@ -866,7 +872,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["unified_checkpoint"] = 0 self.run_n2c4(self.run_pretrain_file, **train_args) @@ -886,7 +892,7 @@ def setUp(self): 
self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n2c4(self.run_pretrain_file, **train_args) def rerun(self, train_args): @@ -909,7 +915,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) move_checkpoint_N1C8_to_N2C4() @@ -937,7 +943,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["fp16_opt_level"] = "O1" self.run_n1c8(self.run_pretrain_file, **train_args) move_checkpoint_N1C8_to_N2C4() @@ -967,7 +973,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["fp16_opt_level"] = "O2" self.run_n1c8(self.run_pretrain_file, **train_args) move_checkpoint_N1C8_to_N2C4() @@ -995,7 +1001,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) move_checkpoint_N1C8_to_N2C4() @@ -1023,7 +1029,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n2c4(self.run_pretrain_file, **train_args) move_checkpoint_N2C4_to_N1C8() @@ -1051,7 +1057,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["fp16_opt_level"] = "O1" self.run_n2c4(self.run_pretrain_file, **train_args) move_checkpoint_N2C4_to_N1C8() @@ -1081,7 +1087,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["fp16_opt_level"] = "O2" self.run_n2c4(self.run_pretrain_file, **train_args) move_checkpoint_N2C4_to_N1C8() @@ -1109,7 +1115,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n2c4(self.run_pretrain_file, **train_args) move_checkpoint_N2C4_to_N1C8() @@ -1123,6 +1129,7 @@ def rerun(self, train_args): np.testing.assert_allclose(res[0], res[-1], rtol=self.rtol) +@pytest.mark.skipif(True, reason="Skip for None CE") class TestUnifiedCheckpointOnN1C8EnableAll(TestUnifiedCheckpointBase): def setUp(self): super().setUp() @@ -1133,7 +1140,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) def rerun(self, train_args): @@ -1153,7 +1160,7 @@ def setUp(self): self.need_allclose = False self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n1c8(self.run_pretrain_file, log_dir="log_uc", **train_args) def rerun(self, train_args): @@ -1172,7 +1179,7 @@ def setUp(self): self.need_allclose = False self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n1c8(self.run_pretrain_file, log_dir="log_pd", **train_args) def rerun(self, train_args): diff --git a/tests/transformers/test_safetensors.py b/tests/transformers/test_safetensors.py index 3c143e26a0b5..85b291e42349 100644 --- a/tests/transformers/test_safetensors.py +++ 
b/tests/transformers/test_safetensors.py @@ -28,7 +28,14 @@ class FastSafetensors(unittest.TestCase): def setUp(self): super().setUp() self.weigth_map = {} - tensors = [([10, 10], "float32"), ([8], "float16"), ([5, 5, 5], "int32")] + tensors = [ + ([10, 1, 10], "float32"), + ([1, 1, 10], "float32"), + ([1, 1, 1, 10], "float32"), + ([10, 10], "float32"), + ([8], "float16"), + ([5, 5, 5], "int32"), + ] count = 0 for shape, dtype in tensors: self.weigth_map[f"weight_{count}"] = (np.random.random(shape) * 100).astype(dtype) @@ -53,5 +60,10 @@ def test_safe_open(self): with fast_safe_open(path, framework="np") as f: for key in f.keys(): safe_slice = f.get_slice(key) + # np.testing.assert_equal(self.weigth_map[key][2:1, ...], safe_slice[2:1, ...]) + np.testing.assert_equal(self.weigth_map[key][0, ...], safe_slice[0, ...]) + np.testing.assert_equal(self.weigth_map[key][0:1, ...], safe_slice[0:1, ...]) + np.testing.assert_equal(self.weigth_map[key][..., 2:], safe_slice[..., 2:]) + np.testing.assert_equal(self.weigth_map[key][..., 1], safe_slice[..., 1]) np.testing.assert_equal(self.weigth_map[key][:2, ...], safe_slice[:2, ...]) np.testing.assert_equal(self.weigth_map[key][..., :4], safe_slice[..., :4]) From 5b027c8ae5260342fd58ced5162ebafa4766cb40 Mon Sep 17 00:00:00 2001 From: Siming Dai <908660116@qq.com> Date: Fri, 7 Jun 2024 18:00:11 +0800 Subject: [PATCH 25/27] Don't save moe_group (#8570) --- paddlenlp/transformers/configuration_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddlenlp/transformers/configuration_utils.py b/paddlenlp/transformers/configuration_utils.py index 4bda24695a48..b3e255f30535 100644 --- a/paddlenlp/transformers/configuration_utils.py +++ b/paddlenlp/transformers/configuration_utils.py @@ -903,6 +903,8 @@ def to_dict(self) -> Dict[str, Any]: output["model_type"] = self.__class__.model_type if "_auto_class" in output: del output["_auto_class"] + if "moe_group" in output: + del output["moe_group"] output["quantization_config"] = self.quantization_config.to_dict() From db99efd4dc99047922aae9842be66ab4538f93bf Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Thu, 20 Jun 2024 15:41:29 +0800 Subject: [PATCH 26/27] release 2.8.1 (#8636) --- paddlenlp/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlenlp/__init__.py b/paddlenlp/__init__.py index e3cd7e1c5f75..9e3fef0146bb 100644 --- a/paddlenlp/__init__.py +++ b/paddlenlp/__init__.py @@ -18,7 +18,7 @@ PADDLENLP_STABLE_VERSION = "PADDLENLP_STABLE_VERSION" -__version__ = "2.8.0.post" +__version__ = "2.8.1.post" if os.getenv(PADDLENLP_STABLE_VERSION): __version__ = __version__.replace(".post", "") diff --git a/setup.py b/setup.py index 0723cfc28cb4..372d86776293 100644 --- a/setup.py +++ b/setup.py @@ -109,7 +109,7 @@ def show(): f.write(content) -__version__ = "2.8.0.post" +__version__ = "2.8.1.post" if os.getenv(PADDLENLP_STABLE_VERSION): __version__ = __version__.replace(".post", "") From ad271a648b0da4049f7d3f720f5f4b4244d7d333 Mon Sep 17 00:00:00 2001 From: Siming Dai <908660116@qq.com> Date: Wed, 3 Jul 2024 11:03:03 +0800 Subject: [PATCH 27/27] [Safetensors] Fix safetensors shape (#8702) * Update sequence_parallel for predict * Do not save moe_group * Fix safetensors reading --- paddlenlp/utils/safetensors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlenlp/utils/safetensors.py b/paddlenlp/utils/safetensors.py index c273d0d973c2..54256023db7d 100644 --- a/paddlenlp/utils/safetensors.py +++ b/paddlenlp/utils/safetensors.py @@ 
-177,7 +177,7 @@ def __getitem__(self, index): span = self.bits for i, (start, stop, step) in enumerate(zip(out_start[::-1], out_stop[::-1], out_step[::-1])): if len(indices) == 0: - if start == 0 and stop == self.shape[i]: + if start == 0 and stop == self.shape[::-1][i]: pass # We haven't started to slice yet, just increase the span else: @@ -194,7 +194,7 @@ def __getitem__(self, index): newindices.append((old_start + offset, old_stop + offset)) indices = newindices assert len(indices) == capacity, f"error {capacity} {len(indices)}" - span *= self.shape[-(i + 1)] + span *= self.shape[::-1][i] if len(indices) == 0: indices.append((0, self.nbytes))
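The two hunks above are the core of the shape fix: `__getitem__` walks `start`/`stop`/`step` from the innermost dimension outwards (they are all iterated with `[::-1]`), so the early-exit check has to compare `stop` against the dimension size taken in that same reversed order, `self.shape[::-1][i]`, rather than the forward `self.shape[i]`. The second hunk rewrites the effectively equivalent `self.shape[-(i + 1)]` into the same `[::-1][i]` form for consistency. The reversed order matters because, for a C-contiguous (row-major) buffer, the byte span covered by one index step grows from the innermost dimension outwards. A minimal sketch of that invariant, assuming plain NumPy; `trailing_spans` is an illustrative helper written for this note, not a function in paddlenlp.utils.safetensors:

    import numpy as np

    def trailing_spans(shape, itemsize):
        # Byte span covered by one index step in each dimension of a
        # C-contiguous (row-major) tensor, accumulated from the innermost
        # dimension outwards -- the same back-to-front walk the patched
        # __getitem__ performs.
        spans = []
        span = itemsize
        for dim in shape[::-1]:   # the shape must be reversed to match the loop order
            spans.append(span)
            span *= dim           # i.e. shape[::-1][i], not the forward shape[i]
        return spans[::-1]

    x = np.zeros((2, 3, 4), dtype=np.float32)   # itemsize is 4 bytes
    assert trailing_spans(x.shape, x.itemsize) == list(x.strides)   # [48, 16, 4]

Multiplying by the forward shape in that accumulation would give [24, 8, 4] for this example, and the full-dimension check would compare against the wrong axis whenever the reversed shape differs from the forward one; that mismatch appears to be what the earlier test additions (shapes such as (1, 1, 10) and (1, 1, 1, 10)) are intended to catch.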