From 0da8c5d121139c3dd900260b48e9b242d7335608 Mon Sep 17 00:00:00 2001
From: lugimzzz <63761690+lugimzzz@users.noreply.github.com>
Date: Fri, 12 Apr 2024 13:39:31 +0800
Subject: [PATCH 01/27] cherry-pick add scaling (#8264)

---
 paddlenlp/peft/lora/lora_config.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/paddlenlp/peft/lora/lora_config.py b/paddlenlp/peft/lora/lora_config.py
index 3a0897bc4892..12e3b929ed7e 100644
--- a/paddlenlp/peft/lora/lora_config.py
+++ b/paddlenlp/peft/lora/lora_config.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import json
+import math
 import os
 from dataclasses import asdict, dataclass, field
 from typing import List, Optional, Union
@@ -94,6 +95,15 @@ def __post_init__(self):
             )
             self.use_quick_lora = False
 
+    @property
+    def scaling(self):
+        if not self.rslora and not self.pissa:
+            return self.lora_alpha / self.r
+        elif self.pissa:
+            return 1.0
+        else:
+            return self.lora_alpha / math.sqrt(self.r)
+
     @property
     def __dict__(self):
         return asdict(self)
@@ -114,6 +124,7 @@ def save_pretrained(self, save_directory):
         os.makedirs(save_directory, exist_ok=True)
 
         output_dict = self.__dict__
+        output_dict["scaling"] = self.scaling
         output_path = os.path.join(save_directory, LORA_CONFIG_NAME)
 
         # save it
@@ -136,7 +147,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
             raise ValueError(f"Can't find lora_config.json at '{pretrained_model_name_or_path}'")
 
         loaded_attributes = cls.from_json_file(config_file)
-
+        loaded_attributes.pop("scaling", None)
         config = cls(**kwargs)
 
         for key, value in loaded_attributes.items():

From 4749af30726f39d4e733de023599e9bc5b438f87 Mon Sep 17 00:00:00 2001
From: w5688414
Date: Fri, 12 Apr 2024 16:42:27 +0800
Subject: [PATCH 02/27] Upgrade paddlenlp to 2.8.0 (#8266)

* Upgrade paddlenlp to 2.8.0
* fix try import
* Add regex to requirements.txt
---
 paddlenlp/__init__.py | 2 +-
 requirements.txt      | 1 +
 setup.py              | 2 +-
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/paddlenlp/__init__.py b/paddlenlp/__init__.py
index af011febd291..e3cd7e1c5f75 100644
--- a/paddlenlp/__init__.py
+++ b/paddlenlp/__init__.py
@@ -18,7 +18,7 @@
 
 PADDLENLP_STABLE_VERSION = "PADDLENLP_STABLE_VERSION"
 
-__version__ = "2.7.1.post"
+__version__ = "2.8.0.post"
 if os.getenv(PADDLENLP_STABLE_VERSION):
     __version__ = __version__.replace(".post", "")
 
diff --git a/requirements.txt b/requirements.txt
index 4b676d900563..71fee6049318 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -23,3 +23,4 @@ safetensors
 tool_helpers
 aistudio-sdk>=0.1.3
 jinja2
+regex
diff --git a/setup.py b/setup.py
index ab8f03a292e1..0723cfc28cb4 100644
--- a/setup.py
+++ b/setup.py
@@ -109,7 +109,7 @@ def show():
         f.write(content)
 
 
-__version__ = "2.7.1.post"
+__version__ = "2.8.0.post"
 if os.getenv(PADDLENLP_STABLE_VERSION):
     __version__ = __version__.replace(".post", "")
 

From 6c1f4493654d42f3ef7eb604e5839f6c8bfda855 Mon Sep 17 00:00:00 2001
From: Siming Dai <908660116@qq.com>
Date: Mon, 15 Apr 2024 16:45:23 +0800
Subject: [PATCH 03/27] [BugFix] Try except sequence parallel utils (#8189) (#8274)

* try except sp
* fix sp import
---
 .../gpt/dygraph/hybrid_model.py               | 17 +++++++------
 .../models/language_model/language_module.py  |  9 ++++---
 paddlenlp/transformers/__init__.py            | 24 +++++++++++--------
 paddlenlp/transformers/gpt/modeling.py        | 18 ++++++++------
 paddlenlp/transformers/gpt/modeling_auto.py   | 12 ++++++----
 paddlenlp/transformers/gpt/modeling_pp.py     | 10 +++++---
 paddlenlp/transformers/llama/modeling.py      | 17 +++++++------
.../mc2_seqence_parallel_linear.py | 12 ++++++---- paddlenlp/transformers/mixtral/modeling.py | 17 +++++++------ 9 files changed, 84 insertions(+), 52 deletions(-) diff --git a/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py b/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py index f47d800c5f15..f4c1ee8d46a7 100644 --- a/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py +++ b/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py @@ -48,13 +48,16 @@ MinLengthLogitsProcessor, RepetitionPenaltyLogitsProcessor, ) -from paddle.distributed.fleet.utils.sequence_parallel_utils import ( - ColumnSequenceParallelLinear, - GatherOp, - RowSequenceParallelLinear, - ScatterOp, - mark_as_sequence_parallel_parameter, -) +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + ColumnSequenceParallelLinear, + GatherOp, + RowSequenceParallelLinear, + ScatterOp, + mark_as_sequence_parallel_parameter, + ) +except: + pass from paddlenlp.transformers.segment_parallel_utils import ReshardLayer diff --git a/model_zoo/gpt-3/ppfleetx/models/language_model/language_module.py b/model_zoo/gpt-3/ppfleetx/models/language_model/language_module.py index 1a73a35982ff..c86fa300e352 100644 --- a/model_zoo/gpt-3/ppfleetx/models/language_model/language_module.py +++ b/model_zoo/gpt-3/ppfleetx/models/language_model/language_module.py @@ -24,9 +24,12 @@ from ppfleetx.core.module.basic_module import BasicModule from ppfleetx.data.tokenizers import GPTTokenizer from ppfleetx.distributed.apis import env -from paddle.distributed.fleet.utils.sequence_parallel_utils import ( - register_sequence_parallel_allreduce_hooks, -) +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + register_sequence_parallel_allreduce_hooks, + ) +except: + pass from ppfleetx.utils.log import logger # TODO(haohongxiang): to solve the problem of cross-reference diff --git a/paddlenlp/transformers/__init__.py b/paddlenlp/transformers/__init__.py index 2ee9d7733f41..05fa5775399e 100644 --- a/paddlenlp/transformers/__init__.py +++ b/paddlenlp/transformers/__init__.py @@ -29,16 +29,20 @@ from .feature_extraction_utils import BatchFeature, FeatureExtractionMixin from .image_processing_utils import ImageProcessingMixin from .attention_utils import create_bigbird_rand_mask_idx_list -from paddle.distributed.fleet.utils.sequence_parallel_utils import ( - GatherOp, - ScatterOp, - AllGatherOp, - ReduceScatterOp, - ColumnSequenceParallelLinear, - RowSequenceParallelLinear, - mark_as_sequence_parallel_parameter, - register_sequence_parallel_allreduce_hooks, -) + +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + GatherOp, + ScatterOp, + AllGatherOp, + ReduceScatterOp, + ColumnSequenceParallelLinear, + RowSequenceParallelLinear, + mark_as_sequence_parallel_parameter, + register_sequence_parallel_allreduce_hooks, + ) +except: + pass from .export import export_model # isort: split diff --git a/paddlenlp/transformers/gpt/modeling.py b/paddlenlp/transformers/gpt/modeling.py index 50cfc892d336..8c066431979f 100644 --- a/paddlenlp/transformers/gpt/modeling.py +++ b/paddlenlp/transformers/gpt/modeling.py @@ -29,13 +29,17 @@ from paddle.distributed import fleet from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker from paddle.distributed.fleet.utils import recompute -from paddle.distributed.fleet.utils.sequence_parallel_utils import ( - ColumnSequenceParallelLinear, - GatherOp, - 
RowSequenceParallelLinear, - ScatterOp, - mark_as_sequence_parallel_parameter, -) + +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + ColumnSequenceParallelLinear, + GatherOp, + RowSequenceParallelLinear, + ScatterOp, + mark_as_sequence_parallel_parameter, + ) +except: + pass from paddle.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from paddle.utils import try_import diff --git a/paddlenlp/transformers/gpt/modeling_auto.py b/paddlenlp/transformers/gpt/modeling_auto.py index 255763be395f..2e508339ab39 100644 --- a/paddlenlp/transformers/gpt/modeling_auto.py +++ b/paddlenlp/transformers/gpt/modeling_auto.py @@ -30,10 +30,14 @@ from paddle.distributed import fleet from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker from paddle.distributed.fleet.utils import recompute -from paddle.distributed.fleet.utils.sequence_parallel_utils import ( - ScatterOp, - mark_as_sequence_parallel_parameter, -) + +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + ScatterOp, + mark_as_sequence_parallel_parameter, + ) +except: + pass from ...utils.converter import StateDictNameMapping from .. import PretrainedModel, register_base_model diff --git a/paddlenlp/transformers/gpt/modeling_pp.py b/paddlenlp/transformers/gpt/modeling_pp.py index 3ec6b004edee..cd3dce018378 100644 --- a/paddlenlp/transformers/gpt/modeling_pp.py +++ b/paddlenlp/transformers/gpt/modeling_pp.py @@ -19,9 +19,13 @@ SharedLayerDesc, ) from paddle.distributed.fleet.utils import recompute -from paddle.distributed.fleet.utils.sequence_parallel_utils import ( - mark_as_sequence_parallel_parameter, -) + +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + mark_as_sequence_parallel_parameter, + ) +except: + pass from paddlenlp.transformers.model_utils import PipelinePretrainedModel diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index d4da1b195a94..b0b08c30241a 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -45,13 +45,16 @@ def swiglu(x, y=None): return F.silu(x) * y -from paddle.distributed.fleet.utils.sequence_parallel_utils import ( - ColumnSequenceParallelLinear, - GatherOp, - RowSequenceParallelLinear, - ScatterOp, - mark_as_sequence_parallel_parameter, -) +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + ColumnSequenceParallelLinear, + GatherOp, + RowSequenceParallelLinear, + ScatterOp, + mark_as_sequence_parallel_parameter, + ) +except: + pass from paddle.utils import try_import from paddlenlp.transformers.conversion_utils import ( diff --git a/paddlenlp/transformers/mc2_seqence_parallel_linear.py b/paddlenlp/transformers/mc2_seqence_parallel_linear.py index 7d669833e690..c39a78cc6252 100644 --- a/paddlenlp/transformers/mc2_seqence_parallel_linear.py +++ b/paddlenlp/transformers/mc2_seqence_parallel_linear.py @@ -23,10 +23,14 @@ from paddle import distributed as dist from paddle.autograd import PyLayer -from paddle.distributed.fleet.utils.sequence_parallel_utils import ( - ColumnSequenceParallelLinear, - RowSequenceParallelLinear, -) + +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + ColumnSequenceParallelLinear, + RowSequenceParallelLinear, + ) +except: + pass __all_gather_recomputation__ = False if int(os.getenv("MC2_Recompute", 0)): diff --git a/paddlenlp/transformers/mixtral/modeling.py b/paddlenlp/transformers/mixtral/modeling.py index 592f9a47847a..7a8254d6877c 
100644 --- a/paddlenlp/transformers/mixtral/modeling.py +++ b/paddlenlp/transformers/mixtral/modeling.py @@ -33,13 +33,16 @@ except ImportError: fused_rotary_position_embedding = None -from paddle.distributed.fleet.utils.sequence_parallel_utils import ( - ColumnSequenceParallelLinear, - GatherOp, - RowSequenceParallelLinear, - ScatterOp, - mark_as_sequence_parallel_parameter, -) +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + ColumnSequenceParallelLinear, + GatherOp, + RowSequenceParallelLinear, + ScatterOp, + mark_as_sequence_parallel_parameter, + ) +except: + pass from paddlenlp.transformers.conversion_utils import ( StateDictNameMapping, From dc5a6af3d33dc09890c00d900dd2bd8f0bf617c4 Mon Sep 17 00:00:00 2001 From: gongenlei Date: Mon, 22 Apr 2024 01:00:19 +0000 Subject: [PATCH 04/27] save_model: checkpoint_done --> model_done --- paddlenlp/trainer/trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py index be1af93c50fd..d8487364076b 100644 --- a/paddlenlp/trainer/trainer.py +++ b/paddlenlp/trainer/trainer.py @@ -2096,10 +2096,10 @@ def save_model(self, output_dir: Optional[str] = None, merge_tensor_parallel: Op if not self.is_in_train: self.args.unified_checkpoint_config = unified_checkpoint_config_backup if strtobool(os.getenv("FLAG_LLM_PDC", "False")): - # save checkpoint_done file to ensure checkpoint is complete + # save model_done file to ensure model is complete if self.args.should_save_model_state and self.args.should_save: # For ckpt integrity - paddle.save(self.state.global_step, os.path.join(output_dir, ".checkpoint_done")) + paddle.save(self.state.global_step, os.path.join(output_dir, ".model_done")) def _save_checkpoint(self, model, metrics=None): # assert unwrap_model(model) is self.model, "internal model should be a reference to self.model" From 7314063128336138191f04c332a34929901f810c Mon Sep 17 00:00:00 2001 From: yujun <573009727@qq.com> Date: Tue, 23 Apr 2024 15:29:22 +0800 Subject: [PATCH 05/27] fix import --- paddlenlp/peft/lora/lora_layers.py | 272 ++++++++++++++++++++++++++++- paddlenlp/peft/lora/lora_model.py | 73 ++++++++ 2 files changed, 344 insertions(+), 1 deletion(-) diff --git a/paddlenlp/peft/lora/lora_layers.py b/paddlenlp/peft/lora/lora_layers.py index e0c79c47a87a..66a0d0c0f520 100644 --- a/paddlenlp/peft/lora/lora_layers.py +++ b/paddlenlp/peft/lora/lora_layers.py @@ -27,11 +27,44 @@ from .lora_quick_layers import quick_lora -if "npu" in paddle.device.get_all_custom_device_type(): + +def is_mc2_valid(): + return "npu" in paddle.device.get_all_custom_device_type() and int(os.getenv("MC2", "0")) + + +if is_mc2_valid(): + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + MC2ColumnSeqParallelLinear, + MC2RowSeqParallelLinear, + ) + from .mc2_lora_npu import MC2LoRaColumnParallelLinear, MC2LoRaRowParallelLinear else: MC2LoRaRowParallelLinear = None MC2LoRaColumnParallelLinear = None + MC2ColumnSeqParallelLinear = None + MC2RowSeqParallelLinear = None + + +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + AllGatherOp, + ColumnSequenceParallelLinear, + ReduceScatterOp, + RowSequenceParallelLinear, + mark_as_sequence_parallel_parameter, + ) +except: + + class ColumnSequenceParallelLinear: + pass + + class RowSequenceParallelLinear: + pass + + AllGatherOp = None + ReduceScatterOp = None + mark_as_sequence_parallel_parameter = None class LoRALinear(nn.Linear): @@ -298,6 +331,123 @@ def 
extra_repr(self): return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}" +class RowSequenceParallelLoRALinear(RowSequenceParallelLinear): + def __init__( + self, + in_features: int, + out_features: int, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + rslora: bool = False, + lora_plus_scale: float = 1.0, + merge_weights: bool = True, + use_quick_lora: bool = False, + pissa: bool = False, + **kwargs + ): + RowSequenceParallelLinear.__init__(self, in_features, out_features, **kwargs) + if not isinstance(r, int) or r <= 0: + raise ValueError("Lora rank r should be a positive integer") + if pissa: + raise ValueError("Pissa is not supported in model parallel by now") + self.r = r + self.lora_alpha = lora_alpha + # Optional dropout + if lora_dropout > 0.0: + self.lora_dropout = nn.Dropout(p=lora_dropout) + else: + self.lora_dropout = lambda x: x + # Mark the weight as unmerged + self.merged = False + self.merge_weights = merge_weights + + # compatible + self.name = self._name + + # Actual trainable parameters + self.lora_A = self.create_parameter( + shape=[self.input_size_per_partition, r], + dtype=self._dtype, + is_bias=False, + attr=paddle.ParamAttr( + initializer=nn.initializer.KaimingUniform(negative_slope=math.sqrt(5), nonlinearity="leaky_relu") + ), + ) + self.lora_B = self.create_parameter( + shape=[r, self.out_features], + dtype=self._dtype, + is_bias=False, + attr=paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(value=0.0), + learning_rate=lora_plus_scale, + ), + ) + + self.lora_A.is_distributed = True + self.lora_A.split_axis = 0 + self.lora_B.is_distributed = False + mark_as_sequence_parallel_parameter(self.lora_B) + if not rslora: + self.scaling = self.lora_alpha / self.r + else: + self.scaling = self.lora_alpha / math.sqrt(self.r) + + # Freezing the pre-trained weight matrix + self.weight.stop_gradient = True + self._use_quick_lora = use_quick_lora and lora_dropout == 0.0 + + @property + def use_quick_lora(self): + # TODO(@gexiao): support qlora + return False # self._use_quick_lora and self.training and not self.merged + + def train(self): + super().train() + if self.merge_weights and self.merged: + # Make sure that the weights are not merged + new_weight = self.weight - self.lora_A @ self.lora_B * self.scaling + self.weight.set_value(new_weight) + self.merged = False + + def eval(self): + super().eval() + if self.merge_weights and not self.merged: + # Merge the weights and mark it + new_weight = self.weight + self.lora_A @ self.lora_B * self.scaling + self.weight.set_value(new_weight) + self.merged = True + + def forward(self, x: paddle.Tensor): + if not self.input_is_parallel: + input_mp = mp_ops._c_split(x, group=self.model_parallel_group) + else: + input_mp = x + + if not is_mc2_valid(): + output_parallel = self.linear(input_mp, self.weight, name=self._name) + output_ = ReduceScatterOp.apply(output_parallel) + result_mp = output_ + self.bias if self.bias is not None else output_ + else: + output_ = MC2RowSeqParallelLinear.apply(input_mp, self.weight, self.model_parallel_group) + result_mp = output_ + self.bias if self.bias is not None else output_ + + if not self.merged: + input_mp = self.lora_dropout(input_mp) + if not is_mc2_valid(): + input_mp = input_mp @ self.lora_A + input_mp = ReduceScatterOp.apply(input_mp) + else: + input_mp = MC2RowSeqParallelLinear.apply(input_mp, self.lora_A, self.model_parallel_group) + delta_mp = (input_mp @ self.lora_B) * self.scaling + result_mp += delta_mp + return 
result_mp + + def extra_repr(self): + name = f", name={self.name}" if self.name else "" + return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}" + + class ColumnParallelLoRALinear(ColumnParallelLinear): def __init__( self, @@ -428,6 +578,126 @@ def extra_repr(self): return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}" +class ColumnSequenceParallelLoRALinear(ColumnSequenceParallelLinear): + def __init__( + self, + in_features: int, + out_features: int, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + rslora: bool = False, + lora_plus_scale: float = 1.0, + merge_weights: bool = True, + lora_A_weight_attr: Optional[paddle.ParamAttr] = None, + use_quick_lora: bool = False, + pissa: bool = False, + **kwargs + ): + ColumnSequenceParallelLinear.__init__(self, in_features, out_features, **kwargs) + if not isinstance(r, int) or r <= 0: + raise ValueError("Lora rank r should be a positive integer") + if pissa: + raise ValueError("Pissa is not supported in model parallel by now") + self.r = r + self.lora_alpha = lora_alpha + # Optional dropout + if lora_dropout > 0.0: + self.lora_dropout = nn.Dropout(p=lora_dropout) + else: + self.lora_dropout = lambda x: x + # Mark the weight as unmerged + self.merged = False + self.merge_weights = merge_weights + + # compatible + self.name = self._name + + # Actual trainable parameters + self.lora_A = self.create_parameter( + shape=[in_features, r], + dtype=self._dtype, + is_bias=False, + attr=lora_A_weight_attr, + ) + self.lora_A.is_distributed = False + mark_as_sequence_parallel_parameter(self.lora_A) + + self.lora_B = self.create_parameter( + shape=[r, self.output_size_per_partition], + dtype=self._dtype, + is_bias=False, + attr=paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(value=0.0), + learning_rate=lora_plus_scale, + ), + ) + + self.lora_B.is_distributed = True + self.lora_B.split_axis = 1 + if not rslora: + self.scaling = self.lora_alpha / self.r + else: + self.scaling = self.lora_alpha / math.sqrt(self.r) + + # Freezing the pre-trained weight matrix + self.weight.stop_gradient = True + self._use_quick_lora = use_quick_lora and lora_dropout == 0.0 + + @property + def use_quick_lora(self): + # TODO(@gexiao): support qlora + return False # self._use_quick_lora and self.training and not self.merged + + def train(self): + super().train() + if self.merge_weights and self.merged: + # Make sure that the weights are not merged + new_weight = self.weight - self.lora_A @ self.lora_B * self.scaling + self.weight.set_value(new_weight) + self.merged = False + + def eval(self): + super().eval() + if self.merge_weights and not self.merged: + # Merge the weights and mark it + new_weight = self.weight + self.lora_A @ self.lora_B * self.scaling + self.weight.set_value(new_weight) + self.merged = True + + def forward(self, x: paddle.Tensor): + if not is_mc2_valid(): + if self.is_mp: + input_parallel = AllGatherOp.apply(x) + else: + input_parallel = x + result_mp = self.linear(input_parallel, self.weight, self.bias, name=self._name) + else: + result_mp = MC2ColumnSeqParallelLinear.apply(x, self.weight, self.model_parallel_group) + if self.bias is not None: + result_mp += self.bias + + if not self.merged: + input_a = self.lora_dropout(x) @ self.lora_A + if not is_mc2_valid(): + input_a = AllGatherOp.apply(input_a) + delta_mp = (input_a @ self.lora_B) * self.scaling + else: + input_a = MC2ColumnSeqParallelLinear.apply(input_a, self.lora_B, 
self.model_parallel_group) + delta_mp = input_a * self.scaling + result_mp += delta_mp + + if self.gather_output and self.is_mp: + result = mp_ops._c_concat(result_mp, group=self.model_parallel_group) + else: + result = result_mp + return result + + def extra_repr(self): + name = f", name={self.name}" if self.name else "" + return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}" + + class LoRAMergedLinear(nn.Linear): # LoRA implemented in a dense layer with merged linear weights for q, k, v def __init__( diff --git a/paddlenlp/peft/lora/lora_model.py b/paddlenlp/peft/lora/lora_model.py index 1bbd0284823c..57d3bb3f2205 100644 --- a/paddlenlp/peft/lora/lora_model.py +++ b/paddlenlp/peft/lora/lora_model.py @@ -48,10 +48,12 @@ from .lora_layers import ( ColumnParallelLoRALinear, ColumnParallelLoRAMergedLinear, + ColumnSequenceParallelLoRALinear, LoRAConv2D, LoRALinear, LoRAMergedLinear, RowParallelLoRALinear, + RowSequenceParallelLoRALinear, ) try: @@ -73,6 +75,19 @@ ColumnParallelQuantizationLoRALinear = None RowParallelQuantizationLoRALinear = None +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + ColumnSequenceParallelLinear, + RowSequenceParallelLinear, + ) +except: + + class ColumnSequenceParallelLinear: + pass + + class RowSequenceParallelLinear: + pass + class LoRAModel(nn.Layer): # TODO:lugimzzz support restore in following PR @@ -454,6 +469,60 @@ def _find_and_replace_module(self, model, module_name, lora_config, enable_lora) # Lora column parallel will spilt lora A matrix self.add_lora_split_mapping(module_name + ".lora_A", is_column=False) + # for lora qat + if self.lora_config.do_qat: + self.add_lora_split_mapping(module_name + ".weight_quanter._scale", is_column=False) + self.add_lora_split_mapping(module_name + ".activation_quanter._scale", is_column=False) + self.add_lora_split_mapping(module_name + ".activation_quanter.quanter._scale", is_column=False) + elif isinstance(module, ColumnSequenceParallelLinear): + # recover the original output_features + output_features = module.weight.shape[1] * module.world_size + lora_module = ColumnSequenceParallelLoRALinear( + in_features=module.weight.shape[0], + out_features=output_features, + gather_output=module.gather_output, + has_bias=module.bias is not None, + r=lora_config.r, + lora_alpha=lora_config.lora_alpha, + lora_dropout=lora_config.lora_dropout, + rslora=lora_config.rslora, + lora_plus_scale=lora_config.lora_plus_scale, + pissa=lora_config.pissa, + merge_weights=lora_config.merge_weights, + lora_A_weight_attr=paddle.ParamAttr( + initializer=nn.initializer.KaimingUniform( + negative_slope=math.sqrt(5), nonlinearity="leaky_relu" + ) + ), + use_quick_lora=lora_config.use_quick_lora, + ) + # Lora column parallel will spilt lora B matrix + self.add_lora_split_mapping(module_name + ".lora_B", is_column=True) + + # for lora qat + if self.lora_config.do_qat: + self.add_lora_split_mapping(module_name + ".weight_quanter._scale", is_column=True) + self.add_lora_split_mapping(module_name + ".activation_quanter._scale", is_column=False) + self.add_lora_split_mapping(module_name + ".activation_quanter.quanter._scale", is_column=False) + elif isinstance(module, RowSequenceParallelLinear): + # recover the original output_features + lora_module = RowSequenceParallelLoRALinear( + in_features=module.weight.shape[0] * module.world_size, + out_features=module.weight.shape[1], + has_bias=module.bias is not None, + input_is_parallel=module.input_is_parallel, + 
r=lora_config.r, + lora_alpha=lora_config.lora_alpha, + lora_dropout=lora_config.lora_dropout, + rslora=lora_config.rslora, + lora_plus_scale=lora_config.lora_plus_scale, + pissa=lora_config.pissa, + merge_weights=lora_config.merge_weights, + use_quick_lora=lora_config.use_quick_lora, + ) + # Lora column parallel will spilt lora A matrix + self.add_lora_split_mapping(module_name + ".lora_A", is_column=False) + # for lora qat if self.lora_config.do_qat: self.add_lora_split_mapping(module_name + ".weight_quanter._scale", is_column=False) @@ -597,6 +666,8 @@ def mark_only_lora_as_trainable(self) -> None: or isinstance(layer, LoRAConv2D) or isinstance(layer, ColumnParallelLoRALinear) or isinstance(layer, RowParallelLoRALinear) + or isinstance(layer, ColumnSequenceParallelLoRALinear) + or isinstance(layer, RowSequenceParallelLoRALinear) or isinstance(layer, LoRAMergedLinear) or isinstance(layer, ColumnParallelLoRAMergedLinear) or (QuantizationLoRALinear is not None and isinstance(layer, QuantizationLoRALinear)) @@ -684,9 +755,11 @@ def restore_original_model(self): self._find_and_restore_module(layer_name) elif ( isinstance(layer, ColumnParallelLoRALinear) + or isinstance(layer, ColumnSequenceParallelLoRALinear) or isinstance(layer, LoRAConv2D) or isinstance(layer, ColumnParallelLoRAMergedLinear) or isinstance(layer, RowParallelLoRALinear) + or isinstance(layer, RowSequenceParallelLoRALinear) or (QuantizationLoRALinear is not None and isinstance(layer, QuantizationLoRALinear)) or ( ColumnParallelQuantizationLoRALinear is not None From d4062e576dfa9b77063a273b2410051411f279b6 Mon Sep 17 00:00:00 2001 From: yujun <573009727@qq.com> Date: Tue, 23 Apr 2024 15:30:50 +0800 Subject: [PATCH 06/27] Revert "fix import" This reverts commit 7314063128336138191f04c332a34929901f810c. 
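
PATCH 03 and PATCH 05 (reverted by this patch, with a reworked version following in PATCH 09) rely on the same guarded-import idiom: try to import the sequence-parallel helpers from `paddle.distributed.fleet.utils.sequence_parallel_utils`, and fall back to placeholder definitions so that later `isinstance` checks and subclass declarations still work on Paddle builds that lack the module. A minimal, self-contained sketch of that idiom follows; the stub bodies are illustrative placeholders, not the exact fallbacks used in the diffs (which use a bare `except:` and in places assign `None`):

```python
# Sketch of the guarded-import idiom from PATCH 03/05: if the sequence-parallel
# utilities are unavailable, define placeholders so that isinstance() checks and
# subclass definitions elsewhere do not raise at import time.
try:
    from paddle.distributed.fleet.utils.sequence_parallel_utils import (
        ColumnSequenceParallelLinear,
        RowSequenceParallelLinear,
        mark_as_sequence_parallel_parameter,
    )
except ImportError:

    class ColumnSequenceParallelLinear:  # placeholder; never instantiated
        pass

    class RowSequenceParallelLinear:  # placeholder; never instantiated
        pass

    def mark_as_sequence_parallel_parameter(param):  # no-op placeholder
        return param
```
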
--- paddlenlp/peft/lora/lora_layers.py | 272 +---------------------------- paddlenlp/peft/lora/lora_model.py | 73 -------- 2 files changed, 1 insertion(+), 344 deletions(-) diff --git a/paddlenlp/peft/lora/lora_layers.py b/paddlenlp/peft/lora/lora_layers.py index 66a0d0c0f520..e0c79c47a87a 100644 --- a/paddlenlp/peft/lora/lora_layers.py +++ b/paddlenlp/peft/lora/lora_layers.py @@ -27,44 +27,11 @@ from .lora_quick_layers import quick_lora - -def is_mc2_valid(): - return "npu" in paddle.device.get_all_custom_device_type() and int(os.getenv("MC2", "0")) - - -if is_mc2_valid(): - from paddle.distributed.fleet.utils.sequence_parallel_utils import ( - MC2ColumnSeqParallelLinear, - MC2RowSeqParallelLinear, - ) - +if "npu" in paddle.device.get_all_custom_device_type(): from .mc2_lora_npu import MC2LoRaColumnParallelLinear, MC2LoRaRowParallelLinear else: MC2LoRaRowParallelLinear = None MC2LoRaColumnParallelLinear = None - MC2ColumnSeqParallelLinear = None - MC2RowSeqParallelLinear = None - - -try: - from paddle.distributed.fleet.utils.sequence_parallel_utils import ( - AllGatherOp, - ColumnSequenceParallelLinear, - ReduceScatterOp, - RowSequenceParallelLinear, - mark_as_sequence_parallel_parameter, - ) -except: - - class ColumnSequenceParallelLinear: - pass - - class RowSequenceParallelLinear: - pass - - AllGatherOp = None - ReduceScatterOp = None - mark_as_sequence_parallel_parameter = None class LoRALinear(nn.Linear): @@ -331,123 +298,6 @@ def extra_repr(self): return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}" -class RowSequenceParallelLoRALinear(RowSequenceParallelLinear): - def __init__( - self, - in_features: int, - out_features: int, - r: int = 0, - lora_alpha: int = 1, - lora_dropout: float = 0.0, - rslora: bool = False, - lora_plus_scale: float = 1.0, - merge_weights: bool = True, - use_quick_lora: bool = False, - pissa: bool = False, - **kwargs - ): - RowSequenceParallelLinear.__init__(self, in_features, out_features, **kwargs) - if not isinstance(r, int) or r <= 0: - raise ValueError("Lora rank r should be a positive integer") - if pissa: - raise ValueError("Pissa is not supported in model parallel by now") - self.r = r - self.lora_alpha = lora_alpha - # Optional dropout - if lora_dropout > 0.0: - self.lora_dropout = nn.Dropout(p=lora_dropout) - else: - self.lora_dropout = lambda x: x - # Mark the weight as unmerged - self.merged = False - self.merge_weights = merge_weights - - # compatible - self.name = self._name - - # Actual trainable parameters - self.lora_A = self.create_parameter( - shape=[self.input_size_per_partition, r], - dtype=self._dtype, - is_bias=False, - attr=paddle.ParamAttr( - initializer=nn.initializer.KaimingUniform(negative_slope=math.sqrt(5), nonlinearity="leaky_relu") - ), - ) - self.lora_B = self.create_parameter( - shape=[r, self.out_features], - dtype=self._dtype, - is_bias=False, - attr=paddle.ParamAttr( - initializer=paddle.nn.initializer.Constant(value=0.0), - learning_rate=lora_plus_scale, - ), - ) - - self.lora_A.is_distributed = True - self.lora_A.split_axis = 0 - self.lora_B.is_distributed = False - mark_as_sequence_parallel_parameter(self.lora_B) - if not rslora: - self.scaling = self.lora_alpha / self.r - else: - self.scaling = self.lora_alpha / math.sqrt(self.r) - - # Freezing the pre-trained weight matrix - self.weight.stop_gradient = True - self._use_quick_lora = use_quick_lora and lora_dropout == 0.0 - - @property - def use_quick_lora(self): - # TODO(@gexiao): support qlora - return False # 
self._use_quick_lora and self.training and not self.merged - - def train(self): - super().train() - if self.merge_weights and self.merged: - # Make sure that the weights are not merged - new_weight = self.weight - self.lora_A @ self.lora_B * self.scaling - self.weight.set_value(new_weight) - self.merged = False - - def eval(self): - super().eval() - if self.merge_weights and not self.merged: - # Merge the weights and mark it - new_weight = self.weight + self.lora_A @ self.lora_B * self.scaling - self.weight.set_value(new_weight) - self.merged = True - - def forward(self, x: paddle.Tensor): - if not self.input_is_parallel: - input_mp = mp_ops._c_split(x, group=self.model_parallel_group) - else: - input_mp = x - - if not is_mc2_valid(): - output_parallel = self.linear(input_mp, self.weight, name=self._name) - output_ = ReduceScatterOp.apply(output_parallel) - result_mp = output_ + self.bias if self.bias is not None else output_ - else: - output_ = MC2RowSeqParallelLinear.apply(input_mp, self.weight, self.model_parallel_group) - result_mp = output_ + self.bias if self.bias is not None else output_ - - if not self.merged: - input_mp = self.lora_dropout(input_mp) - if not is_mc2_valid(): - input_mp = input_mp @ self.lora_A - input_mp = ReduceScatterOp.apply(input_mp) - else: - input_mp = MC2RowSeqParallelLinear.apply(input_mp, self.lora_A, self.model_parallel_group) - delta_mp = (input_mp @ self.lora_B) * self.scaling - result_mp += delta_mp - return result_mp - - def extra_repr(self): - name = f", name={self.name}" if self.name else "" - return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}" - - class ColumnParallelLoRALinear(ColumnParallelLinear): def __init__( self, @@ -578,126 +428,6 @@ def extra_repr(self): return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}" -class ColumnSequenceParallelLoRALinear(ColumnSequenceParallelLinear): - def __init__( - self, - in_features: int, - out_features: int, - r: int = 0, - lora_alpha: int = 1, - lora_dropout: float = 0.0, - rslora: bool = False, - lora_plus_scale: float = 1.0, - merge_weights: bool = True, - lora_A_weight_attr: Optional[paddle.ParamAttr] = None, - use_quick_lora: bool = False, - pissa: bool = False, - **kwargs - ): - ColumnSequenceParallelLinear.__init__(self, in_features, out_features, **kwargs) - if not isinstance(r, int) or r <= 0: - raise ValueError("Lora rank r should be a positive integer") - if pissa: - raise ValueError("Pissa is not supported in model parallel by now") - self.r = r - self.lora_alpha = lora_alpha - # Optional dropout - if lora_dropout > 0.0: - self.lora_dropout = nn.Dropout(p=lora_dropout) - else: - self.lora_dropout = lambda x: x - # Mark the weight as unmerged - self.merged = False - self.merge_weights = merge_weights - - # compatible - self.name = self._name - - # Actual trainable parameters - self.lora_A = self.create_parameter( - shape=[in_features, r], - dtype=self._dtype, - is_bias=False, - attr=lora_A_weight_attr, - ) - self.lora_A.is_distributed = False - mark_as_sequence_parallel_parameter(self.lora_A) - - self.lora_B = self.create_parameter( - shape=[r, self.output_size_per_partition], - dtype=self._dtype, - is_bias=False, - attr=paddle.ParamAttr( - initializer=paddle.nn.initializer.Constant(value=0.0), - learning_rate=lora_plus_scale, - ), - ) - - self.lora_B.is_distributed = True - self.lora_B.split_axis = 1 - if not rslora: - self.scaling = self.lora_alpha / self.r - else: - self.scaling = 
self.lora_alpha / math.sqrt(self.r) - - # Freezing the pre-trained weight matrix - self.weight.stop_gradient = True - self._use_quick_lora = use_quick_lora and lora_dropout == 0.0 - - @property - def use_quick_lora(self): - # TODO(@gexiao): support qlora - return False # self._use_quick_lora and self.training and not self.merged - - def train(self): - super().train() - if self.merge_weights and self.merged: - # Make sure that the weights are not merged - new_weight = self.weight - self.lora_A @ self.lora_B * self.scaling - self.weight.set_value(new_weight) - self.merged = False - - def eval(self): - super().eval() - if self.merge_weights and not self.merged: - # Merge the weights and mark it - new_weight = self.weight + self.lora_A @ self.lora_B * self.scaling - self.weight.set_value(new_weight) - self.merged = True - - def forward(self, x: paddle.Tensor): - if not is_mc2_valid(): - if self.is_mp: - input_parallel = AllGatherOp.apply(x) - else: - input_parallel = x - result_mp = self.linear(input_parallel, self.weight, self.bias, name=self._name) - else: - result_mp = MC2ColumnSeqParallelLinear.apply(x, self.weight, self.model_parallel_group) - if self.bias is not None: - result_mp += self.bias - - if not self.merged: - input_a = self.lora_dropout(x) @ self.lora_A - if not is_mc2_valid(): - input_a = AllGatherOp.apply(input_a) - delta_mp = (input_a @ self.lora_B) * self.scaling - else: - input_a = MC2ColumnSeqParallelLinear.apply(input_a, self.lora_B, self.model_parallel_group) - delta_mp = input_a * self.scaling - result_mp += delta_mp - - if self.gather_output and self.is_mp: - result = mp_ops._c_concat(result_mp, group=self.model_parallel_group) - else: - result = result_mp - return result - - def extra_repr(self): - name = f", name={self.name}" if self.name else "" - return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}" - - class LoRAMergedLinear(nn.Linear): # LoRA implemented in a dense layer with merged linear weights for q, k, v def __init__( diff --git a/paddlenlp/peft/lora/lora_model.py b/paddlenlp/peft/lora/lora_model.py index 57d3bb3f2205..1bbd0284823c 100644 --- a/paddlenlp/peft/lora/lora_model.py +++ b/paddlenlp/peft/lora/lora_model.py @@ -48,12 +48,10 @@ from .lora_layers import ( ColumnParallelLoRALinear, ColumnParallelLoRAMergedLinear, - ColumnSequenceParallelLoRALinear, LoRAConv2D, LoRALinear, LoRAMergedLinear, RowParallelLoRALinear, - RowSequenceParallelLoRALinear, ) try: @@ -75,19 +73,6 @@ ColumnParallelQuantizationLoRALinear = None RowParallelQuantizationLoRALinear = None -try: - from paddle.distributed.fleet.utils.sequence_parallel_utils import ( - ColumnSequenceParallelLinear, - RowSequenceParallelLinear, - ) -except: - - class ColumnSequenceParallelLinear: - pass - - class RowSequenceParallelLinear: - pass - class LoRAModel(nn.Layer): # TODO:lugimzzz support restore in following PR @@ -469,60 +454,6 @@ def _find_and_replace_module(self, model, module_name, lora_config, enable_lora) # Lora column parallel will spilt lora A matrix self.add_lora_split_mapping(module_name + ".lora_A", is_column=False) - # for lora qat - if self.lora_config.do_qat: - self.add_lora_split_mapping(module_name + ".weight_quanter._scale", is_column=False) - self.add_lora_split_mapping(module_name + ".activation_quanter._scale", is_column=False) - self.add_lora_split_mapping(module_name + ".activation_quanter.quanter._scale", is_column=False) - elif isinstance(module, ColumnSequenceParallelLinear): - # recover the original output_features - 
output_features = module.weight.shape[1] * module.world_size - lora_module = ColumnSequenceParallelLoRALinear( - in_features=module.weight.shape[0], - out_features=output_features, - gather_output=module.gather_output, - has_bias=module.bias is not None, - r=lora_config.r, - lora_alpha=lora_config.lora_alpha, - lora_dropout=lora_config.lora_dropout, - rslora=lora_config.rslora, - lora_plus_scale=lora_config.lora_plus_scale, - pissa=lora_config.pissa, - merge_weights=lora_config.merge_weights, - lora_A_weight_attr=paddle.ParamAttr( - initializer=nn.initializer.KaimingUniform( - negative_slope=math.sqrt(5), nonlinearity="leaky_relu" - ) - ), - use_quick_lora=lora_config.use_quick_lora, - ) - # Lora column parallel will spilt lora B matrix - self.add_lora_split_mapping(module_name + ".lora_B", is_column=True) - - # for lora qat - if self.lora_config.do_qat: - self.add_lora_split_mapping(module_name + ".weight_quanter._scale", is_column=True) - self.add_lora_split_mapping(module_name + ".activation_quanter._scale", is_column=False) - self.add_lora_split_mapping(module_name + ".activation_quanter.quanter._scale", is_column=False) - elif isinstance(module, RowSequenceParallelLinear): - # recover the original output_features - lora_module = RowSequenceParallelLoRALinear( - in_features=module.weight.shape[0] * module.world_size, - out_features=module.weight.shape[1], - has_bias=module.bias is not None, - input_is_parallel=module.input_is_parallel, - r=lora_config.r, - lora_alpha=lora_config.lora_alpha, - lora_dropout=lora_config.lora_dropout, - rslora=lora_config.rslora, - lora_plus_scale=lora_config.lora_plus_scale, - pissa=lora_config.pissa, - merge_weights=lora_config.merge_weights, - use_quick_lora=lora_config.use_quick_lora, - ) - # Lora column parallel will spilt lora A matrix - self.add_lora_split_mapping(module_name + ".lora_A", is_column=False) - # for lora qat if self.lora_config.do_qat: self.add_lora_split_mapping(module_name + ".weight_quanter._scale", is_column=False) @@ -666,8 +597,6 @@ def mark_only_lora_as_trainable(self) -> None: or isinstance(layer, LoRAConv2D) or isinstance(layer, ColumnParallelLoRALinear) or isinstance(layer, RowParallelLoRALinear) - or isinstance(layer, ColumnSequenceParallelLoRALinear) - or isinstance(layer, RowSequenceParallelLoRALinear) or isinstance(layer, LoRAMergedLinear) or isinstance(layer, ColumnParallelLoRAMergedLinear) or (QuantizationLoRALinear is not None and isinstance(layer, QuantizationLoRALinear)) @@ -755,11 +684,9 @@ def restore_original_model(self): self._find_and_restore_module(layer_name) elif ( isinstance(layer, ColumnParallelLoRALinear) - or isinstance(layer, ColumnSequenceParallelLoRALinear) or isinstance(layer, LoRAConv2D) or isinstance(layer, ColumnParallelLoRAMergedLinear) or isinstance(layer, RowParallelLoRALinear) - or isinstance(layer, RowSequenceParallelLoRALinear) or (QuantizationLoRALinear is not None and isinstance(layer, QuantizationLoRALinear)) or ( ColumnParallelQuantizationLoRALinear is not None From 590cee9812d052e8664b2551133af485534ebac8 Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Tue, 23 Apr 2024 19:22:35 +0800 Subject: [PATCH 07/27] Support Llama3 (#8315) * support llama-3 * Add llama-3 tokenizer * fix for llama3 --- llm/finetune_generation.py | 3 +- paddlenlp/transformers/auto/tokenizer.py | 21 +- paddlenlp/transformers/llama/configuration.py | 2 + paddlenlp/transformers/llama/modeling.py | 9 +- paddlenlp/transformers/llama/tokenizer.py | 289 +++++++++++++++++- 5 files changed, 312 insertions(+), 12 deletions(-) 
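
The diffs that follow wire Llama 3 support through three places: `AutoTokenizer` learns to resolve a `tokenizer_class` that is missing from its internal name mapping, `LlamaConfig` gains a `rope_theta` field (default 10000.0) that is forwarded to every rotary-embedding variant, and a tiktoken-based `Llama3Tokenizer` is added alongside `LlamaTokenizer`. A minimal usage sketch, assuming a local Llama 3 checkpoint directory (the path below is a placeholder) and `tiktoken` installed:

```python
# Illustrative usage sketch; the checkpoint path is a placeholder, not something
# shipped by this patch. It exercises the rope_theta config field and the
# pad-token handling added for Llama3Tokenizer in finetune_generation.py.
from paddlenlp.transformers import AutoTokenizer, LlamaConfig

config = LlamaConfig()
print(config.rope_theta)  # 10000.0 unless the checkpoint's config.json overrides it

tokenizer = AutoTokenizer.from_pretrained("/path/to/llama3-checkpoint")  # resolves tokenizer_class -> Llama3Tokenizer
tokenizer.pad_token_id = tokenizer.eos_token_id  # mirrors the finetune_generation.py change
```
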
diff --git a/llm/finetune_generation.py b/llm/finetune_generation.py index 3a4def7db46c..df7a22a0cb95 100644 --- a/llm/finetune_generation.py +++ b/llm/finetune_generation.py @@ -45,6 +45,7 @@ AutoConfig, AutoModelForCausalLM, AutoTokenizer, + Llama3Tokenizer, LlamaTokenizer, ) from paddlenlp.utils.log import logger @@ -232,7 +233,7 @@ def neft_post_hook(module, input, output): if tokenizer.chat_template is not None: data_args.eval_with_do_generation = False - if isinstance(tokenizer, LlamaTokenizer): + if isinstance(tokenizer, LlamaTokenizer) or isinstance(tokenizer, Llama3Tokenizer): tokenizer.pad_token_id = tokenizer.eos_token_id if data_args.dataset_name_or_path is None: diff --git a/paddlenlp/transformers/auto/tokenizer.py b/paddlenlp/transformers/auto/tokenizer.py index 451468741ea1..083ab3037311 100644 --- a/paddlenlp/transformers/auto/tokenizer.py +++ b/paddlenlp/transformers/auto/tokenizer.py @@ -189,13 +189,20 @@ def _get_tokenizer_class_from_config(cls, pretrained_model_name_or_path, config_ init_class = init_kwargs.pop("tokenizer_class", None) if init_class: - class_name = cls._name_mapping[init_class] - import_class = import_module(f"paddlenlp.transformers.{class_name}.tokenizer") - tokenizer_class = getattr(import_class, init_class) - if use_fast: - fast_tokenizer_class = cls._get_fast_tokenizer_class(init_class, class_name) - tokenizer_class = fast_tokenizer_class if fast_tokenizer_class else tokenizer_class - return tokenizer_class + if init_class in cls._name_mapping: + class_name = cls._name_mapping[init_class] + import_class = import_module(f"paddlenlp.transformers.{class_name}.tokenizer") + tokenizer_class = getattr(import_class, init_class) + if use_fast: + fast_tokenizer_class = cls._get_fast_tokenizer_class(init_class, class_name) + tokenizer_class = fast_tokenizer_class if fast_tokenizer_class else tokenizer_class + return tokenizer_class + else: + import_class = import_module("paddlenlp.transformers") + tokenizer_class = getattr(import_class, init_class, None) + assert tokenizer_class is not None, f"Can't find tokenizer {init_class}" + return tokenizer_class + # If no `init_class`, we use pattern recognition to recognize the tokenizer class. 
else: # TODO: Potential issue https://github.com/PaddlePaddle/PaddleNLP/pull/3786#discussion_r1024689810 diff --git a/paddlenlp/transformers/llama/configuration.py b/paddlenlp/transformers/llama/configuration.py index 68459f025fe4..e0b051b7434f 100644 --- a/paddlenlp/transformers/llama/configuration.py +++ b/paddlenlp/transformers/llama/configuration.py @@ -147,6 +147,7 @@ def __init__( num_key_value_heads=None, initializer_range=0.02, rms_norm_eps=1e-6, + rope_theta=10000.0, use_cache=True, use_recompute=False, recompute_granularity="full", @@ -188,6 +189,7 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps + self.rope_theta = rope_theta self.use_cache = use_cache self.use_recompute = use_recompute diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index b0b08c30241a..5cb13f7aa61a 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -813,24 +813,28 @@ def _init_rope(self): self.rotary_emb = LlamaRotaryEmbedding( self.head_dim, max_position_embeddings=self.max_position_embeddings, + base=self.config.rope_theta, ) elif self.config.rope_scaling_type == "linear": self.rotary_emb = LlamaLinearScalingRotaryEmbedding( self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=self.config.rope_scaling_factor, + base=self.config.rope_theta, ) elif self.config.rope_scaling_type == "ntk": self.rotary_emb = LlamaNTKScalingRotaryEmbedding( self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=self.config.rope_scaling_factor, + base=self.config.rope_theta, ) elif self.config.rope_scaling_type == "dynamic_ntk": self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding( self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=self.config.rope_scaling_factor, + base=self.config.rope_theta, ) else: raise ValueError(f"Unknown RoPE scaling type {self.config.rope_scaling_type}") @@ -903,6 +907,7 @@ def forward( query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) + if self.reshard_layer is not None: if self.sequence_parallel: assert self.seq_length % self.config.sep_parallel_degree == 0 @@ -1027,7 +1032,6 @@ def forward( value_states = paddle.concat([past_key_value[1], value_states], axis=1) past_key_value = (key_states, value_states) if use_cache else None - if self.kv_indices is not None: key_states = paddle.index_select(key_states, self.kv_indices, axis=2) value_states = paddle.index_select(value_states, self.kv_indices, axis=2) @@ -1036,7 +1040,7 @@ def forward( # repeat k/v heads if n_kv_heads < n_heads # paddle version > 2.6 or develop support flash-attn with gqa/mqa paddle_version = float(paddle.__version__[:3]) - if (paddle_version != 0.0) and (paddle_version <= 2.6): + if not self.config.use_flash_attention or ((paddle_version != 0.0) and (paddle_version <= 2.6)): key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) @@ -1560,7 +1564,6 @@ def forward( else: attention_mask = attention_mask.astype("bool") hidden_states = inputs_embeds - # decoder layers all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None diff --git a/paddlenlp/transformers/llama/tokenizer.py b/paddlenlp/transformers/llama/tokenizer.py index 4efaa48f797c..6f19530c05cb 100644 --- a/paddlenlp/transformers/llama/tokenizer.py 
+++ b/paddlenlp/transformers/llama/tokenizer.py @@ -24,7 +24,7 @@ from .. import PretrainedTokenizer from ..tokenizer_utils_base import BatchEncoding, EncodedInput, PaddingStrategy -__all__ = ["LlamaTokenizer"] +__all__ = ["LlamaTokenizer", "Llama3Tokenizer"] class LlamaTokenizer(PretrainedTokenizer): @@ -199,6 +199,7 @@ def create_token_type_ids_from_sequences( """ Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make use of token type ids, therefore a list of zeros is returned. + Args: token_ids_0 (`List[int]`): List of IDs. @@ -270,3 +271,289 @@ def _pad( constant_values=0, ) return encoded_inputs + + +"""Copied Tokenization classes for QWen.""" + +import base64 +import unicodedata +from typing import Collection, Dict, List, Optional, Set, Tuple, Union + +from ...utils.import_utils import is_tiktoken_available +from .. import PretrainedTokenizer +from ..tokenizer_utils_base import ( + AddedToken, + BatchEncoding, + EncodedInput, + PaddingStrategy, +) + +VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"} + +PAT_STR = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" +BEGINOFTEXT = "<|begin_of_text|>" +ENDOFTEXT = "<|end_of_text|>" +IMSTART = "<|start_header_id|>" +IMEND = "<|end_header_id|>" +# as the default behavior is changed to allow special tokens in +# regular texts, the surface forms of special tokens need to be +# as different as possible to minimize the impact +EXTRAS = tuple((f"<|reserved_special_token_{i}|>" for i in range(250))) +SPECIAL_TOKENS = (BEGINOFTEXT, ENDOFTEXT) + EXTRAS[0:4] + (IMSTART, IMEND) + EXTRAS[4:] + +tiktoken = None + + +def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]: + with open(tiktoken_bpe_file, "rb") as f: + contents = f.read() + return { + base64.b64decode(token): int(rank) for token, rank in (line.split() for line in contents.splitlines() if line) + } + + +class Llama3Tokenizer(PretrainedTokenizer): + """QWen tokenizer.""" + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + resource_files_names = VOCAB_FILES_NAMES + + def __init__( + self, + vocab_file, + errors="replace", + padding_side="left", + **kwargs, + ): + super().__init__(**kwargs) + if not is_tiktoken_available(): + raise ValueError("tiktoken is not installed, please install it use: pip install tiktoken") + + import tiktoken as tk + + tiktoken = tk + + self.errors = errors # how to handle errors in decoding + + self.mergeable_ranks = _load_tiktoken_bpe(vocab_file) # type: dict[bytes, int] + self.special_tokens = { + token: index for index, token in enumerate(SPECIAL_TOKENS, start=len(self.mergeable_ranks)) + } + enc = tiktoken.Encoding( + "Llama3", + pat_str=PAT_STR, + mergeable_ranks=self.mergeable_ranks, + special_tokens=self.special_tokens, + ) + assert ( + len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab + ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding" + + self.decoder = {v: k for k, v in self.mergeable_ranks.items()} # type: dict[int, bytes|str] + self.decoder.update({v: k for k, v in self.special_tokens.items()}) + + self.tokenizer = enc # type: tiktoken.Encoding + + self.eod_id = self.special_tokens[ENDOFTEXT] + self.start_header_id = self.special_tokens[IMSTART] + self.end_header_id = self.special_tokens[IMEND] + + if "pad_token_id" in kwargs: + self.pad_token_id = kwargs["pad_token_id"] + if "eos_token_id" in kwargs: + self.eos_token_id 
= kwargs["eos_token_id"] + + def __len__(self) -> int: + return self.tokenizer.n_vocab + + def get_vocab(self) -> Dict[bytes, int]: + return self.mergeable_ranks + + def convert_tokens_to_ids(self, tokens: Union[bytes, str, List[Union[bytes, str]]]) -> List[int]: + ids = [] + if isinstance(tokens, (str, bytes)): + if tokens in self.special_tokens: + return self.special_tokens[tokens] + else: + return self.mergeable_ranks.get(tokens) + for token in tokens: + if token in self.special_tokens: + ids.append(self.special_tokens[token]) + else: + ids.append(self.mergeable_ranks.get(token)) + return ids + + def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int: + if not special_tokens and new_tokens: + raise ValueError("Adding regular tokens is not supported") + for token in new_tokens: + surface_form = token.content if isinstance(token, AddedToken) else token + if surface_form not in SPECIAL_TOKENS: + raise ValueError("Adding unknown special tokens is not supported") + return 0 + + def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]: + """ + Save only the vocabulary of the tokenizer (vocabulary). + + Returns: + `Tuple(str)`: Paths to the files saved. + """ + file_path = os.path.join(save_directory, "tokenizer.model") + with open(file_path, "w", encoding="utf8") as w: + for k, v in self.mergeable_ranks.items(): + line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n" + w.write(line) + return (file_path,) + + def tokenize( + self, + text: str, + allowed_special: Union[Set, str] = "all", + disallowed_special: Union[Collection, str] = (), + **kwargs, + ) -> List[Union[bytes, str]]: + """ + Converts a string in a sequence of tokens. + + Args: + text (`str`): + The sequence to be encoded. + allowed_special (`Literal["all"]` or `set`): + The surface forms of the tokens to be encoded as special tokens in regular texts. + Default to "all". + disallowed_special (`Literal["all"]` or `Collection`): + The surface forms of the tokens that should not be in regular texts and trigger errors. + Default to an empty tuple. + + kwargs (additional keyword arguments, *optional*): + Will be passed to the underlying model specific encode method. + + Returns: + `List[bytes|str]`: The list of tokens. + """ + tokens = [] + text = unicodedata.normalize("NFC", text) + + # this implementation takes a detour: text -> token id -> token surface forms + for t in self.tokenizer.encode(text, allowed_special=allowed_special, disallowed_special=disallowed_special): + tokens.append(self.decoder[t]) + return tokens + + def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str: + """ + Converts a sequence of tokens in a single string. 
+ """ + text = "" + temp = b"" + for t in tokens: + if isinstance(t, str): + if temp: + text += temp.decode("utf-8", errors=self.errors) + temp = b"" + text += t + elif isinstance(t, bytes): + temp += t + else: + raise TypeError("token should only be of type types or str") + if temp: + text += temp.decode("utf-8", errors=self.errors) + return text + + @property + def vocab_size(self): + return self.tokenizer.n_vocab + + def _convert_id_to_token(self, index: int) -> Union[bytes, str]: + """Converts an id to a token, special tokens included""" + if index in self.decoder: + return self.decoder[index] + raise ValueError("unknown ids") + + def _convert_token_to_id(self, token: Union[bytes, str]) -> int: + """Converts a token to an id using the vocab, special tokens included""" + if token in self.special_tokens: + return self.special_tokens[token] + if token in self.mergeable_ranks: + return self.mergeable_ranks[token] + raise ValueError("unknown token") + + def _tokenize(self, text: str, **kwargs): + """ + Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based + vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces). + + Do NOT take care of added tokens. + """ + raise NotImplementedError + + def _decode( + self, + token_ids: Union[int, List[int]], + skip_special_tokens: bool = False, + errors: str = None, + **kwargs, + ) -> str: + if isinstance(token_ids, int): + token_ids = [token_ids] + if skip_special_tokens: + token_ids = [i for i in token_ids if i < self.eod_id] + return self.tokenizer.decode(token_ids, errors=errors or self.errors) + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + >= 7.5 (Volta). 
+ return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + + # attention_mask shape [1,seq_len,seq_len] + if "attention_mask" in encoded_inputs and len(np.shape(encoded_inputs["attention_mask"])) > 2: + attention_mask = encoded_inputs["attention_mask"] + encoded_inputs.pop("attention_mask") + else: + attention_mask = None + + required_input = encoded_inputs[self.model_input_names[0]] + encoded_inputs = super()._pad( + encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, return_attention_mask + ) + if attention_mask is not None and len(np.shape(attention_mask)) > 2: + encoded_inputs["attention_mask"] = attention_mask + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + if needs_to_be_padded: + difference = max_length - len(required_input) + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = np.pad( + encoded_inputs["attention_mask"], + pad_width=[(0, 0), (difference, 0), (difference, 0)], + mode="constant", + constant_values=0, + ) + return encoded_inputs From 871070d95e9a48aaa47ebf5f7936532e856c02fa Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Wed, 24 Apr 2024 11:33:53 +0800 Subject: [PATCH 08/27] bug fixer (#8314) (#8318) --- paddlenlp/trainer/trainer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py index d8487364076b..3e8fc333fe95 100644 --- a/paddlenlp/trainer/trainer.py +++ b/paddlenlp/trainer/trainer.py @@ -1011,6 +1011,7 @@ def _inner_training_loop( self.timers and self.timers("optimizer-step").start() if self.args.gradient_accumulation_steps > 1 and self._enable_delay_scale_loss(): + paddle.device.synchronize() for p in model._layers.parameters(): with paddle.no_grad(): if hasattr(p, "main_grad") and p.main_grad is not None: From 0f428bbe47daed3cd861f7047c3e9acbec4ea0b1 Mon Sep 17 00:00:00 2001 From: Tian <121000916+SylarTiaNII@users.noreply.github.com> Date: Tue, 23 Apr 2024 23:18:11 +0800 Subject: [PATCH 09/27] [Distributed] [CustomDevices] Adapt SP on lora && polish MC2 APIs (#8303) * [Distributed] adapt sequence parallel on LoRA (#8235) * [Distributed] [CustomDevices] adapt lora sp && polish MC2 APIs --- paddlenlp/peft/lora/lora_layers.py | 278 ++++++++++++++++-- paddlenlp/peft/lora/lora_model.py | 85 +++++- paddlenlp/peft/lora/mc2_lora_npu.py | 80 ----- paddlenlp/transformers/llama/modeling.py | 25 +- paddlenlp/transformers/mc2_parallel_linear.py | 230 +++++++++++++++ .../mc2_seqence_parallel_linear.py | 146 --------- 6 files changed, 572 insertions(+), 272 deletions(-) delete mode 100644 paddlenlp/peft/lora/mc2_lora_npu.py create mode 100644 paddlenlp/transformers/mc2_parallel_linear.py delete mode 100644 paddlenlp/transformers/mc2_seqence_parallel_linear.py diff --git a/paddlenlp/peft/lora/lora_layers.py b/paddlenlp/peft/lora/lora_layers.py index e0c79c47a87a..a31f7c3a33b1 100644 --- a/paddlenlp/peft/lora/lora_layers.py +++ b/paddlenlp/peft/lora/lora_layers.py @@ -13,7 +13,6 @@ # limitations under the License. 
import math -import os from typing import List, Optional import paddle @@ -25,13 +24,25 @@ RowParallelLinear, ) -from .lora_quick_layers import quick_lora +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + AllGatherOp, + ColumnSequenceParallelLinear, + ReduceScatterOp, + RowSequenceParallelLinear, + mark_as_sequence_parallel_parameter, + ) +except: + pass + +from paddlenlp.transformers.mc2_parallel_linear import ( + MC2ColumnParallelCoreLinear, + MC2ColumnSeqParallelCoreLinear, + MC2RowParallelCoreLinear, + MC2RowSeqParallelCoreLinear, +) -if "npu" in paddle.device.get_all_custom_device_type(): - from .mc2_lora_npu import MC2LoRaColumnParallelLinear, MC2LoRaRowParallelLinear -else: - MC2LoRaRowParallelLinear = None - MC2LoRaColumnParallelLinear = None +from .lora_quick_layers import quick_lora class LoRALinear(nn.Linear): @@ -266,9 +277,7 @@ def forward(self, x: paddle.Tensor): ) else: # x @ W : [bz, in_f / ws] ===> [bz, out_f] - if "npu" in paddle.device.get_all_custom_device_type() and int(os.getenv("MC2", "0")): - output = MC2LoRaRowParallelLinear.apply(input_mp, self.weight, self.model_parallel_group) - else: + if MC2RowParallelCoreLinear is None: result_mp = F.linear(x=input_mp, weight=self.weight, name=self.name) output = mp_ops._mp_allreduce( result_mp, @@ -276,6 +285,8 @@ def forward(self, x: paddle.Tensor): use_calc_stream=True, use_model_parallel=True, ) + else: + output = MC2RowParallelCoreLinear.apply(input_mp, self.weight, self.model_parallel_group) if not self.merged: # x @ A: [bz, in_f/ ws] ===> [bz, r] @@ -298,6 +309,120 @@ def extra_repr(self): return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}" +class RowSequenceParallelLoRALinear(RowSequenceParallelLinear): + def __init__( + self, + in_features: int, + out_features: int, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + rslora: bool = False, + lora_plus_scale: float = 1.0, + merge_weights: bool = True, + use_quick_lora: bool = False, + **kwargs + ): + RowSequenceParallelLinear.__init__(self, in_features, out_features, **kwargs) + if not isinstance(r, int) or r <= 0: + raise ValueError("Lora rank r should be a positive integer") + self.r = r + self.lora_alpha = lora_alpha + # Optional dropout + if lora_dropout > 0.0: + self.lora_dropout = nn.Dropout(p=lora_dropout) + else: + self.lora_dropout = lambda x: x + # Mark the weight as unmerged + self.merged = False + self.merge_weights = merge_weights + + # compatible + self.name = self._name + + # Actual trainable parameters + self.lora_A = self.create_parameter( + shape=[self.input_size_per_partition, r], + dtype=self._dtype, + is_bias=False, + attr=paddle.ParamAttr( + initializer=nn.initializer.KaimingUniform(negative_slope=math.sqrt(5), nonlinearity="leaky_relu") + ), + ) + self.lora_B = self.create_parameter( + shape=[r, self.out_features], + dtype=self._dtype, + is_bias=False, + attr=paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(value=0.0), + learning_rate=lora_plus_scale, + ), + ) + + self.lora_A.is_distributed = True + self.lora_A.split_axis = 0 + self.lora_B.is_distributed = False + mark_as_sequence_parallel_parameter(self.lora_B) + if not rslora: + self.scaling = self.lora_alpha / self.r + else: + self.scaling = self.lora_alpha / math.sqrt(self.r) + + # Freezing the pre-trained weight matrix + self.weight.stop_gradient = True + self._use_quick_lora = use_quick_lora and lora_dropout == 0.0 + + @property + def use_quick_lora(self): + # TODO(@gexiao): support 
qlora + return False # self._use_quick_lora and self.training and not self.merged + + def train(self): + super().train() + if self.merge_weights and self.merged: + # Make sure that the weights are not merged + new_weight = self.weight - self.lora_A @ self.lora_B * self.scaling + self.weight.set_value(new_weight) + self.merged = False + + def eval(self): + super().eval() + if self.merge_weights and not self.merged: + # Merge the weights and mark it + new_weight = self.weight + self.lora_A @ self.lora_B * self.scaling + self.weight.set_value(new_weight) + self.merged = True + + def forward(self, x: paddle.Tensor): + if not self.input_is_parallel: + input_mp = mp_ops._c_split(x, group=self.model_parallel_group) + else: + input_mp = x + + if MC2RowSeqParallelCoreLinear is None: + output_parallel = self.linear(input_mp, self.weight, name=self._name) + output_ = ReduceScatterOp.apply(output_parallel) + result_mp = output_ + self.bias if self.bias is not None else output_ + else: + output_ = MC2RowSeqParallelCoreLinear.apply(input_mp, self.weight, self.model_parallel_group) + result_mp = output_ + self.bias if self.bias is not None else output_ + + if not self.merged: + input_mp = self.lora_dropout(input_mp) + if MC2RowSeqParallelCoreLinear is None: + input_mp = input_mp @ self.lora_A + input_mp = ReduceScatterOp.apply(input_mp) + else: + input_mp = MC2RowSeqParallelCoreLinear.apply(input_mp, self.lora_A, self.model_parallel_group) + delta_mp = (input_mp @ self.lora_B) * self.scaling + result_mp += delta_mp + return result_mp + + def extra_repr(self): + name = f", name={self.name}" if self.name else "" + return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}" + + class ColumnParallelLoRALinear(ColumnParallelLinear): def __init__( self, @@ -400,21 +525,21 @@ def forward(self, input: paddle.Tensor): world_size=self.world_size, ) else: - if "npu" in paddle.device.get_all_custom_device_type() and int(os.getenv("MC2", "0")): - res_mp = MC2LoRaColumnParallelLinear.apply(input, self.weight, self.model_parallel_group) - result_mp = res_mp + self.bias - else: + if MC2ColumnParallelCoreLinear is None: input_mp = mp_ops._c_identity(input, group=self.model_parallel_group) result_mp = F.linear(x=input_mp, weight=self.weight, bias=self.bias, name=self.name) + else: + res_mp = MC2ColumnParallelCoreLinear.apply(input, self.weight, self.model_parallel_group) + result_mp = res_mp + self.bias if not self.merged: input_a = self.lora_dropout(input) @ self.lora_A - if "npu" in paddle.device.get_all_custom_device_type() and int(os.getenv("MC2", "0")): - tmp = MC2LoRaColumnParallelLinear.apply(input_a, self.lora_B, self.model_parallel_group) - delta_mp = tmp * self.scaling - else: + if MC2ColumnParallelCoreLinear is None: input_a_mp = mp_ops._c_identity(input_a, group=self.model_parallel_group) delta_mp = (input_a_mp @ self.lora_B) * self.scaling + else: + tmp = MC2ColumnParallelCoreLinear.apply(input_a, self.lora_B, self.model_parallel_group) + delta_mp = tmp * self.scaling result_mp += delta_mp if self.gather_output and self.is_mp: @@ -428,6 +553,123 @@ def extra_repr(self): return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}" +class ColumnSequenceParallelLoRALinear(ColumnSequenceParallelLinear): + def __init__( + self, + in_features: int, + out_features: int, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + rslora: bool = False, + lora_plus_scale: float = 1.0, + merge_weights: bool = True, + 
lora_A_weight_attr: Optional[paddle.ParamAttr] = None, + use_quick_lora: bool = False, + **kwargs + ): + ColumnSequenceParallelLinear.__init__(self, in_features, out_features, **kwargs) + if not isinstance(r, int) or r <= 0: + raise ValueError("Lora rank r should be a positive integer") + self.r = r + self.lora_alpha = lora_alpha + # Optional dropout + if lora_dropout > 0.0: + self.lora_dropout = nn.Dropout(p=lora_dropout) + else: + self.lora_dropout = lambda x: x + # Mark the weight as unmerged + self.merged = False + self.merge_weights = merge_weights + + # compatible + self.name = self._name + + # Actual trainable parameters + self.lora_A = self.create_parameter( + shape=[in_features, r], + dtype=self._dtype, + is_bias=False, + attr=lora_A_weight_attr, + ) + self.lora_A.is_distributed = False + mark_as_sequence_parallel_parameter(self.lora_A) + + self.lora_B = self.create_parameter( + shape=[r, self.output_size_per_partition], + dtype=self._dtype, + is_bias=False, + attr=paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(value=0.0), + learning_rate=lora_plus_scale, + ), + ) + + self.lora_B.is_distributed = True + self.lora_B.split_axis = 1 + if not rslora: + self.scaling = self.lora_alpha / self.r + else: + self.scaling = self.lora_alpha / math.sqrt(self.r) + + # Freezing the pre-trained weight matrix + self.weight.stop_gradient = True + self._use_quick_lora = use_quick_lora and lora_dropout == 0.0 + + @property + def use_quick_lora(self): + # TODO(@gexiao): support qlora + return False # self._use_quick_lora and self.training and not self.merged + + def train(self): + super().train() + if self.merge_weights and self.merged: + # Make sure that the weights are not merged + new_weight = self.weight - self.lora_A @ self.lora_B * self.scaling + self.weight.set_value(new_weight) + self.merged = False + + def eval(self): + super().eval() + if self.merge_weights and not self.merged: + # Merge the weights and mark it + new_weight = self.weight + self.lora_A @ self.lora_B * self.scaling + self.weight.set_value(new_weight) + self.merged = True + + def forward(self, x: paddle.Tensor): + if MC2ColumnSeqParallelCoreLinear is None: + if self.is_mp: + input_parallel = AllGatherOp.apply(x) + else: + input_parallel = x + result_mp = self.linear(input_parallel, self.weight, self.bias, name=self._name) + else: + result_mp = MC2ColumnSeqParallelCoreLinear.apply(x, self.weight, self.model_parallel_group) + if self.bias is not None: + result_mp += self.bias + + if not self.merged: + input_a = self.lora_dropout(x) @ self.lora_A + if MC2ColumnSeqParallelCoreLinear is None: + input_a = AllGatherOp.apply(input_a) + delta_mp = (input_a @ self.lora_B) * self.scaling + else: + input_a = MC2ColumnSeqParallelCoreLinear.apply(input_a, self.lora_B, self.model_parallel_group) + delta_mp = input_a * self.scaling + result_mp += delta_mp + + if self.gather_output and self.is_mp: + result = mp_ops._c_concat(result_mp, group=self.model_parallel_group) + else: + result = result_mp + return result + + def extra_repr(self): + name = f", name={self.name}" if self.name else "" + return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}" + + class LoRAMergedLinear(nn.Linear): # LoRA implemented in a dense layer with merged linear weights for q, k, v def __init__( diff --git a/paddlenlp/peft/lora/lora_model.py b/paddlenlp/peft/lora/lora_model.py index 1bbd0284823c..41ab1e681e24 100644 --- a/paddlenlp/peft/lora/lora_model.py +++ b/paddlenlp/peft/lora/lora_model.py @@ -45,14 
+45,25 @@ from ...utils.env import LORA_WEIGHTS_NAME, SAFE_PEFT_WEIGHTS_INDEX_NAME from ...utils.log import logger from .lora_config import LoRAConfig -from .lora_layers import ( - ColumnParallelLoRALinear, - ColumnParallelLoRAMergedLinear, - LoRAConv2D, - LoRALinear, - LoRAMergedLinear, - RowParallelLoRALinear, -) + +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + ColumnSequenceParallelLinear, + RowSequenceParallelLinear, + ) + + from .lora_layers import ( + ColumnParallelLoRALinear, + ColumnParallelLoRAMergedLinear, + ColumnSequenceParallelLoRALinear, + LoRAConv2D, + LoRALinear, + LoRAMergedLinear, + RowParallelLoRALinear, + RowSequenceParallelLoRALinear, + ) +except: + pass try: from ...quantization.quantization_linear import ( @@ -454,6 +465,58 @@ def _find_and_replace_module(self, model, module_name, lora_config, enable_lora) # Lora column parallel will spilt lora A matrix self.add_lora_split_mapping(module_name + ".lora_A", is_column=False) + # for lora qat + if self.lora_config.do_qat: + self.add_lora_split_mapping(module_name + ".weight_quanter._scale", is_column=False) + self.add_lora_split_mapping(module_name + ".activation_quanter._scale", is_column=False) + self.add_lora_split_mapping(module_name + ".activation_quanter.quanter._scale", is_column=False) + elif isinstance(module, ColumnSequenceParallelLinear): + # recover the original output_features + output_features = module.weight.shape[1] * module.world_size + lora_module = ColumnSequenceParallelLoRALinear( + in_features=module.weight.shape[0], + out_features=output_features, + gather_output=module.gather_output, + has_bias=module.bias is not None, + r=lora_config.r, + lora_alpha=lora_config.lora_alpha, + lora_dropout=lora_config.lora_dropout, + rslora=lora_config.rslora, + lora_plus_scale=lora_config.lora_plus_scale, + merge_weights=lora_config.merge_weights, + lora_A_weight_attr=paddle.ParamAttr( + initializer=nn.initializer.KaimingUniform( + negative_slope=math.sqrt(5), nonlinearity="leaky_relu" + ) + ), + use_quick_lora=lora_config.use_quick_lora, + ) + # Lora column parallel will spilt lora B matrix + self.add_lora_split_mapping(module_name + ".lora_B", is_column=True) + + # for lora qat + if self.lora_config.do_qat: + self.add_lora_split_mapping(module_name + ".weight_quanter._scale", is_column=True) + self.add_lora_split_mapping(module_name + ".activation_quanter._scale", is_column=False) + self.add_lora_split_mapping(module_name + ".activation_quanter.quanter._scale", is_column=False) + elif isinstance(module, RowSequenceParallelLinear): + # recover the original output_features + lora_module = RowSequenceParallelLoRALinear( + in_features=module.weight.shape[0] * module.world_size, + out_features=module.weight.shape[1], + has_bias=module.bias is not None, + input_is_parallel=module.input_is_parallel, + r=lora_config.r, + lora_alpha=lora_config.lora_alpha, + lora_dropout=lora_config.lora_dropout, + rslora=lora_config.rslora, + lora_plus_scale=lora_config.lora_plus_scale, + merge_weights=lora_config.merge_weights, + use_quick_lora=lora_config.use_quick_lora, + ) + # Lora column parallel will spilt lora A matrix + self.add_lora_split_mapping(module_name + ".lora_A", is_column=False) + # for lora qat if self.lora_config.do_qat: self.add_lora_split_mapping(module_name + ".weight_quanter._scale", is_column=False) @@ -539,7 +602,7 @@ def _find_and_replace_module(self, model, module_name, lora_config, enable_lora) ) if lora_module is None: raise ValueError( - f"LoRA strategy only supports 
paddle.nn.Linear or paddle.distributed.fleet.meta_parallel.ColumnParallelLinear. {module}({module_name}) is not supported。" + f"LoRA strategy only supports paddle.nn.Linear or paddle.distributed.fleet.meta_parallel.ColumnParallelLinear or paddlenlp.transformers.sequence_utils. {module}({module_name} {type(module).__name__}) is not supported。" ) if getattr(lora_module, "quant_weight", None) is not None: lora_module.quant_weight = module.quant_weight @@ -597,6 +660,8 @@ def mark_only_lora_as_trainable(self) -> None: or isinstance(layer, LoRAConv2D) or isinstance(layer, ColumnParallelLoRALinear) or isinstance(layer, RowParallelLoRALinear) + or isinstance(layer, ColumnSequenceParallelLoRALinear) + or isinstance(layer, RowSequenceParallelLoRALinear) or isinstance(layer, LoRAMergedLinear) or isinstance(layer, ColumnParallelLoRAMergedLinear) or (QuantizationLoRALinear is not None and isinstance(layer, QuantizationLoRALinear)) @@ -684,9 +749,11 @@ def restore_original_model(self): self._find_and_restore_module(layer_name) elif ( isinstance(layer, ColumnParallelLoRALinear) + or isinstance(layer, ColumnSequenceParallelLoRALinear) or isinstance(layer, LoRAConv2D) or isinstance(layer, ColumnParallelLoRAMergedLinear) or isinstance(layer, RowParallelLoRALinear) + or isinstance(layer, RowSequenceParallelLoRALinear) or (QuantizationLoRALinear is not None and isinstance(layer, QuantizationLoRALinear)) or ( ColumnParallelQuantizationLoRALinear is not None diff --git a/paddlenlp/peft/lora/mc2_lora_npu.py b/paddlenlp/peft/lora/mc2_lora_npu.py deleted file mode 100644 index 7ae47b1496f7..000000000000 --- a/paddlenlp/peft/lora/mc2_lora_npu.py +++ /dev/null @@ -1,80 +0,0 @@ -# !/usr/bin/env python3 - -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
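
`mark_only_lora_as_trainable` above freezes everything except the injected adapter layers, now including the two sequence-parallel variants. The effect, reduced to a name-based sketch (an illustration of the idea, not the LoRAModel implementation):

import paddle.nn as nn

def freeze_all_but_lora(model: nn.Layer) -> None:
    for name, param in model.named_parameters():
        # only the injected lora_A / lora_B matrices keep gradients
        param.stop_gradient = not ("lora_A" in name or "lora_B" in name)

# freeze_all_but_lora(lora_model)  # afterwards only adapter weights are updated
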
- -""" mc2(tp overlap) """ - -import paddle -import paddle_custom_device -from paddle.autograd import PyLayer - - -class MC2LoRaRowParallelLinear(PyLayer): - @staticmethod - def forward(ctx, input_, weight, group): - ctx.save_for_backward(input_, weight) - rank = paddle.distributed.get_rank() - hcom_name = group.process_group.get_comm_name(rank) - x = input_.reshape([-1, input_.shape[-1]]) - out = paddle_custom_device.npu.fused_mm_allreduce( - x, weight, bias=None, hcom=hcom_name, reduce_op="sum", comm_turn=0 - ) - output = out.reshape([input_.shape[0], input_.shape[1], weight.shape[1]]) - ctx.ring_id = group.id - return output - - @staticmethod - def backward(ctx, dy): - input_, weight = ctx.saved_tensor() - out_grad = dy - sub_grad = out_grad.reshape([-1, out_grad.shape[-1]]) - input_grad = paddle.matmul(sub_grad, weight, transpose_y=True) - if weight.stop_gradient: - return input_grad.reshape(input_.shape) - else: - input_reshape = input_.reshape([-1, input_.shape[-1]]) - weight_grad = paddle.matmul(input_reshape, sub_grad, transpose_x=True) - return input_grad.reshape(input_.shape), weight_grad - - -class MC2LoRaColumnParallelLinear(PyLayer): - @staticmethod - def forward(ctx, input_, weight, group): - ctx.save_for_backward(input_, weight) - ctx.group = group - input_mp = input_ - result_mp = paddle.matmul(input_mp, weight) - return result_mp - - @staticmethod - def backward(ctx, dy): - input_, weight = ctx.saved_tensor() - sub_grad = dy.reshape([-1, dy.shape[-1]]) - rank = paddle.distributed.get_rank() - hcom_name = ctx.group.process_group.get_comm_name(rank) - - d_weight = ( - paddle.matmul(input_.reshape([-1, input_.shape[-1]]), sub_grad, transpose_x=True) - if not weight.stop_gradient - else None - ) - d_input = paddle_custom_device.npu.fused_mm_allreduce( - sub_grad, weight.t(), bias=None, hcom=hcom_name, reduce_op="sum", comm_turn=0 - ) - - if d_weight is not None: - return d_input.reshape(input_.shape), d_weight - else: - return d_input.reshape(input_.shape) diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index 5cb13f7aa61a..38f1d244bdf2 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -62,6 +62,10 @@ def swiglu(x, y=None): init_name_mappings, ) from paddlenlp.transformers.long_sequence_strategies import LongSequenceStrategies +from paddlenlp.transformers.mc2_parallel_linear import ( + MC2ColumnSeqParallelLinear, + MC2RowSeqParallelLinear, +) from paddlenlp.transformers.model_outputs import ( BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions, @@ -96,13 +100,6 @@ def swiglu(x, y=None): ] -def is_mc2_valid(): - current_device = get_env_device() - if current_device == "npu": - return True - return False - - def _get_interleave(n): def _get_interleave_power_of_2(n): start = 2 ** (-(2 ** -(math.log2(n) - 3))) @@ -574,12 +571,7 @@ def __init__(self, config): self.fuse_attention_ffn = config.fuse_attention_ffn if config.sequence_parallel: - if is_mc2_valid and int(os.getenv("FLAGS_NPU_MC2", 0)): - from paddlenlp.transformers.mc2_seqence_parallel_linear import ( - MC2ColumnSeqParallelLinear, - MC2RowSeqParallelLinear, - ) - + if MC2ColumnSeqParallelLinear is not None and MC2RowSeqParallelLinear is not None: ColumnParallelLinear = MC2ColumnSeqParallelLinear RowParallelLinear = MC2RowSeqParallelLinear else: @@ -697,12 +689,7 @@ def __init__(self, config: LlamaConfig, layerwise_recompute: bool = False): self.use_fused_rope = False if config.sequence_parallel: - 
if is_mc2_valid and int(os.getenv("FLAGS_NPU_MC2", 0)): - from paddlenlp.transformers.mc2_seqence_parallel_linear import ( - MC2ColumnSeqParallelLinear, - MC2RowSeqParallelLinear, - ) - + if MC2ColumnSeqParallelLinear is not None and MC2RowSeqParallelLinear is not None: ColumnParallelLinear = MC2ColumnSeqParallelLinear RowParallelLinear = MC2RowSeqParallelLinear else: diff --git a/paddlenlp/transformers/mc2_parallel_linear.py b/paddlenlp/transformers/mc2_parallel_linear.py new file mode 100644 index 000000000000..066e8074e21f --- /dev/null +++ b/paddlenlp/transformers/mc2_parallel_linear.py @@ -0,0 +1,230 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import paddle + +try: + import paddle_custom_device +except ImportError: + pass + +from paddle import distributed as dist +from paddle.autograd import PyLayer + +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + ColumnSequenceParallelLinear, + RowSequenceParallelLinear, + ) +except: + pass +from paddlenlp.utils.tools import get_env_device + +__all_gather_recomputation__ = False +if int(os.getenv("MC2_Recompute", 0)): + __all_gather_recomputation__ = True + + +def is_mc2_valid(): + current_device = get_env_device() + if current_device == "npu": + return int(os.getenv("MC2", 0)) + return 0 + + +if is_mc2_valid(): + + class MC2ColumnParallelCoreLinear(PyLayer): + @staticmethod + def forward(ctx, input_, weight, group): + ctx.save_for_backward(input_, weight) + ctx.group = group + input_mp = input_ + result_mp = paddle.matmul(input_mp, weight) + return result_mp + + @staticmethod + def backward(ctx, dy): + input_, weight = ctx.saved_tensor() + sub_grad = dy.reshape([-1, dy.shape[-1]]) + rank = paddle.distributed.get_rank() + hcom_name = ctx.group.process_group.get_comm_name(rank) + + d_weight = ( + paddle.matmul(input_.reshape([-1, input_.shape[-1]]), sub_grad, transpose_x=True) + if not weight.stop_gradient + else None + ) + d_input = paddle_custom_device.npu.fused_mm_allreduce( + sub_grad, weight.t(), bias=None, hcom=hcom_name, reduce_op="sum", comm_turn=0 + ) + + if d_weight is not None: + return d_input.reshape(input_.shape), d_weight + else: + return d_input.reshape(input_.shape), None + + class MC2RowParallelCoreLinear(PyLayer): + @staticmethod + def forward(ctx, input_, weight, group): + ctx.save_for_backward(input_, weight) + rank = paddle.distributed.get_rank() + hcom_name = group.process_group.get_comm_name(rank) + x = input_.reshape([-1, input_.shape[-1]]) + out = paddle_custom_device.npu.fused_mm_allreduce( + x, weight, bias=None, hcom=hcom_name, reduce_op="sum", comm_turn=0 + ) + output = out.reshape([input_.shape[0], input_.shape[1], weight.shape[1]]) + ctx.ring_id = group.id + return output + + @staticmethod + def backward(ctx, dy): + input_, weight = ctx.saved_tensor() + out_grad = dy + sub_grad = out_grad.reshape([-1, out_grad.shape[-1]]) + input_grad = paddle.matmul(sub_grad, weight, transpose_y=True) + if 
weight.stop_gradient: + return input_grad.reshape(input_.shape), None + else: + input_reshape = input_.reshape([-1, input_.shape[-1]]) + weight_grad = paddle.matmul(input_reshape, sub_grad, transpose_x=True) + return input_grad.reshape(input_.shape), weight_grad + + class MC2ColumnSeqParallelCoreLinear(PyLayer): + @staticmethod + def forward(ctx, input_, weight, group): + ctx.weight_stop_gradient = weight.stop_gradient + ctx.save_for_backward(input_, weight) + + rank = dist.get_rank() + hcomm_info = group.process_group.get_comm_name(rank) + + world_size = group.nranks + output, gather_out = paddle_custom_device.npu.fused_allgather_mm( + input_, + weight, + bias=None, + hcom=hcomm_info, + world_size=world_size, + gather_index=0, + gather_output=(not __all_gather_recomputation__), + comm_turn=0, + ) + + ctx.all_gather_output = gather_out + ctx.world_size = world_size + ctx.group = group + return output + + @staticmethod + def backward(ctx, grad_output): + input_, weight = ctx.saved_tensor() + + if __all_gather_recomputation__: + dim_size = input_.shape + dim_size[0] = dim_size[0] * ctx.world_size + all_gather_output = paddle.empty(dim_size, dtype=input_.dtype) + all_gather_output.stop_gradient = True + all_gather_work = dist.stream.all_gather(all_gather_output, input_, group=ctx.group, sync_op=False) + else: + all_gather_output = ctx.all_gather_output + + grad_input = paddle.matmul(grad_output, weight, transpose_y=True) + sub_grad_input = paddle.empty(input_.shape, dtype=input_.dtype) + reduce_scatter_work = dist.stream.reduce_scatter( + sub_grad_input, grad_input, group=ctx.group, sync_op=False + ) + + if __all_gather_recomputation__: + all_gather_work.wait() + + grad_weight = ( + paddle.matmul(all_gather_output, grad_output, transpose_x=True) + if not ctx.weight_stop_gradient + else None + ) + reduce_scatter_work.wait() + + return sub_grad_input, grad_weight + + class MC2RowSeqParallelCoreLinear(PyLayer): + @staticmethod + def forward(ctx, input_, weight, group): + ctx.weight_stop_gradient = weight.stop_gradient + ctx.save_for_backward(input_, weight) + + rank = dist.get_rank() + hcomm_info = group.process_group.get_comm_name(rank) + world_size = group.nranks + + output = paddle_custom_device.npu.fused_mm_reduce_scatter( + input_, + weight, + bias=None, + hcom=hcomm_info, + world_size=world_size, + reduce_op="sum", + comm_turn=0, + ) + + ctx.hcomm_info = hcomm_info + ctx.world_size = world_size + return output + + @staticmethod + def backward(ctx, grad_output): + input_, weight = ctx.saved_tensor() + hcomm_info = ctx.hcomm_info + world_size = ctx.world_size + + grad_input, all_gather_grad_output = paddle_custom_device.npu.fused_allgather_mm( + grad_output, + weight.t(), + bias=None, + hcom=hcomm_info, + world_size=world_size, + gather_index=0, + gather_output=True, + comm_turn=0, + ) + grad_weight = ( + paddle.matmul(input_, all_gather_grad_output, transpose_x=True) + if not ctx.weight_stop_gradient + else None + ) + + return grad_input, grad_weight + + class MC2ColumnSeqParallelLinear(ColumnSequenceParallelLinear): + def forward(self, x): + output = MC2ColumnSeqParallelCoreLinear.apply(x, self.weight, self.model_parallel_group) + output = output + self.bias if self.bias is not None else output + return output + + class MC2RowSeqParallelLinear(RowSequenceParallelLinear): + def forward(self, x): + output = MC2RowSeqParallelCoreLinear.apply(x, self.weight, self.model_parallel_group) + output = output + self.bias if self.bias is not None else output + return output + +else: + 
MC2ColumnSeqParallelCoreLinear = None + MC2RowSeqParallelCoreLinear = None + MC2ColumnSeqParallelLinear = None + MC2RowSeqParallelLinear = None + MC2ColumnParallelCoreLinear = None + MC2RowParallelCoreLinear = None diff --git a/paddlenlp/transformers/mc2_seqence_parallel_linear.py b/paddlenlp/transformers/mc2_seqence_parallel_linear.py deleted file mode 100644 index c39a78cc6252..000000000000 --- a/paddlenlp/transformers/mc2_seqence_parallel_linear.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import paddle - -try: - import paddle_custom_device -except ImportError: - raise ImportError("Current device does not support MC2!") - -from paddle import distributed as dist -from paddle.autograd import PyLayer - -try: - from paddle.distributed.fleet.utils.sequence_parallel_utils import ( - ColumnSequenceParallelLinear, - RowSequenceParallelLinear, - ) -except: - pass - -__all_gather_recomputation__ = False -if int(os.getenv("MC2_Recompute", 0)): - __all_gather_recomputation__ = True - - -class MC2Column(PyLayer): - @staticmethod - def forward(ctx, input_, weight, group): - ctx.save_for_backward(input_, weight) - - rank = dist.get_rank() - hcomm_info = group.process_group.get_comm_name(rank) - - world_size = group.nranks - output, gather_out = paddle_custom_device.npu.fused_allgather_mm( - input_, - weight, - bias=None, - hcom=hcomm_info, - world_size=world_size, - gather_index=0, - gather_output=(not __all_gather_recomputation__), - comm_turn=0, - ) - - ctx.all_gather_output = gather_out - ctx.world_size = world_size - ctx.group = group - return output - - @staticmethod - def backward(ctx, grad_output): - input_, weight = ctx.saved_tensor() - - if __all_gather_recomputation__: - dim_size = input_.shape - dim_size[0] = dim_size[0] * ctx.world_size - all_gather_output = paddle.empty(dim_size, dtype=input_.dtype) - all_gather_output.stop_gradient = True - all_gather_work = dist.stream.all_gather(all_gather_output, input_, group=ctx.group, sync_op=False) - else: - all_gather_output = ctx.all_gather_output - - grad_input = paddle.matmul(grad_output, weight, transpose_y=True) - sub_grad_input = paddle.empty(input_.shape, dtype=input_.dtype) - reduce_scatter_work = dist.stream.reduce_scatter(sub_grad_input, grad_input, group=ctx.group, sync_op=False) - - if __all_gather_recomputation__: - all_gather_work.wait() - - grad_weight = paddle.matmul(all_gather_output, grad_output, transpose_x=True) - reduce_scatter_work.wait() - - return sub_grad_input, grad_weight - - -class MC2Row(PyLayer): - @staticmethod - def forward(ctx, input_, weight, group): - ctx.save_for_backward(input_, weight) - - rank = dist.get_rank() - hcomm_info = group.process_group.get_comm_name(rank) - world_size = group.nranks - - output = paddle_custom_device.npu.fused_mm_reduce_scatter( - input_, - weight, - bias=None, - hcom=hcomm_info, - world_size=world_size, - reduce_op="sum", - comm_turn=0, - ) - - ctx.hcomm_info = 
hcomm_info - ctx.world_size = world_size - return output - - @staticmethod - def backward(ctx, grad_output): - input_, weight = ctx.saved_tensor() - hcomm_info = ctx.hcomm_info - world_size = ctx.world_size - - grad_input, all_gather_grad_output = paddle_custom_device.npu.fused_allgather_mm( - grad_output, - weight.t(), - bias=None, - hcom=hcomm_info, - world_size=world_size, - gather_index=0, - gather_output=True, - comm_turn=0, - ) - grad_weight = paddle.matmul(input_, all_gather_grad_output, transpose_x=True) - - return grad_input, grad_weight - - -class MC2ColumnSeqParallelLinear(ColumnSequenceParallelLinear): - def forward(self, x): - output = MC2Column.apply(x, self.weight, self.model_parallel_group) - output = output + self.bias if self.bias is not None else output - return output - - -class MC2RowSeqParallelLinear(RowSequenceParallelLinear): - def forward(self, x): - output = MC2Row.apply(x, self.weight, self.model_parallel_group) - output = output + self.bias if self.bias is not None else output - return output From 3105c18b013e1cdcbf860af1c6c54f4e33c88ee7 Mon Sep 17 00:00:00 2001 From: yujun <573009727@qq.com> Date: Wed, 24 Apr 2024 15:10:53 +0800 Subject: [PATCH 10/27] fix 0f428bbe47daed3cd861f7047c3e9acbec4ea0b1 try import --- paddlenlp/peft/lora/lora_layers.py | 11 ++++++++++- paddlenlp/peft/lora/lora_model.py | 29 ++++++++++++++++++----------- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/paddlenlp/peft/lora/lora_layers.py b/paddlenlp/peft/lora/lora_layers.py index a31f7c3a33b1..7ac40ed0ba66 100644 --- a/paddlenlp/peft/lora/lora_layers.py +++ b/paddlenlp/peft/lora/lora_layers.py @@ -33,7 +33,16 @@ mark_as_sequence_parallel_parameter, ) except: - pass + AllGatherOp = None + ReduceScatterOp = None + mark_as_sequence_parallel_parameter = None + + class ColumnSequenceParallelLinear: + pass + + class RowSequenceParallelLinear: + pass + from paddlenlp.transformers.mc2_parallel_linear import ( MC2ColumnParallelCoreLinear, diff --git a/paddlenlp/peft/lora/lora_model.py b/paddlenlp/peft/lora/lora_model.py index 41ab1e681e24..ebadf39a6a55 100644 --- a/paddlenlp/peft/lora/lora_model.py +++ b/paddlenlp/peft/lora/lora_model.py @@ -52,18 +52,25 @@ RowSequenceParallelLinear, ) - from .lora_layers import ( - ColumnParallelLoRALinear, - ColumnParallelLoRAMergedLinear, - ColumnSequenceParallelLoRALinear, - LoRAConv2D, - LoRALinear, - LoRAMergedLinear, - RowParallelLoRALinear, - RowSequenceParallelLoRALinear, - ) except: - pass + + class ColumnSequenceParallelLinear: + pass + + class RowSequenceParallelLinear: + pass + + +from .lora_layers import ( + ColumnParallelLoRALinear, + ColumnParallelLoRAMergedLinear, + ColumnSequenceParallelLoRALinear, + LoRAConv2D, + LoRALinear, + LoRAMergedLinear, + RowParallelLoRALinear, + RowSequenceParallelLoRALinear, +) try: from ...quantization.quantization_linear import ( From 89daaa31776dcf22d34dcf2d830d82916452da20 Mon Sep 17 00:00:00 2001 From: Siming Dai <908660116@qq.com> Date: Fri, 26 Apr 2024 17:59:25 +0800 Subject: [PATCH 11/27] [Trainer] Fix sharding overlap bug (#8334) --- paddlenlp/trainer/training_args.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/paddlenlp/trainer/training_args.py b/paddlenlp/trainer/training_args.py index 8ebd218447fc..2ed9d343ceaa 100644 --- a/paddlenlp/trainer/training_args.py +++ b/paddlenlp/trainer/training_args.py @@ -1020,6 +1020,11 @@ def __post_init__(self): enable_dp_comm_overlap and enable_sharding_comm_overlap ), "dp_comm_overlap and sharding_comm_overlap cannot be enabled at the same time" 
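
The TrainingArguments change above adds a dependency check on top of the existing mutual-exclusion assert. Roughly, in isolation (flag names follow the patch, the standalone helper is illustrative):

def check_pp_comm_overlap(pipeline_parallel_config: str, amp_master_grad: bool) -> None:
    enable_dp_comm_overlap = "enable_dp_comm_overlap" in pipeline_parallel_config
    enable_sharding_comm_overlap = "enable_sharding_comm_overlap" in pipeline_parallel_config
    assert not (
        enable_dp_comm_overlap and enable_sharding_comm_overlap
    ), "dp_comm_overlap and sharding_comm_overlap cannot be enabled at the same time"
    if enable_sharding_comm_overlap and not amp_master_grad:
        raise ValueError(
            "If `enable_sharding_comm_overlap` in pipeline_parallel_configs, "
            "`amp_master_grad` must be True."
        )
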
+ if enable_sharding_comm_overlap and not self.amp_master_grad: + raise ValueError( + "If `enable_sharding_comm_overlap` in pipeline_parallel_configs, `amp_master_grad` must be True." + ) + dygraph_pp_configs = { "delay_scale_loss": True if "enable_delay_scale_loss" in pipeline_parallel_config else False, "dp_comm_overlap": enable_dp_comm_overlap, From 27d0e60cf7bcabce547b80b34c586bdf46d972a9 Mon Sep 17 00:00:00 2001 From: Kunbo Ding Date: Tue, 7 May 2024 17:52:04 +0800 Subject: [PATCH 12/27] Remove truncate (#8375) Remove truncate --- paddlenlp/trainer/trainer.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py index 3e8fc333fe95..419349e02d21 100644 --- a/paddlenlp/trainer/trainer.py +++ b/paddlenlp/trainer/trainer.py @@ -2721,11 +2721,15 @@ def evaluation_loop( # Number of losses has been rounded to a multiple of batch_size and in a distributed training, the number of # samplers has been rounded to a multiple of batch_size, so we truncate. if all_losses is not None: - all_losses = all_losses[:num_samples] + all_losses = all_losses[: num_samples * int(self.args.world_size / self.args.dataset_world_size)] if all_preds is not None: - all_preds = nested_truncate(all_preds, num_samples) + all_preds = nested_truncate( + all_preds, num_samples * int(self.args.world_size / self.args.dataset_world_size) + ) if all_labels is not None: - all_labels = nested_truncate(all_labels, num_samples) + all_labels = nested_truncate( + all_labels, num_samples * int(self.args.world_size / self.args.dataset_world_size) + ) model.train() From 9e4a4f473322f53e647b43b9031568a705e48080 Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Thu, 9 May 2024 11:22:23 +0800 Subject: [PATCH 13/27] Fix llama3 eot id. 
(#8373) --- paddlenlp/transformers/llama/tokenizer.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/paddlenlp/transformers/llama/tokenizer.py b/paddlenlp/transformers/llama/tokenizer.py index 6f19530c05cb..46c16c58b427 100644 --- a/paddlenlp/transformers/llama/tokenizer.py +++ b/paddlenlp/transformers/llama/tokenizer.py @@ -295,11 +295,12 @@ def _pad( ENDOFTEXT = "<|end_of_text|>" IMSTART = "<|start_header_id|>" IMEND = "<|end_header_id|>" +EOTID = "<|eot_id|>" # as the default behavior is changed to allow special tokens in # regular texts, the surface forms of special tokens need to be # as different as possible to minimize the impact -EXTRAS = tuple((f"<|reserved_special_token_{i}|>" for i in range(250))) -SPECIAL_TOKENS = (BEGINOFTEXT, ENDOFTEXT) + EXTRAS[0:4] + (IMSTART, IMEND) + EXTRAS[4:] +EXTRAS = tuple((f"<|reserved_special_token_{i}|>" for i in range(251))) +SPECIAL_TOKENS = (BEGINOFTEXT, ENDOFTEXT) + EXTRAS[0:4] + (IMSTART, IMEND, EXTRAS[4], EOTID) + EXTRAS[5:] tiktoken = None @@ -354,9 +355,11 @@ def __init__( self.tokenizer = enc # type: tiktoken.Encoding + self.bod_id = self.special_tokens[BEGINOFTEXT] self.eod_id = self.special_tokens[ENDOFTEXT] self.start_header_id = self.special_tokens[IMSTART] self.end_header_id = self.special_tokens[IMEND] + self.eot_id = self.special_tokens[EOTID] if "pad_token_id" in kwargs: self.pad_token_id = kwargs["pad_token_id"] From debb2ad92d08825f553818e88b971245b50d2433 Mon Sep 17 00:00:00 2001 From: Siming Dai <908660116@qq.com> Date: Mon, 13 May 2024 11:19:28 +0800 Subject: [PATCH 14/27] [Trainer] update distributed dataloader (#8426) * [DistDataloader] Update implementation, add nested.py (#8380) * fix distdataloader, fix eval with dp group (#8420) --- paddlenlp/data/dist_dataloader.py | 193 ++++++------------ .../trainer/plugins/unified_checkpoint.py | 21 +- paddlenlp/trainer/trainer.py | 88 +++++--- paddlenlp/trainer/utils/helper.py | 53 +---- paddlenlp/utils/nested.py | 83 ++++++++ 5 files changed, 216 insertions(+), 222 deletions(-) create mode 100644 paddlenlp/utils/nested.py diff --git a/paddlenlp/data/dist_dataloader.py b/paddlenlp/data/dist_dataloader.py index e97cc60c84a8..5d5c6cc7512c 100644 --- a/paddlenlp/data/dist_dataloader.py +++ b/paddlenlp/data/dist_dataloader.py @@ -12,13 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np import paddle from paddle.distributed import fleet from paddlenlp.utils.log import logger - -_MAX_DATA_DIM = 64 +from paddlenlp.utils.nested import ( + nested_broadcast_tensor, + nested_copy_place, + nested_empty_tensor, + nested_reduce_tensor, +) class DummyDataset(paddle.io.Dataset): @@ -53,6 +56,7 @@ def __init__( timeout=0, worker_init_fn=None, persistent_workers=False, + eval=False, ): if dataset is None: @@ -62,12 +66,15 @@ def __init__( super().__init__(dataset=dataset, batch_sampler=batch_sampler, collate_fn=collate_fn, num_workers=num_workers) self._hcg = fleet.get_hybrid_communicate_group() + self.eval = eval # Init pp data comm group. 
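
With `<|eot_id|>` inserted right after the header tokens and reserved token 4, the special-token block lines up with the ids Meta ships for Llama 3. A quick check of that layout (the 128000 offset is the size of the BPE rank table and is assumed here, it is not part of this diff):

BEGINOFTEXT = "<|begin_of_text|>"
ENDOFTEXT = "<|end_of_text|>"
IMSTART = "<|start_header_id|>"
IMEND = "<|end_header_id|>"
EOTID = "<|eot_id|>"
EXTRAS = tuple(f"<|reserved_special_token_{i}|>" for i in range(251))
SPECIAL_TOKENS = (BEGINOFTEXT, ENDOFTEXT) + EXTRAS[0:4] + (IMSTART, IMEND, EXTRAS[4], EOTID) + EXTRAS[5:]

special_tokens = {tok: 128000 + i for i, tok in enumerate(SPECIAL_TOKENS)}
assert special_tokens[IMSTART] == 128006  # <|start_header_id|>
assert special_tokens[IMEND] == 128007    # <|end_header_id|>
assert special_tokens[EOTID] == 128009    # <|eot_id|>
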
if self._hcg.get_pipe_parallel_world_size() > 1: self._pp_data_group = self._init_dataloader_comm_group() + self._pp_group = self._hcg.get_pipe_parallel_group() else: self._pp_data_group = None + self._pp_group = None self.mp_group = self._hcg.get_model_parallel_group() self.mp_rank = self._hcg.get_model_parallel_rank() @@ -78,10 +85,6 @@ def __init__( sharding_rank = self._hcg.get_sharding_parallel_rank() self._need_data = (self.mp_rank == 0) and (self.pp_rank == 0) - # When needed other data types, we can modify dtype_list. - self.dtype_list = [paddle.int64, paddle.float32, paddle.int32] - self._data_keys_list, self._data_keys_size = None, None - if self._need_data: self._dataloader = paddle.io.DataLoader( dataset, @@ -127,7 +130,6 @@ def _init_dataloader_comm_group(self): parallel_groups = topo.get_comm_list("pipe") for group in parallel_groups: - # only first rank and last rank ranks = [group[0], group[-1]] comm_group = paddle.distributed.new_group(ranks=ranks) if paddle.distributed.get_rank() in ranks: @@ -137,127 +139,68 @@ def _init_dataloader_comm_group(self): def __iter__(self): return self - def __next__(self): - data_keys_size = [0 for i in range(len(self.dtype_list))] - if self._need_data: - data = next(self._dataloader_iter) - data_keys = list(data.keys()) - - for key in data_keys: - if data[key].dtype not in self.dtype_list: - raise ValueError( - f"Dist dataloader requires dtype as `int64`, `float32` or `int32` currently, but got: {data[key].dtype}" + def _broadcast_data(self, data): + process_rank = paddle.distributed.get_rank() + if self.mp_group.nranks > 1: + if process_rank == self.mp_src_rank: + fake_data = [nested_reduce_tensor(data)] + else: + if data is not None: + logger.warning( + f"Your local rank {paddle.distributed.get_rank()} are forbidden to have a state_dict." ) - - data_list, data_keys_list = [], [] - for i, dtype in enumerate(self.dtype_list): - data_list.append([data[key] for key in data_keys if data[key].dtype == dtype]) - data_keys_list.append([key for key in data_keys if data[key].dtype == dtype]) - data_keys_size = [len(keys) for keys in data_keys_list] - - # Broadcast data keys size. - if self._data_keys_size is None: - if self.mp_group.nranks > 1 and self.pp_rank == 0: - paddle.distributed.broadcast_object_list(data_keys_size, src=self.mp_src_rank, group=self.mp_group) - if self._pp_data_group is not None: - paddle.distributed.broadcast_object_list( - data_keys_size, src=self._pp_data_group.ranks[0], group=self._pp_data_group - ) - self._data_keys_size = data_keys_size - - if not self._need_data: - data_keys_list = [[None for i in range(keys_size)] for keys_size in self._data_keys_size] - - # Broadcast data keys name. - if self._data_keys_list is None: - if self.mp_group.nranks > 1 and self.pp_rank == 0: - paddle.distributed.broadcast_object_list(data_keys_list, src=self.mp_src_rank, group=self.mp_group) - if self._pp_data_group is not None: - paddle.distributed.broadcast_object_list( - data_keys_list, src=self._pp_data_group.ranks[0], group=self._pp_data_group - ) - self._data_keys_list = data_keys_list - - # Broadcast data. 
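
Only the first and the last pipeline stage actually consume batches, which is why `_init_dataloader_comm_group` builds a two-rank group per pipeline. The same logic pulled out of the class (assumes fleet has already been initialized with a hybrid topology):

import paddle
from paddle.distributed import fleet

def init_pp_data_comm_group():
    hcg = fleet.get_hybrid_communicate_group()
    topo = hcg.topology()
    comm_group = None
    for group in topo.get_comm_list("pipe"):
        ranks = [group[0], group[-1]]                # first and last pp stage only
        new_group = paddle.distributed.new_group(ranks=ranks)
        if paddle.distributed.get_rank() in ranks:   # keep the group this rank belongs to
            comm_group = new_group
    return comm_group
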
- if not self._need_data: - data_list = [[None for i in range(keys_size)] for keys_size in self._data_keys_size] - - if self.mp_group.nranks > 1 and self.pp_rank == 0: - for i, dtype in enumerate(self.dtype_list): - if self._data_keys_size[i] > 0: - data_list[i] = broadcast_data_list( - data_list[i], dtype, self.mp_rank, self.mp_group, self.mp_src_rank + fake_data = [None] + if self._pp_group is not None: + if process_rank == self._pp_group.ranks[0]: + fake_data = [nested_reduce_tensor(data)] + else: + if data is not None: + logger.warning( + f"Your local rank {paddle.distributed.get_rank()} are forbidden to have a state_dict." ) + fake_data = [None] + if self.mp_group.nranks > 1 and self.pp_rank == 0: + paddle.distributed.broadcast_object_list( + fake_data, + src=self.mp_src_rank, + group=self.mp_group, + ) + if self._pp_group is not None: + paddle.distributed.broadcast_object_list( + fake_data, + src=self._pp_group.ranks[0], + group=self._pp_group, + ) - if self._pp_data_group is not None: - # Note(daisimng): In last stage of pp, we don't need input_ids. - # It will be removed in future. - for i, dtype in enumerate(self.dtype_list): - if self._data_keys_size[i] > 0: - data_list[i] = broadcast_data_list( - data_list[i], - dtype, - self.pp_rank, - self._pp_data_group, - self._pp_data_group.ranks[0], - ) - - out_data = {} - for keys, datas in zip(self._data_keys_list, data_list): - out_data.update([(k, d) for k, d in zip(keys, datas)]) - - return out_data - - -def broadcast_data_list(data_list, datatype, comm_rank=0, comm_group=None, src_rank=0): - """ - Broadcast data from src_rank to all ranks in comm_group. - """ - # Move to GPU and broadcast. - size_cpu = [] - if comm_rank == 0: - for data in data_list: - size_cpu.append(len(data.shape)) - size_cpu += data.shape - size_cpu = size_cpu + [0] * (_MAX_DATA_DIM - len(size_cpu)) - size_cuda = paddle.to_tensor(size_cpu) - paddle.distributed.broadcast(size_cuda, src_rank, group=comm_group).wait() - - size_cpu = size_cuda.tolist() - i = 0 - numel = 0 - sizes = [] - while size_cpu[i] > 0: - rank = size_cpu[i] - this_size = size_cpu[i + 1 : i + 1 + rank] - numel += int(np.prod(this_size)) - sizes.append(this_size) - i += rank + 1 - - if comm_rank == 0: - assert data.dtype == datatype, "input has data type {} which " "is different than {}".format( - data.dtype, datatype - ) - if paddle.is_compiled_with_cuda(): - data_b = paddle.concat([d.cuda().reshape([-1]) for d in data_list], 0) - else: - data_b = paddle.concat([d.reshape([-1]) for d in data_list], 0) + fake_data = fake_data[0] + if fake_data is None: + raise StopIteration - assert numel == sum([d.numel().item() for d in data_list]), (numel, [d.numel().item() for d in data_list]) - else: - if paddle.is_compiled_with_cuda(): - data_b = paddle.empty([numel], dtype=datatype).cuda() - else: - data_b = paddle.empty([numel], dtype=datatype) + dst_pp_group = self._pp_group if self.eval else self._pp_data_group + if self.mp_group.nranks > 1: + if process_rank != self.mp_src_rank: + data = nested_empty_tensor(fake_data) + if dst_pp_group is not None: + if process_rank != dst_pp_group.ranks[0]: + data = nested_empty_tensor(fake_data) - # Broadcast - paddle.distributed.broadcast(data_b, src_rank, group=comm_group).wait() + if self.mp_group.nranks > 1 and self.pp_rank == 0: + data = nested_broadcast_tensor(data, src=self.mp_src_rank, group=self.mp_group) + if dst_pp_group is not None: + data = nested_broadcast_tensor(data, src=dst_pp_group.ranks[0], group=dst_pp_group) + # for pp1 - pp_{n-1}, Paddle need to 
recevie empty dict for pipeline parallel. + if data is None: + data = {} - ret = [] - offset = 0 - for size in sizes: - numel = int(np.prod(size)) - ret.append(data_b[offset : offset + numel].reshape(size)) - offset += numel + return data - return ret + def __next__(self): + data = None + if self._need_data: + try: + data = next(self._dataloader_iter) + data = nested_copy_place(data, place=paddle.framework._current_expected_place()) + except: + pass + data = self._broadcast_data(data) + return data diff --git a/paddlenlp/trainer/plugins/unified_checkpoint.py b/paddlenlp/trainer/plugins/unified_checkpoint.py index f8b62a15b77e..9a14ebba2882 100644 --- a/paddlenlp/trainer/plugins/unified_checkpoint.py +++ b/paddlenlp/trainer/plugins/unified_checkpoint.py @@ -62,6 +62,7 @@ SAFE_WEIGHTS_NAME, ) from paddlenlp.utils.log import logger +from paddlenlp.utils.nested import nested_copy, nested_copy_place if is_safetensors_available(): from safetensors import safe_open @@ -1876,26 +1877,6 @@ def mapping_optimizer_tp_actions(tp_actions, optimizer_loaded_keys): return new_actions -def nested_copy(inputs): - if isinstance(inputs, dict): - outputs = {} - for key in list(inputs.keys()): - outputs[key] = nested_copy(inputs[key]) - return outputs - return inputs - - -def nested_copy_place(inputs, place=None, blocking=False): - if isinstance(inputs, dict): - outputs = {} - for key in list(inputs.keys()): - outputs[key] = nested_copy_place(inputs[key], place, blocking) - return outputs - if isinstance(inputs, paddle.Tensor): - inputs = inputs if inputs.place == place else inputs._copy_to(place, blocking) - return inputs - - def flatten_list(nested_list): flattened_list = [] for item in nested_list: diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py index 419349e02d21..bf83420acf85 100644 --- a/paddlenlp/trainer/trainer.py +++ b/paddlenlp/trainer/trainer.py @@ -1419,8 +1419,6 @@ def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoa if is_datasets_available() and eval_dataset is not None and isinstance(eval_dataset, datasets.Dataset): eval_dataset = self._remove_unused_columns(eval_dataset, description="evaluation") - _DataLoader = DistDataLoader if self.args.distributed_dataloader else DataLoader - if self._is_iterable_dataset(eval_dataset): if self.args.dataset_world_size > 1: eval_dataset = IterableDatasetShard( @@ -1431,24 +1429,41 @@ def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoa process_index=self.args.dataset_rank, ) - return _DataLoader( - eval_dataset, - batch_size=self.args.per_device_eval_batch_size, - collate_fn=self.data_collator, - num_workers=self.args.dataloader_num_workers, - ) + if self.args.distributed_dataloader: + return DistDataLoader( + eval_dataset, + batch_size=self.args.per_device_eval_batch_size, + collate_fn=self.data_collator, + num_workers=0, + eval=True, + ) + else: + return DataLoader( + eval_dataset, + batch_size=self.args.per_device_eval_batch_size, + collate_fn=self.data_collator, + num_workers=0, + ) eval_sampler = self._get_eval_sampler(eval_dataset) if self.args.distributed_dataloader: logger.info("Eval using DistDataLoader.") - return _DataLoader( - eval_dataset, - batch_sampler=eval_sampler, - collate_fn=self.data_collator, - num_workers=self.args.dataloader_num_workers, - ) + return DistDataLoader( + eval_dataset, + batch_sampler=eval_sampler, + collate_fn=self.data_collator, + num_workers=self.args.dataloader_num_workers, + eval=True, + ) + else: + return DataLoader( + eval_dataset, + 
batch_sampler=eval_sampler, + collate_fn=self.data_collator, + num_workers=self.args.dataloader_num_workers, + ) def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: """ @@ -1469,8 +1484,6 @@ def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: if is_datasets_available() and test_dataset is not None and isinstance(test_dataset, datasets.Dataset): test_dataset = self._remove_unused_columns(test_dataset, description="test") - _DataLoader = DistDataLoader if self.args.distributed_dataloader else DataLoader - if self._is_iterable_dataset(test_dataset): if self.args.dataset_world_size > 1: test_dataset = IterableDatasetShard( @@ -1481,25 +1494,42 @@ def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: process_index=self.args.dataset_rank, ) - return _DataLoader( - test_dataset, - batch_size=self.args.per_device_eval_batch_size * self.world_size, - collate_fn=self.data_collator, # _get_collator_with_removed_columns - num_workers=self.args.dataloader_num_workers, - ) + if self.args.distributed_dataloader: + return DistDataLoader( + test_dataset, + batch_size=self.args.per_device_eval_batch_size * self.world_size, + collate_fn=self.data_collator, # _get_collator_with_removed_columns + num_workers=0, + eval=True, + ) + else: + return DataLoader( + test_dataset, + batch_size=self.args.per_device_eval_batch_size * self.world_size, + collate_fn=self.data_collator, # _get_collator_with_removed_columns + num_workers=0, + ) test_sampler = self._get_eval_sampler(test_dataset) if self.args.distributed_dataloader: logger.info("Test using DistDataLoader.") - # We use the same batch_size as for eval. - return _DataLoader( - test_dataset, - batch_sampler=test_sampler, - collate_fn=self.data_collator, - drop_last=self.args.dataloader_drop_last, - ) + # We use the same batch_size as for eval. 
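
Both loaders above now branch the same way: `DistDataLoader(..., eval=True)` when `distributed_dataloader` is enabled, a plain `paddle.io.DataLoader` otherwise. Condensed into one helper (import path and argument plumbing are assumptions for the sketch):

from paddle.io import DataLoader

from paddlenlp.data import DistDataLoader  # assumed export path

def build_eval_dataloader(args, dataset, sampler, collate_fn):
    kwargs = dict(
        batch_sampler=sampler,
        collate_fn=collate_fn,
        num_workers=args.dataloader_num_workers,
    )
    if args.distributed_dataloader:
        # eval=True makes the loader broadcast batches to every pipeline stage
        return DistDataLoader(dataset, eval=True, **kwargs)
    return DataLoader(dataset, **kwargs)
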
+ return DistDataLoader( + test_dataset, + batch_sampler=test_sampler, + collate_fn=self.data_collator, + drop_last=self.args.dataloader_drop_last, + eval=True, + ) + else: + return DataLoader( + test_dataset, + batch_sampler=test_sampler, + collate_fn=self.data_collator, + drop_last=self.args.dataloader_drop_last, + ) def create_optimizer_and_scheduler(self, num_training_steps: int): """ diff --git a/paddlenlp/trainer/utils/helper.py b/paddlenlp/trainer/utils/helper.py index ff68e51f127b..25f593f71e35 100644 --- a/paddlenlp/trainer/utils/helper.py +++ b/paddlenlp/trainer/utils/helper.py @@ -16,8 +16,6 @@ # This file is modified from # https://github.com/huggingface/transformers/blob/main/src/transformers -import collections -import copy import os from typing import Any, Optional @@ -27,6 +25,11 @@ from paddle.distributed import fleet from paddlenlp.utils.log import logger +from paddlenlp.utils.nested import ( + nested_broadcast_tensor, + nested_empty_tensor, + nested_reduce_tensor, +) __all__ = [ "distributed_concat", @@ -180,52 +183,6 @@ def distributed_file(filename): return filename -TensorHolder = collections.namedtuple("TensorHolder", ["shape", "dtype", "name"]) - - -def nested_reduce_tensor(tensor): - if isinstance(tensor, dict): - # copy tensor since it will be inplace modified dict - tensor = copy.copy(tensor) - for key in list(tensor.keys()): - tensor[key] = nested_reduce_tensor(tensor[key]) - if isinstance(tensor, (tuple, list)): - return type(tensor)(nested_reduce_tensor(t) for t in tensor) - - if isinstance(tensor, paddle.Tensor): - return TensorHolder(tensor.shape, tensor.dtype, tensor.name) - - return tensor - - -def nested_empty_tensor(tensor): - if isinstance(tensor, dict): - for key in list(tensor.keys()): - tensor[key] = nested_empty_tensor(tensor[key]) - if isinstance(tensor, list): - return type(tensor)(nested_empty_tensor(t) for t in tensor) - - # TensorHolder is tuple - if isinstance(tensor, TensorHolder): - t = paddle.empty(tensor.shape, dtype=tensor.dtype, name=tensor.name) - t.name = tensor.name - return t - - return tensor - - -def nested_broadcast_tensor(tensor, src=0, group=None): - if isinstance(tensor, dict): - for key in list(tensor.keys()): - tensor[key] = nested_broadcast_tensor(tensor[key], src=src, group=group) - if isinstance(tensor, list): - return type(tensor)(nested_broadcast_tensor(t, src=src, group=group) for t in tensor) - - if isinstance(tensor, paddle.Tensor): - paddle.distributed.broadcast(tensor, src=src, group=group, sync_op=True) - return tensor - - def broadcast_dp_optimizer(state_dict): if paddle.distributed.get_world_size() <= 1: return state_dict diff --git a/paddlenlp/utils/nested.py b/paddlenlp/utils/nested.py new file mode 100644 index 000000000000..27942b8cb256 --- /dev/null +++ b/paddlenlp/utils/nested.py @@ -0,0 +1,83 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import collections +import copy + +import paddle + +TensorHolder = collections.namedtuple("TensorHolder", ["shape", "dtype", "name"]) + + +def nested_reduce_tensor(tensor): + if isinstance(tensor, dict): + # copy tensor since it will be inplace modified dict + tensor = copy.copy(tensor) + for key in list(tensor.keys()): + tensor[key] = nested_reduce_tensor(tensor[key]) + if isinstance(tensor, (tuple, list)): + return type(tensor)(nested_reduce_tensor(t) for t in tensor) + + if isinstance(tensor, paddle.Tensor): + return TensorHolder(tensor.shape, tensor.dtype, tensor.name) + + return tensor + + +def nested_empty_tensor(tensor): + if isinstance(tensor, dict): + for key in list(tensor.keys()): + tensor[key] = nested_empty_tensor(tensor[key]) + if isinstance(tensor, list): + return type(tensor)(nested_empty_tensor(t) for t in tensor) + + # TensorHolder is tuple + if isinstance(tensor, TensorHolder): + t = paddle.empty(tensor.shape, dtype=tensor.dtype, name=tensor.name) + t.name = tensor.name + return t + + return tensor + + +def nested_broadcast_tensor(tensor, src=0, group=None): + if isinstance(tensor, dict): + for key in list(tensor.keys()): + tensor[key] = nested_broadcast_tensor(tensor[key], src=src, group=group) + if isinstance(tensor, list): + return type(tensor)(nested_broadcast_tensor(t, src=src, group=group) for t in tensor) + + if isinstance(tensor, paddle.Tensor): + paddle.distributed.broadcast(tensor, src=src, group=group, sync_op=True) + return tensor + + +def nested_copy(inputs): + if isinstance(inputs, dict): + outputs = {} + for key in list(inputs.keys()): + outputs[key] = nested_copy(inputs[key]) + return outputs + return inputs + + +def nested_copy_place(inputs, place=None, blocking=False): + if isinstance(inputs, dict): + outputs = {} + for key in list(inputs.keys()): + outputs[key] = nested_copy_place(inputs[key], place, blocking) + return outputs + if isinstance(inputs, paddle.Tensor): + inputs = inputs if inputs.place == place else inputs._copy_to(place, blocking) + return inputs From fc860a3289804fbaf197d12c6d858d0d79e741af Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Thu, 16 May 2024 17:09:22 +0800 Subject: [PATCH 15/27] Fix load RNG compatibility. (#8451) --- paddlenlp/trainer/trainer.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py index bf83420acf85..e1d59e4cb747 100644 --- a/paddlenlp/trainer/trainer.py +++ b/paddlenlp/trainer/trainer.py @@ -1591,16 +1591,13 @@ def _load_rng_state(self, checkpoint): if os.path.isfile(rng_file): rng_file_list = paddle.load(rng_file, return_numpy=True) paddle.distributed.broadcast_object_list(rng_file_list, src=0) - # if rng_file_list still empty, then use old style rng_state + # if rng_file_list still empty, not log rng state. if rng_file_list[0] is None: - rng_file = os.path.join(checkpoint, f"rng_state_{process_index}.pth") - if not os.path.isfile(rng_file): - logger.info( - f"Didn't find an RNG file for process {process_index}, if you are resuming a training that " - "wasn't launched in a distributed fashion, reproducibility is not guaranteed." - ) - return - checkpoint_rng_state = paddle.load(rng_file, return_numpy=True) + logger.info( + f"Didn't find an RNG file for process {process_index}, if you are resuming a training that " + "wasn't launched in a distributed fashion, reproducibility is not guaranteed." 
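
The helpers above move the metadata-then-tensor broadcast pattern into `paddlenlp.utils.nested`. A sketch of how they compose (assumes an initialized distributed environment; the wrapper function is illustrative):

import paddle.distributed as dist

from paddlenlp.utils.nested import (
    nested_broadcast_tensor,
    nested_empty_tensor,
    nested_reduce_tensor,
)

def broadcast_batch(batch, src=0, group=None):
    # rank `src` turns its dict of tensors into lightweight TensorHolder
    # metadata; every other rank receives that metadata as a Python object.
    holder = [nested_reduce_tensor(batch)] if dist.get_rank() == src else [None]
    dist.broadcast_object_list(holder, src=src, group=group)
    if dist.get_rank() != src:
        batch = nested_empty_tensor(holder[0])  # allocate tensors from shape/dtype
    # finally broadcast the actual tensor values into the allocated buffers
    return nested_broadcast_tensor(batch, src=src, group=group)
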
+ ) + return else: checkpoint_rng_state = rng_file_list[process_index] else: From 08898bf1e0429db3da6d0b3e8a95e8b7d8c817d7 Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Mon, 20 May 2024 13:05:30 +0800 Subject: [PATCH 16/27] Cherry-Pick fast_safe_open (#8458) * [Performance] Optimize unified checkpoint save/load speed. (#8204) * opt unified checkpoint save/load speed. --- .../trainer/plugins/unified_checkpoint.py | 51 +-- paddlenlp/trainer/trainer.py | 1 + paddlenlp/transformers/conversion_utils.py | 16 +- paddlenlp/transformers/model_utils.py | 51 ++- paddlenlp/utils/safetensors.py | 312 ++++++++++++++++++ tests/trainer/test_unified_checkpoint.py | 40 +++ .../clap/test_feature_extraction.py | 1 + .../ernie_vil/test_image_processing.py | 1 + .../speecht5/test_feature_extraction.py | 1 + tests/transformers/test_safetensors.py | 57 ++++ 10 files changed, 490 insertions(+), 41 deletions(-) create mode 100644 paddlenlp/utils/safetensors.py create mode 100644 tests/transformers/test_safetensors.py diff --git a/paddlenlp/trainer/plugins/unified_checkpoint.py b/paddlenlp/trainer/plugins/unified_checkpoint.py index 9a14ebba2882..a8e1199a59b8 100644 --- a/paddlenlp/trainer/plugins/unified_checkpoint.py +++ b/paddlenlp/trainer/plugins/unified_checkpoint.py @@ -30,6 +30,7 @@ from paddlenlp.transformers.model_utils import ( PretrainedModel, _load_state_dict_into_model, + faster_set_state_dict, get_parameter_dtype, load_state_dict, unwrap_model, @@ -65,9 +66,10 @@ from paddlenlp.utils.nested import nested_copy, nested_copy_place if is_safetensors_available(): - from safetensors import safe_open + # from safetensors import safe_open from safetensors.numpy import save_file as safe_save_file + from paddlenlp.utils.safetensors import fast_safe_open as safe_open FP32_MASTER = "fp32_master_0" optimizer_scalar_name = [ @@ -91,6 +93,11 @@ async_save_queue = [] +DEST_PLACE = paddle.CPUPlace() +if paddle.device.is_compiled_with_cuda(): + DEST_PLACE = paddle.CUDAPinnedPlace() + + class UnifiedCheckpointOption(ExplicitEnum): """ "- skip_save_model_weight: do not save model weights when the masters weight exist\n" @@ -196,7 +203,6 @@ def load_unified_checkpoint(args, model, optimizer, resume_from_checkpoint: str, Returns: None """ - if paddle.distributed.get_world_size() <= 1: load_single_card_checkpoint(args, model, resume_from_checkpoint) return @@ -222,7 +228,6 @@ def load_unified_checkpoint_locally(args, model, resume_from_checkpoint: str, sa pretrained_model_name_or_path=resume_from_checkpoint, index_filename=os.path.join(resume_from_checkpoint, index_filename), ) - loaded_keys = sharded_metadata["all_checkpoint_keys"] model_state_dict = get_expected_state_dict(model) @@ -266,7 +271,9 @@ def _remove_unused_keys( else: tp_actions = model.get_tensor_parallel_convert_actions(model.config, loaded_keys, ignore_error=True) # Here we use expected_keys to optimize weights loading for pipeline model. 
Only works for safetensors - state_dict = load_state_dict(shard_file, tp_actions if pre_tensor_parallel_split else None, expected_keys) + state_dict = load_state_dict( + shard_file, tp_actions if pre_tensor_parallel_split else None, expected_keys, device="expected" + ) if not pre_tensor_parallel_split: # Since we load all keys but we only need one of pipeline stages @@ -279,11 +286,12 @@ def _remove_unused_keys( None, model.config, state_dict=state_dict, ignore_error=len(resolved_archive_file) > 1 ) - error_msgs += _load_state_dict_into_model(model, state_dict, "") + # error_msgs += _load_state_dict_into_model(model, state_dict, "") + error_msgs += faster_set_state_dict(model, state_dict, strict_dtype=False) # force memory release del state_dict - gc.collect() + # gc.collect() if len(error_msgs) > 0: error_msg = "\n\t".join(error_msgs) @@ -337,6 +345,7 @@ def unified_checkpoint_into_shards( tp_actions = model_to_save.get_tensor_parallel_convert_actions( model_to_save.config, state_dict.keys(), is_split=False, ignore_error=True ) + logger.info("Unified model tensor parallel weights in shards") state_dict = merge_tensor_parallel_with_shard(state_dict, tp_actions, all_filter_keys) # build index json file @@ -490,6 +499,7 @@ def load_unified_optimizer_locally(args, model, optimizer, resume_from_checkpoin # This should always be a list but, just to be sure. if not isinstance(resolved_archive_file, list): resolved_archive_file = [resolved_archive_file] + if len(resolved_archive_file) > 1: resolved_archive_file = tqdm(resolved_archive_file, desc="Loading optimizer shards") @@ -537,10 +547,10 @@ def load_resolved_archive_file(resolved_archive_file, sharded_metadata, expected tp_actions = mapping_optimizer_tp_actions(tp_actions, expected_keys) # Here we use expected_keys to optimize weights loading for pipeline model. 
Only works for safetensors - state_dict = load_state_dict(shard_file, tp_actions, expected_keys) + state_dict = load_state_dict(shard_file, tp_actions, expected_keys, device="expected") else: # for pipeline model, we don't need to use tp_actions - state_dict = load_state_dict(shard_file, None, expected_keys) + state_dict = load_state_dict(shard_file, None, expected_keys, device="expected") returned_state_dict.update(state_dict) # force memory release @@ -553,7 +563,6 @@ def load_resolved_archive_file(resolved_archive_file, sharded_metadata, expected state_dict_master_weight = load_resolved_archive_file( resolved_archive_file_mw, sharded_metadata_mw, expected_keys_mw, is_master_weights=True ) - # rename optimizer param for key in list(state_dict_optim.keys()): key_name = key.split("/") @@ -562,13 +571,13 @@ def load_resolved_archive_file(resolved_archive_file, sharded_metadata, expected key_name = "_".join([static_name, FP32_MASTER, key_name[1]]) else: key_name = "_".join([static_name, key_name[1]]) - returned_optim_state_dict[key_name] = state_dict_optim[key] + returned_optim_state_dict[key_name] = state_dict_optim.pop(key) returned_optim_state_dict[key_name].name = key_name if has_master_weights: for key in list(state_dict_master_weight.keys()): static_name = struct2static_name_mappings[key] - returned_optim_state_dict["master_weights"][static_name] = state_dict_master_weight[key] + returned_optim_state_dict["master_weights"][static_name] = state_dict_master_weight.pop(key) returned_optim_state_dict["master_weights"][static_name].name = "_".join([static_name, FP32_MASTER]) returned_optim_state_dict = nested_copy_place( @@ -640,6 +649,7 @@ def unified_optimizer_into_shards( tp_actions = model.get_tensor_parallel_convert_actions( model.config, model_keys, is_split=False, ignore_error=True ) + logger.info("Unified optimizer tensor parallel in shards") optim_state_dict = merge_tensor_parallel_for_optimizer( optim_state_dict, tp_actions, @@ -648,6 +658,7 @@ def unified_optimizer_into_shards( paddle.device.cuda.empty_cache() if master_weights is not None: + logger.info("Unified master weight tensor parallel in shards") master_weights = merge_tensor_parallel_for_optimizer( master_weights, tp_actions, @@ -703,7 +714,6 @@ def unified_optimizer_into_shards( def check_unified_checkpoint(args, model, resume_from_checkpoint, safe_serialization=False): index_filename = select_model_weight_index(args, model, resume_from_checkpoint, safe_serialization, local=False) index_filename = os.path.join(resume_from_checkpoint, index_filename) - # Find index json file and distribute this file in global group. if distributed_isfile(index_filename): distributed_file(index_filename) @@ -1605,7 +1615,9 @@ def gather_sharded_object(index_file, total_size, is_optimizer=False): tp_group = hcg.get_model_parallel_group() pp_group = hcg.get_pipe_parallel_group() - logger.info("Unified checkpoint generating sharded_index json files.") + logger.info( + f"Unified checkpoint: generating sharded_index json files for {'optimizer or master weight' if is_optimizer else 'model weight'}." 
+ ) if tp_group.nranks > 1: dist.all_gather_object(index_file_list, index_file, tp_group) @@ -1714,8 +1726,6 @@ def filter_params(model_to_save, state_dict, is_optimizer=False): def merge_tensor_parallel_with_shard(state_dict, tp_actions, all_filter_keys): - logger.info("Unified checkpoint merge tensor parallel in shards") - hcg = fleet.get_hybrid_communicate_group() tp_group = hcg.get_model_parallel_group() tp_rank = tp_group.rank @@ -1741,7 +1751,7 @@ def merge_tensor_parallel_with_shard(state_dict, tp_actions, all_filter_keys): action = tp_actions.pop(key) tensor = action(ret) if is_dst else None else: - tensor = tensor._copy_to(paddle.CPUPlace(), False) if is_dst else None + tensor = tensor._copy_to(DEST_PLACE, False) if is_dst else None if is_dst: state_dict_to_save[key] = tensor @@ -1754,8 +1764,7 @@ def merge_tensor_parallel_with_shard(state_dict, tp_actions, all_filter_keys): def merge_tensor_parallel_for_optimizer(state_dict, tp_actions, all_filter_keys): - logger.info("Unified optimizer tensor parallel in shards") - + # Core function for UC hcg = fleet.get_hybrid_communicate_group() tp_group = hcg.get_model_parallel_group() tp_rank = tp_group.rank @@ -1773,15 +1782,13 @@ def merge_tensor_parallel_for_optimizer(state_dict, tp_actions, all_filter_keys) if model_key in tp_actions: # for example: beta1, beta2 if tensor.numel().item() == 1: - tensor = ( - tensor._copy_to(paddle.CPUPlace(), False) if is_dst else None - ) # Need broadcast when loaded + tensor = tensor._copy_to(DEST_PLACE, False) if is_dst else None # Need broadcast when loaded else: ret = distributed_gather(tensor, dst=j, group=tp_group, offload=False) action = tp_actions[model_key] tensor = action(ret) if is_dst else None else: - tensor = tensor._copy_to(paddle.CPUPlace(), False) if is_dst else None + tensor = tensor._copy_to(DEST_PLACE, False) if is_dst else None if is_dst: state_dict_to_save[filter_keys[i]] = tensor diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py index e1d59e4cb747..746b7e252516 100644 --- a/paddlenlp/trainer/trainer.py +++ b/paddlenlp/trainer/trainer.py @@ -2419,6 +2419,7 @@ def _load_optimizer_and_scheduler(self, checkpoint): self.runtime_timer.stop() return + logger.info("Loading optimizer and scheduler...") if (not self.args.should_load_sharding_stage1_model) and self.args.ignore_load_lr_and_optim: self.runtime_timer.stop() return diff --git a/paddlenlp/transformers/conversion_utils.py b/paddlenlp/transformers/conversion_utils.py index 660e79f6a3e5..ba5169454d0b 100644 --- a/paddlenlp/transformers/conversion_utils.py +++ b/paddlenlp/transformers/conversion_utils.py @@ -285,8 +285,12 @@ def naive_fuse_merge_tp(weight_list, is_column=True, fuse_tensor_parts=2): if isinstance(weight_list[0], np.ndarray): return np.concatenate([reorder[i] for i in index], axis=axis) + else: + tensor = paddle.concat([reorder[i] for i in index], axis=axis) - return paddle.concat([reorder[i] for i in index], axis=axis)._copy_to(paddle.CPUPlace(), False) + if tensor.place.is_gpu_place(): + tensor = tensor._copy_to(paddle.CUDAPinnedPlace(), False) + return tensor def naive_fuse_split_tp( @@ -361,12 +365,18 @@ def normal_fuse_merge_tp(weight_list, is_column=True): if isinstance(weight_list[0], np.ndarray): return np.concatenate(weight_list, axis=-1) else: - return paddle.concat(weight_list, axis=-1)._copy_to(paddle.CPUPlace(), False) + tensor = paddle.concat(weight_list, axis=-1) + if tensor.place.is_gpu_place(): + tensor = tensor._copy_to(paddle.CUDAPinnedPlace(), False) + return tensor else: if 
isinstance(weight_list[0], np.ndarray): return np.concatenate(weight_list, axis=0) else: - return paddle.concat(weight_list, axis=0)._copy_to(paddle.CPUPlace(), False) + tensor = paddle.concat(weight_list, axis=0) + if tensor.place.is_gpu_place(): + tensor = tensor._copy_to(paddle.CUDAPinnedPlace(), False) + return tensor def normal_fuse_split_tp(weight, tensor_parallel_degree, tensor_parallel_rank=None, is_column=True): diff --git a/paddlenlp/transformers/model_utils.py b/paddlenlp/transformers/model_utils.py index 1ddd7e1c2913..dc1c753206c4 100644 --- a/paddlenlp/transformers/model_utils.py +++ b/paddlenlp/transformers/model_utils.py @@ -109,10 +109,13 @@ def unwrap_optimizer(optimizer, optimizer_instances=()): if is_safetensors_available(): - from safetensors import safe_open - from safetensors.numpy import load_file as safe_load_file + # from safetensors import safe_open + # from safetensors.numpy import load_file as safe_load_file from safetensors.numpy import save_file as safe_save_file + from paddlenlp.utils.safetensors import fast_load_file as safe_load_file + from paddlenlp.utils.safetensors import fast_safe_open as safe_open + def prune_linear_layer(layer: nn.Linear, index: paddle.Tensor, dim: int = 0) -> nn.Linear: """ @@ -313,7 +316,7 @@ def get_parameter_dtype(parameter: nn.Layer) -> paddle.dtype: def load_state_dict( - checkpoint_file: Union[str, os.PathLike], tensor_parallel_split_mapping=None, fliter_dict_keys=None + checkpoint_file: Union[str, os.PathLike], tensor_parallel_split_mapping=None, fliter_dict_keys=None, device="cpu" ): """ Reads a PaddlePaddle checkpoint file, returning properly formatted errors if they arise. @@ -346,11 +349,16 @@ def load_state_dict( weight = tensor_parallel_split_mapping[key](py_safe_slice_) else: weight = py_safe_slice_[:] + if device == "expected": + with device_guard(): + weight = paddle.Tensor(weight, zero_copy=True) + weight = weight._copy_to(paddle.framework._current_expected_place(), False) state_dict[key] = weight - for k in list(state_dict.keys()): - with device_guard(): - state_dict[k] = paddle.Tensor(state_dict.pop(k), zero_copy=True) + if device == "cpu": + for k in list(state_dict.keys()): + with device_guard(): + state_dict[k] = paddle.Tensor(state_dict.pop(k), zero_copy=True) return state_dict @@ -672,8 +680,10 @@ def load_sharded_checkpoint(model, folder, variant=None, strict=True, prefer_saf return missing_keys, unexpected_keys -def faster_set_state_dict(model, state_dict): +def faster_set_state_dict(model, state_dict, strict_dtype=True): # the state_dict will be destroied. + unused_keys = set(state_dict.keys()) + unset_keys = set(model.state_dict().keys()) with paddle.no_grad(): for k, v in model.state_dict().items(): if k in state_dict: @@ -683,8 +693,10 @@ def faster_set_state_dict(model, state_dict): f"faster_set_state_dict need state dict with paddle.Tensor, but got {type(v_new)}" ) # 2. cast param / Tensor to dtype + # if v.dtype != v_new.dtype: - raise ValueError(f"for key: {k}, expect dtype {v.dtype}, but got {v_new.dtype}") + if strict_dtype or (not v.is_floating_point() or not v_new.is_floating_point()): + raise ValueError(f"for key: {k}, expect dtype {v.dtype}, but got {v_new.dtype}") # check shape if list(v.shape) != list(v_new.shape): raise ValueError(f"for key: {k}, expect shape {v.shape}, but got {v_new.shape}") @@ -700,9 +712,22 @@ def faster_set_state_dict(model, state_dict): else: new_t = v_new + if not strict_dtype and v.dtype != new_t.dtype: + new_t = new_t.astype(v.dtype) + # 4. 
share Tensor to origin param / Tensor src_tensor = new_t.value().get_tensor() dst_tensor._share_data_with(src_tensor) + unset_keys.remove(k) + unused_keys.remove(k) + + error_msgs = [] + # if len(unset_keys) > 0: + # error_msgs.append(f"Those weight of model is not initialized: {list(unset_keys)}") + if len(unused_keys) > 0: + error_msgs.append(f"Those state dict keys are not using in model: {list(unused_keys)}") + + return error_msgs def _load_state_dict_into_model(model_to_load, state_dict, start_prefix): @@ -734,9 +759,8 @@ def _convert_state_dict_dtype_and_shape(state_dict, model_to_load): def is_0d_or_1d(tensor): return len(tensor.shape) == 0 or list(tensor.shape) == [1] - expected_place = paddle.framework._current_expected_place() for key, value in model_to_load.state_dict().items(): - if key in state_dict: + if key in list(state_dict.keys()): if isinstance(state_dict[key], np.ndarray): raise ValueError( "convert_state_dict_dtype expected paddle.Tensor not numpy.ndarray, plase convert numpy.ndarray to paddle.Tensor" @@ -744,12 +768,7 @@ def is_0d_or_1d(tensor): # confirm parameter cast is executed on the same device as model # TODO: cast(FP32 -> FP16) has diff on different devices, need to fix it if state_dict[key].is_floating_point() and state_dict[key].dtype != value.dtype: - value_pop = state_dict.pop(key) - value_new_place = ( - value_pop if value_pop.place == expected_place else value_pop._copy_to(expected_place, False) - ) - state_dict[key] = paddle.cast(value_new_place, value.dtype)._copy_to(value_pop.place, False) - del value_new_place + state_dict[key] = paddle.cast(state_dict.pop(key), value.dtype) # unified 0d and 1d tensor if is_0d_or_1d(value) and is_0d_or_1d(state_dict[key]): if list(value.shape) != list(state_dict[key].shape): diff --git a/paddlenlp/utils/safetensors.py b/paddlenlp/utils/safetensors.py new file mode 100644 index 000000000000..422a7d09961c --- /dev/null +++ b/paddlenlp/utils/safetensors.py @@ -0,0 +1,312 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
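+
+# Usage sketch (illustrative; the file name is a placeholder). fast_safe_open
+# mirrors the read-side interface of safetensors.safe_open, but parses the
+# header in pure Python and returns numpy arrays:
+#
+#     with fast_safe_open("model.safetensors", framework="np") as f:
+#         for name in f.keys():
+#             lazy = f.get_slice(name)      # reads only the bytes the slice needs
+#             head = lazy[:2, ...]
+#
+#     state = fast_load_file("model.safetensors")  # {name: np.ndarray} for the whole file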
+ +import copy +import json +import mmap +from collections import OrderedDict + +import numpy as np + +__all__ = [ + "fast_safe_open", + "fast_load_file", +] + + +MAX_HEADER_SIZE = 100 * 1000 * 1000 + +dtype_size = { + "BOOL": 1, + "U8": 1, + "I8": 1, + "F8_E5M2": 1, + "F8_E4M3": 1, + "I16": 2, + "U16": 2, + "I32": 4, + "U32": 4, + "I64": 8, + "U64": 8, + "F16": 2, + "BF16": 2, + "F32": 4, + "F64": 8, +} + +numpy_dtype = { + "BOOL": np.bool_, + "U8": np.uint8, + "I8": np.int8, + "F8_E5M2": 1, # no fp8 + "F8_E4M3": 1, # no fp8 + "I16": np.int16, + "U16": np.uint16, + "I32": np.int32, + "U32": np.uint32, + "I64": np.int64, + "U64": np.uint64, + "F16": np.float16, + "BF16": 2, # no bf16 + "F32": np.float32, + "F64": np.float64, +} + + +def getSize(fileobject): + fileobject.seek(0, 2) # move the cursor to the end of the file + size = fileobject.tell() + fileobject.seek(0) # move the cursor to the start of the file + return size + + +def metadata_validate(metadata): + start = 0 + for key, info in metadata.items(): + s, e = info["data_offsets"] + if s != start or e < s: + raise ValueError(f"SafeTensorError::InvalidOffset({key})") + start = e + nelements = np.prod(info["shape"]) + nbytes = nelements * dtype_size[info["dtype"]] + if (e - s) != nbytes: + raise ValueError("SafeTensorError::TensorInvalidInfo") + return start + + +def read_metadata(buffer): + buffer_len = getSize(buffer) + if buffer_len < 8: + raise ValueError("SafeTensorError::HeaderTooSmall") + + n = np.frombuffer(buffer.read(8), dtype=np.uint64).item() + if n > MAX_HEADER_SIZE: + raise ValueError("SafeTensorError::HeaderTooLarge") + + stop = n + 8 + if stop > buffer_len: + raise ValueError("SafeTensorError::InvalidHeaderLength") + + tensors = json.loads(buffer.read(n), object_pairs_hook=OrderedDict) + metadata = tensors.pop("__metadata__", None) + buffer_end = metadata_validate(tensors) + + if buffer_end + 8 + n != buffer_len: + raise ValueError("SafeTensorError::MetadataIncompleteBuffer") + + return stop, tensors, metadata + + +def readinto_numpy(meta, buffer, base_ptr): + def create_empty(info): + return np.empty(shape=info["shape"], dtype=numpy_dtype[info["dtype"]]) + + ret = {} + for k, v in meta.items(): + t = create_empty(v) + buffer.seek(base_ptr + v["data_offsets"][0]) + buffer.readinto(memoryview(t)) + ret[k] = t + return ret + + +class PySafeSlice: + def __init__(self, info, bufferfile, base_ptr, buffermmap): + self.info = info + self.bufferfile = bufferfile + self.buffermmap = buffermmap + self.base_ptr = base_ptr + + self.start = [0 for dim in self.shape] + self.stop = [dim for dim in self.shape] + self.step = [1 for dim in self.shape] + + @property + def ndim(self): + return len(self.shape) + + def __getitem__(self, index): + # https://github.com/numpy/numpy/blob/4d652465cea38e9504f954ac708d91e4954bd13a/numpy/lib/_arrayterator_impl.py#L96-L126 + # Fix index, handling ellipsis and incomplete slices. 
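+        # After the index is normalized below, the requested slice is translated
+        # into contiguous byte ranges within this tensor's region of the file;
+        # adjacent ranges are merged, read into a flat buffer (via mmap when there
+        # are many small ranges, sequential file reads otherwise), then reshaped.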
+ if not isinstance(index, tuple): + index = (index,) + fixed = [] + length, dims = len(index), self.ndim + for slice_ in index: + if slice_ is Ellipsis: + fixed.extend([slice(None)] * (dims - length + 1)) + length = len(fixed) + elif isinstance(slice_, int): + fixed.append(slice(slice_, slice_ + 1, 1)) + else: + fixed.append(slice_) + index = tuple(fixed) + if len(index) < dims: + index += (slice(None),) * (dims - len(index)) + + out_start, out_stop, out_step = copy.deepcopy((self.start, self.stop, self.step)) + for i, (start, stop, step, slice_) in enumerate(zip(self.start, self.stop, self.step, index)): + out_start[i] = slice_.start or 0 + out_step[i] = slice_.step or 1 + out_stop[i] = slice_.stop or stop - start + out_stop[i] = min(stop, out_stop[i]) + + target_shape = [] + for x, y, z in zip(out_start, out_stop, out_step): + assert z == 1, "only support step = 1" + if y - x > 1: + target_shape.append(int(y - x)) + + if len(target_shape) == 0: + if self.shape == [1]: + target_shape = self.shape + + # https://github.com/huggingface/safetensors/blob/b947b59079a6197d7930dfb535818ac4896113e8/safetensors/src/slice.rs#L297-L315 + indices = [] + span = self.bits + for i, (start, stop, step) in enumerate(zip(out_start[::-1], out_stop[::-1], out_step[::-1])): + if len(indices) == 0: + if start == 0 and stop == self.shape[i]: + pass + # We haven't started to slice yet, just increase the span + else: + offset = start * span + small_span = stop * span - offset + indices.append((offset, offset + small_span)) + + else: + capacity = (stop - start) * len(indices) + newindices = [] + for n in range(start, stop): + offset = n * span + for (old_start, old_stop) in indices: + newindices.append((old_start + offset, old_stop + offset)) + indices = newindices + assert len(indices) == capacity, f"error {capacity} {len(indices)}" + span *= self.shape[-(i + 1)] + + if len(indices) == 0: + indices.append((0, self.nbytes)) + + merge_indices = [] + last_end = -1 + last_start = -1 + for start, end in indices: + if start == last_end: + last_end = end + continue + else: + if last_start != -1: + merge_indices.append((last_start, last_end)) + last_start = start + last_end = end + if last_start != -1: + merge_indices.append((last_start, last_end)) + tensor = np.empty(shape=[1] if len(target_shape) == 0 else np.prod(target_shape), dtype=self.dtype) + + tensor_view = memoryview(tensor.view(np.uint8).reshape(-1)) + curr_data_ptr = 0 + # if to many slice and each slice < 1M + if len(merge_indices) > 128 and (merge_indices[0][1] - merge_indices[0][0] < 1024 * 1024): + # Use mmap for random access + for start, end in merge_indices: + data_len = end - start + tensor_view[curr_data_ptr : curr_data_ptr + data_len] = self.buffermmap[ + self.start_offset + start : self.start_offset + end + ] + curr_data_ptr += data_len + else: + # Use file read for sequence access + for start, end in merge_indices: + data_len = end - start + self.bufferfile.seek(self.start_offset + start) + view = tensor_view[curr_data_ptr : curr_data_ptr + data_len] + self.bufferfile.readinto(view) + curr_data_ptr += data_len + + return tensor.reshape(target_shape) + + def get(self, *args, **kwargs): + tensor = np.empty(shape=self.shape, dtype=self.dtype) + self.bufferfile.seek(self.start_offset) + self.bufferfile.readinto(memoryview(tensor)) + return tensor + + @property + def start_offset(self): + return self.base_ptr + self.info["data_offsets"][0] + + def get_shape(self): + return self.shape + + @property + def shape(self): + return self.info["shape"] + + 
@property + def dtype(self): + return numpy_dtype[self.info["dtype"]] + + @property + def nelements(self): + return np.prod(self.info["shape"]) + + @property + def bits(self): + return dtype_size[self.info["dtype"]] + + @property + def nbytes(self): + return self.nelements * dtype_size[self.info["dtype"]] + + +# a simple file writer object +class fast_safe_open: + def __init__(self, filename, framework=None, device="cpu"): + self.filename = filename + self.framework = framework + self.file = open(self.filename, "rb") + self.file_mmap = mmap.mmap(self.file.fileno(), 0, flags=mmap.MAP_PRIVATE) + self.base, self.tensors_decs, self.__metadata__ = read_metadata(self.file) + self.tensors = OrderedDict() + for key, info in self.tensors_decs.items(): + self.tensors[key] = PySafeSlice(info, self.file, self.base, self.file_mmap) + self.tensors[key].key = key + + def __enter__(self): + return self + + def __exit__(self, *args): + self.file_mmap.close() + self.file.close() + + def metadata(self): + return self.__metadata__ + + def keys(self): + return list(self.tensors.keys()) + + def get_tensor(self, name): + return self.tensors[name].get() + + def get_slice(self, name): + return self.tensors[name] + + +def fast_load_file(filename): + result = {} + with fast_safe_open(filename, framework="np") as f: + for k in f.keys(): + result[k] = f.get_tensor(k) + return result diff --git a/tests/trainer/test_unified_checkpoint.py b/tests/trainer/test_unified_checkpoint.py index 9b91905841af..f8cc0ed7bfac 100644 --- a/tests/trainer/test_unified_checkpoint.py +++ b/tests/trainer/test_unified_checkpoint.py @@ -48,6 +48,7 @@ "Flags_skip_mp_c_identity": "1", "FLAGS_shard_norm_align_dp": "0", "FLAGS_shard_use_reduce": "1", + "FLAGS_eager_communication_connection": "1", # no lazy init comm group "test_ci_no_save_model": "1", } @@ -1137,3 +1138,42 @@ def runfrist(self, train_args): def rerun(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) + + +@pytest.mark.skipif(True, reason="Skip for None CE") +class TestUnifiedCheckpointOnN1C8SaveLoadSpeed(TestUnifiedCheckpointFull): + def setUp(self): + super().setUp() + for config_key in self.configs: + self.configs[config_key]["skip_profile_timer"] = 0 + self.configs[config_key]["unified_checkpoint"] = 1 + self.configs[config_key]["save_steps"] = 6 + self.configs[config_key]["unified_checkpoint_config"] = "skip_save_model_weight master_weight_compatible" + + self.need_allclose = False + self.rtol = 1e-7 + + def runfrist(self, train_args): + self.run_n1c8(self.run_pretrain_file, log_dir="log_uc", **train_args) + + def rerun(self, train_args): + self.run_n1c8(self.run_pretrain_file, log_dir="log_uc", **train_args) + + +@pytest.mark.skipif(True, reason="Skip for None CE") +class TestPaddleCheckpointOnN1C8SaveLoadSpeed(TestUnifiedCheckpointFull): + def setUp(self): + super().setUp() + for config_key in self.configs: + self.configs[config_key]["skip_profile_timer"] = 0 + self.configs[config_key]["unified_checkpoint"] = 0 + self.configs[config_key]["save_steps"] = 6 + + self.need_allclose = False + self.rtol = 1e-7 + + def runfrist(self, train_args): + self.run_n1c8(self.run_pretrain_file, log_dir="log_pd", **train_args) + + def rerun(self, train_args): + self.run_n1c8(self.run_pretrain_file, log_dir="log_pd", **train_args) diff --git a/tests/transformers/clap/test_feature_extraction.py b/tests/transformers/clap/test_feature_extraction.py index 413f69276e0d..d78e476d14b5 100644 --- a/tests/transformers/clap/test_feature_extraction.py +++ 
b/tests/transformers/clap/test_feature_extraction.py @@ -68,6 +68,7 @@ def __init__( self.feature_size = feature_size self.chunk_length = chunk_length self.hop_length = hop_length + super().__init__() def prepare_feat_extract_dict(self): return { diff --git a/tests/transformers/ernie_vil/test_image_processing.py b/tests/transformers/ernie_vil/test_image_processing.py index 0f224ec951b7..d95217505902 100644 --- a/tests/transformers/ernie_vil/test_image_processing.py +++ b/tests/transformers/ernie_vil/test_image_processing.py @@ -58,6 +58,7 @@ def __init__( self.image_mean = image_mean self.image_std = image_std self.do_convert_rgb = do_convert_rgb + super().__init__() def prepare_image_processor_dict(self): return { diff --git a/tests/transformers/speecht5/test_feature_extraction.py b/tests/transformers/speecht5/test_feature_extraction.py index 067108b9c948..b2f63b87a972 100644 --- a/tests/transformers/speecht5/test_feature_extraction.py +++ b/tests/transformers/speecht5/test_feature_extraction.py @@ -81,6 +81,7 @@ def __init__( self.fmax = fmax self.mel_floor = mel_floor self.return_attention_mask = return_attention_mask + super().__init__() def prepare_feat_extract_dict(self): return { diff --git a/tests/transformers/test_safetensors.py b/tests/transformers/test_safetensors.py new file mode 100644 index 000000000000..3c143e26a0b5 --- /dev/null +++ b/tests/transformers/test_safetensors.py @@ -0,0 +1,57 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
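+
+# These tests cross-check the pure-numpy fast loader against the reference
+# safetensors implementation: fast_load_file() must match safetensors
+# load_file() exactly, and slices taken through fast_safe_open.get_slice()
+# must match plain numpy indexing on the original arrays.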
+ +import os +import tempfile +import unittest + +import numpy as np + +# from safetensors import safe_open +from safetensors.numpy import load_file, save_file + +from paddlenlp.utils.safetensors import fast_load_file, fast_safe_open + + +class FastSafetensors(unittest.TestCase): + def setUp(self): + super().setUp() + self.weigth_map = {} + tensors = [([10, 10], "float32"), ([8], "float16"), ([5, 5, 5], "int32")] + count = 0 + for shape, dtype in tensors: + self.weigth_map[f"weight_{count}"] = (np.random.random(shape) * 100).astype(dtype) + count += 1 + print(self.weigth_map) + + def test_load_file(self): + with tempfile.TemporaryDirectory() as tmpdirname: + path = os.path.join(tmpdirname, "test.safetensors") + save_file(self.weigth_map, path, metadata={"format": "np"}) + sf_load = load_file(path) + fs_sf_load = fast_load_file(path) + for k, v in self.weigth_map.items(): + np.testing.assert_equal(v, sf_load[k]) + np.testing.assert_equal(v, fs_sf_load[k]) + + def test_safe_open(self): + with tempfile.TemporaryDirectory() as tmpdirname: + path = os.path.join(tmpdirname, "test.safetensors") + save_file(self.weigth_map, path, metadata={"format": "np"}) + + with fast_safe_open(path, framework="np") as f: + for key in f.keys(): + safe_slice = f.get_slice(key) + np.testing.assert_equal(self.weigth_map[key][:2, ...], safe_slice[:2, ...]) + np.testing.assert_equal(self.weigth_map[key][..., :4], safe_slice[..., :4]) From 7a24bccfd15348e818036c56335fccf984fd95d5 Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Tue, 21 May 2024 11:24:11 +0800 Subject: [PATCH 17/27] Cherry pick type promotion fix. (#8463) --- paddlenlp/generation/utils.py | 4 +++- paddlenlp/layers/crf.py | 2 +- paddlenlp/metrics/perplexity.py | 2 +- paddlenlp/prompt/verbalizer.py | 2 +- paddlenlp/transformers/convbert/modeling.py | 4 +++- paddlenlp/transformers/electra/modeling.py | 8 ++++++-- paddlenlp/transformers/funnel/modeling.py | 4 ++-- paddlenlp/transformers/gptj/modeling.py | 2 +- paddlenlp/transformers/mbart/modeling.py | 2 +- paddlenlp/transformers/megatronbert/modeling.py | 2 +- paddlenlp/transformers/prophetnet/modeling.py | 16 ++++++---------- paddlenlp/transformers/rembert/modeling.py | 2 +- 12 files changed, 27 insertions(+), 23 deletions(-) diff --git a/paddlenlp/generation/utils.py b/paddlenlp/generation/utils.py index 625b81d765ff..f5abb5e25604 100644 --- a/paddlenlp/generation/utils.py +++ b/paddlenlp/generation/utils.py @@ -511,7 +511,9 @@ def update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder def update_scores_for_generation(scores, next_scores, length, unfinished_flag): # update scores - unfinished_scores = (scores * length + next_scores) / (length + 1) + unfinished_scores = (scores * paddle.to_tensor(length, dtype=scores.dtype) + next_scores) / ( + paddle.to_tensor(length, dtype=scores.dtype) + 1 + ) scores = paddle.where(unfinished_flag, unfinished_scores, scores) return scores diff --git a/paddlenlp/layers/crf.py b/paddlenlp/layers/crf.py index aaaec528ca5f..fb562653426f 100644 --- a/paddlenlp/layers/crf.py +++ b/paddlenlp/layers/crf.py @@ -165,7 +165,7 @@ def _point_score(self, inputs, labels, lengths): flattened_inputs = inputs.reshape([-1]) offsets = paddle.unsqueeze(self._get_batch_index(batch_size) * seq_len * n_labels, 1) offsets += paddle.unsqueeze(self._get_seq_index(seq_len) * n_labels, 0) - flattened_tag_indices = paddle.reshape(offsets + labels, [-1]) + flattened_tag_indices = paddle.reshape(offsets + labels.astype(offsets.dtype), [-1]) scores = 
paddle.gather(flattened_inputs, flattened_tag_indices).reshape([batch_size, seq_len]) diff --git a/paddlenlp/metrics/perplexity.py b/paddlenlp/metrics/perplexity.py index 905518f36db9..a785d3780561 100644 --- a/paddlenlp/metrics/perplexity.py +++ b/paddlenlp/metrics/perplexity.py @@ -92,7 +92,7 @@ def compute(self, pred, label, seq_mask=None): ce = F.cross_entropy(input=pred, label=label, reduction="none", soft_label=False) ce = paddle.squeeze(ce, axis=[2]) if seq_mask is not None: - ce = ce * seq_mask + ce = ce * seq_mask.astype(ce.dtype) word_num = paddle.sum(seq_mask) return ce, word_num return ce diff --git a/paddlenlp/prompt/verbalizer.py b/paddlenlp/prompt/verbalizer.py index 637a37001559..174a863808b6 100644 --- a/paddlenlp/prompt/verbalizer.py +++ b/paddlenlp/prompt/verbalizer.py @@ -162,7 +162,7 @@ def aggregate(self, outputs: Tensor, mask: Tensor, atype: str): Aggregate multiple tokens/words for each word/label. """ if atype == "mean": - outputs = outputs * mask + outputs = outputs * mask.astype(outputs.dtype) outputs = outputs.sum(axis=-1) / (mask.sum(axis=-1) + 1e-15) elif atype == "max": outputs = (outputs - 1e4 * (1 - mask)).max(axis=-1) diff --git a/paddlenlp/transformers/convbert/modeling.py b/paddlenlp/transformers/convbert/modeling.py index d5ec8e843c2a..c9884e5a7383 100644 --- a/paddlenlp/transformers/convbert/modeling.py +++ b/paddlenlp/transformers/convbert/modeling.py @@ -1137,7 +1137,9 @@ def update_inputs(self, sequence, updates, positions): N = positions.shape[1] assert N == L, "the dimension of inputs and mask should be same as [batch_size, sequence_length]" - updated_sequence = ((paddle.ones_like(sequence) - positions) * sequence) + (positions * updates) + updated_sequence = ((paddle.ones_like(sequence) - positions) * sequence) + ( + positions * updates.astype(positions.dtype) + ) return updated_sequence diff --git a/paddlenlp/transformers/electra/modeling.py b/paddlenlp/transformers/electra/modeling.py index b3b0b67c1a3d..03412fd7e39e 100644 --- a/paddlenlp/transformers/electra/modeling.py +++ b/paddlenlp/transformers/electra/modeling.py @@ -1051,7 +1051,9 @@ def get_discriminator_inputs(self, inputs, raw_inputs, generator_logits, generat mask_positions = paddle.where(generator_labels == -100, umask_positions, mask_positions) updated_inputs = self.update_inputs(inputs, sampled_tokids, mask_positions) # use inputs and updated_input to get discriminator labels - labels = mask_positions * (paddle.ones_like(inputs) - paddle.equal(updated_inputs, raw_inputs).astype("int64")) + labels = mask_positions * ( + paddle.ones_like(inputs) - paddle.equal(updated_inputs, raw_inputs).astype(raw_inputs.dtype) + ) return updated_inputs, labels, sampled_tokids def sample_from_softmax(self, logits, use_softmax_sample=True): @@ -1073,7 +1075,9 @@ def update_inputs(self, sequence, updates, positions): N = positions.shape[1] assert N == L, "the dimension of inputs and mask should be same as [B, L]" - updated_sequence = ((paddle.ones_like(sequence) - positions) * sequence) + (positions * updates) + updated_sequence = ((paddle.ones_like(sequence) - positions) * sequence) + ( + positions * updates.astype(positions.dtype) + ) return updated_sequence diff --git a/paddlenlp/transformers/funnel/modeling.py b/paddlenlp/transformers/funnel/modeling.py index 5952363a44b1..7dc097ef68e0 100644 --- a/paddlenlp/transformers/funnel/modeling.py +++ b/paddlenlp/transformers/funnel/modeling.py @@ -519,7 +519,7 @@ def relative_positional_attention(self, position_embeds, q_head, context_len, cl 
positional_attn = _relative_shift_gather(positional_attn, context_len, shift) if cls_mask is not None: - positional_attn *= cls_mask + positional_attn *= cls_mask.astype(positional_attn.dtype) return positional_attn def relative_token_type_attention(self, token_type_mat, q_head, cls_mask=None): @@ -547,7 +547,7 @@ def relative_token_type_attention(self, token_type_mat, q_head, cls_mask=None): ) if cls_mask is not None: - token_type_attn *= cls_mask + token_type_attn *= cls_mask.astype(token_type_attn.dtype) return token_type_attn def forward(self, query, key, value, attention_inputs, output_attentions=False): diff --git a/paddlenlp/transformers/gptj/modeling.py b/paddlenlp/transformers/gptj/modeling.py index 86207866a5dd..df8ea5e7f1e2 100644 --- a/paddlenlp/transformers/gptj/modeling.py +++ b/paddlenlp/transformers/gptj/modeling.py @@ -158,7 +158,7 @@ def _attn( if attention_mask is not None: # Apply the attention mask - attn_weights = attn_weights + attention_mask + attn_weights = attn_weights + attention_mask.astype(attn_weights.dtype) attn_weights = paddle.nn.functional.softmax(attn_weights, axis=-1) attn_weights = attn_weights.astype(value.dtype) diff --git a/paddlenlp/transformers/mbart/modeling.py b/paddlenlp/transformers/mbart/modeling.py index d401554fde3d..28c4d577ebd7 100644 --- a/paddlenlp/transformers/mbart/modeling.py +++ b/paddlenlp/transformers/mbart/modeling.py @@ -63,7 +63,7 @@ def shift_tokens_right(input_ids, pad_token_id): batch_size, seq_length = paddle.shape(shifted_input_ids) index = paddle.arange(0, batch_size, 1, dtype="int32") * seq_length index_of_eos = paddle.cast(shifted_input_ids != pad_token_id, dtype="int32").sum(axis=-1) - 1 - decoder_start_tokens = paddle.gather(input_flat, index + index_of_eos) + decoder_start_tokens = paddle.gather(input_flat, index + index_of_eos.astype(index.dtype)) shifted_input_ids[:, 1:] = shifted_input_ids[:, :-1].clone() shifted_input_ids[:, 0] = decoder_start_tokens return shifted_input_ids diff --git a/paddlenlp/transformers/megatronbert/modeling.py b/paddlenlp/transformers/megatronbert/modeling.py index 6536080cd982..85f002d84b2a 100644 --- a/paddlenlp/transformers/megatronbert/modeling.py +++ b/paddlenlp/transformers/megatronbert/modeling.py @@ -171,7 +171,7 @@ def forward(self, hidden_states, attention_mask=None): attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in MegatronBertModel forward() function) - attention_scores = attention_scores + attention_mask + attention_scores = attention_scores + attention_mask.astype(attention_scores.dtype) # Normalize the attention scores to probabilities. 
attention_probs = nn.functional.softmax(attention_scores, axis=-1) diff --git a/paddlenlp/transformers/prophetnet/modeling.py b/paddlenlp/transformers/prophetnet/modeling.py index 9c251078f8c4..0baf3a7b36c5 100644 --- a/paddlenlp/transformers/prophetnet/modeling.py +++ b/paddlenlp/transformers/prophetnet/modeling.py @@ -71,12 +71,9 @@ def compute_relative_buckets(num_buckets, max_distance, relative_positions, is_b ) inv_relative_positions = paddle.abs(inv_relative_positions) else: - inv_relative_positions = ( - paddle.cast( - paddle.less_than(paddle.zeros_like(inv_relative_positions), inv_relative_positions), dtype=paddle.int32 - ) - * inv_relative_positions - ) + inv_relative_positions = paddle.cast( + paddle.less_than(paddle.zeros_like(inv_relative_positions), inv_relative_positions), dtype=paddle.int32 + ) * inv_relative_positions.astype(paddle.int32) max_exact = num_buckets // 2 is_small = paddle.less_than(inv_relative_positions, paddle.to_tensor(max_exact).cast(dtype=paddle.int32)) @@ -85,10 +82,9 @@ def compute_relative_buckets(num_buckets, max_distance, relative_positions, is_b ) / math.log(max_distance / max_exact) * (num_buckets - max_exact) val_if_large_num_buckets = paddle.ones_like(val_if_large) * (num_buckets - 1) val_if_large_lt = paddle.cast(paddle.less_than(val_if_large, val_if_large_num_buckets), dtype=paddle.int32) - val_if_large = ( - paddle.cast(val_if_large_lt * val_if_large, dtype=paddle.int32) - + (1 - val_if_large_lt) * val_if_large_num_buckets - ) + val_if_large = val_if_large_lt * val_if_large.astype(val_if_large_lt.dtype) + ( + 1 - val_if_large_lt + ) * val_if_large_num_buckets.astype(val_if_large_lt.dtype) rel_positions_bucket = rel_positions_bucket + paddle.where( is_small, paddle.cast(inv_relative_positions, dtype=paddle.int32), val_if_large ) diff --git a/paddlenlp/transformers/rembert/modeling.py b/paddlenlp/transformers/rembert/modeling.py index 7fa30229e316..c4697253e7ff 100644 --- a/paddlenlp/transformers/rembert/modeling.py +++ b/paddlenlp/transformers/rembert/modeling.py @@ -150,7 +150,7 @@ def forward(self, hidden_states, attention_mask=None): attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in RemBertModel forward() function) - attention_scores = attention_scores + attention_mask + attention_scores = attention_scores + attention_mask.astype(attention_scores.dtype) # Normalize the attention scores to probabilities. attention_probs = F.softmax(attention_scores, axis=-1) From 8879f79f9857dc7831403064631ae32b0a0def23 Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Thu, 23 May 2024 21:09:14 +0800 Subject: [PATCH 18/27] quick fix from pretrained. (#8487) --- paddlenlp/transformers/model_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddlenlp/transformers/model_utils.py b/paddlenlp/transformers/model_utils.py index dc1c753206c4..9c9af9bbc694 100644 --- a/paddlenlp/transformers/model_utils.py +++ b/paddlenlp/transformers/model_utils.py @@ -798,7 +798,7 @@ def _load_state_dict_into_meta_model( dtype = convert_np_dtype_to_dtype_(dtype) error_msgs = [] - + model_state_dict = model.state_dict() for param_name, param in state_dict.items(): # First part of the test is always true as loaded_state_dict_keys always contains state_dict keys. 
        if param_name not in loaded_state_dict_keys or param_name not in expected_keys:
@@ -833,7 +833,7 @@
             if old_param is not None:
                 param = param.astype(dtype=old_param.dtype)
             with paddle.no_grad():
-                model.state_dict()[param_name].get_tensor()._share_data_with(param.value().get_tensor())
+                model_state_dict[param_name].get_tensor()._share_data_with(param.value().get_tensor())
                 param.value().get_tensor()._clear()
     return error_msgs
 
@@ -1890,7 +1890,7 @@ def _find_mismatched_keys(
                 if (
                     shard_file.endswith(".safetensors")
                     and config.tensor_parallel_degree > 1
-                    and "tp" not in shard_file
+                    and "tp" not in os.path.split(shard_file)[-1]
                 ):
                     pre_tensor_parallel_split = True
                     assert loaded_keys is not None, "loaded_keys is not None."

From bbf945b64ab611e491c429ef86887cf84f43d3a5 Mon Sep 17 00:00:00 2001
From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com>
Date: Fri, 24 May 2024 13:43:46 +0800
Subject: [PATCH 19/27] Release/2.8 (#8437)

* [XPU] llama add xpu support (#8282)
* [XPU] llama add xpu support
* fix
* use try import
* fix
* refine
* refine
* refine
* refine
* update (#8399)
* [LLM] Support fuse attention q, k, v weights (#8202)

1. add use-interface & fuse action
1.1. modify 1., code order
2. switch to name_mapping
3. solve tp branch
3.2 follow hui, handle qkv separately
3.3 handle pdparams
3.4 from torch
3.5 abandon low_cpu_mem_usage
3.6 solve shard branch

* 3.6.1 solve shard branch after rebase develop
* code clean
* remove debug comment
* Redefine fuse and split functions
* Redefine fuse and split functions
* comment and fix
* update method
* update QKV fuse and split
* support fuse weights in multi-files
* add precision compare
* simplify function call
* support use_fast_ffn
* clean modeling and configuration
* add test for gpt and opt
* fix tp_actions get
* add fast_ffn test
* add Qwen2Moe
* Revert "add Qwen2Moe"

This reverts commit 113b8838a7c53f1d131928c30bf1071dfa583445.
* add test for split * update doc * update filter_dict_keys --------- Co-authored-by: Zii * [LLM] Fix fuse or split with same key (#8378) * fix fuse or split with same key * fix * fix eps * update format * [LLM] add decay steps option for finetuning (#8251) * [LLM] add memory stats to logger of trainer (#8269) * [Distributed] fix lora (#8325) * [LLM] fix lora target modules on llama (#8372) * [Distributed] metric calculation supports tp logits (#8370) * Update model_utils.py * Update model_utils.py * Update model_utils.py --------- Co-authored-by: Jianbang Yang Co-authored-by: DrownFish19 Co-authored-by: Zii Co-authored-by: Tian <121000916+SylarTiaNII@users.noreply.github.com> --- llm/finetune_generation.py | 6 +- llm/run_pretrain.py | 11 + llm/utils.py | 9 + paddlenlp/peft/lora/lora_layers.py | 2 +- paddlenlp/trainer/trainer.py | 18 +- paddlenlp/trainer/training_args.py | 4 + paddlenlp/transformers/conversion_utils.py | 247 ++++++++++++++++++ paddlenlp/transformers/gpt/modeling.py | 43 ++++ paddlenlp/transformers/gpt/modeling_pp.py | 1 + paddlenlp/transformers/linear_utils.py | 59 +++++ paddlenlp/transformers/llama/modeling.py | 155 ++++++++--- paddlenlp/transformers/llama/modeling_pp.py | 1 + paddlenlp/transformers/model_utils.py | 71 ++++- paddlenlp/transformers/opt/configuration.py | 5 + paddlenlp/transformers/opt/modeling.py | 43 ++++ tests/transformers/test_conversion_common.py | 258 +++++++++++++++++++ 16 files changed, 886 insertions(+), 47 deletions(-) create mode 100644 paddlenlp/transformers/linear_utils.py create mode 100644 tests/transformers/test_conversion_common.py diff --git a/llm/finetune_generation.py b/llm/finetune_generation.py index df7a22a0cb95..c8fed17165af 100644 --- a/llm/finetune_generation.py +++ b/llm/finetune_generation.py @@ -140,7 +140,7 @@ def main(): if not training_args.autotuner_benchmark: model = AutoModelForCausalLMPipe.from_pretrained( model_args.model_name_or_path, - tensor_parallel_output=False, + tensor_parallel_output=training_args.tensor_parallel_output, tensor_parallel_degree=training_args.tensor_parallel_degree, tensor_parallel_rank=training_args.tensor_parallel_rank, use_flash_attention=model_args.use_flash_attention, @@ -152,7 +152,7 @@ def main(): # NOTE(gongenlei): new add autotuner_benchmark model_config = AutoConfig.from_pretrained( model_args.model_name_or_path, - tensor_parallel_output=False, + tensor_parallel_output=training_args.tensor_parallel_output, tensor_parallel_degree=training_args.tensor_parallel_degree, tensor_parallel_rank=training_args.tensor_parallel_rank, dtype=dtype, @@ -163,7 +163,7 @@ def main(): else: model_config = AutoConfig.from_pretrained( model_args.model_name_or_path, - tensor_parallel_output=False, + tensor_parallel_output=training_args.tensor_parallel_output, tensor_parallel_degree=training_args.tensor_parallel_degree, tensor_parallel_rank=training_args.tensor_parallel_rank, dtype=dtype, diff --git a/llm/run_pretrain.py b/llm/run_pretrain.py index d0df32321e18..7196f52eea6d 100644 --- a/llm/run_pretrain.py +++ b/llm/run_pretrain.py @@ -46,6 +46,7 @@ ) from paddlenlp.utils.batch_sampler import DistributedBatchSampler from paddlenlp.utils.log import logger +from paddlenlp.utils.tools import get_env_device def add_start_docstrings(*docstr): @@ -483,6 +484,16 @@ def main(): config.num_attention_heads % config.sep_parallel_degree == 0 ), f"num_attention_heads:{config.num_attention_heads} must be divisible by sep_parallel_degree {config.sep_parallel_degree}" + if get_env_device() == "xpu" and 
training_args.gradient_accumulation_steps > 1: + try: + from paddle_xpu.layers.nn.linear import LinearConfig # noqa: F401 + + LinearConfig.enable_accumulate_steps_opt() + LinearConfig.set_accumulate_steps(training_args.gradient_accumulation_steps) + except ImportError: + # It's OK, not use accumulate_steps optimization + pass + print("Final pre-training config:", config) # Set the dtype for loading model diff --git a/llm/utils.py b/llm/utils.py index 8bcc52ae33ab..6688357bd67b 100644 --- a/llm/utils.py +++ b/llm/utils.py @@ -125,9 +125,11 @@ def get_lora_target_modules(model): ".*v_proj.*", ".*k_proj.*", ".*o_proj.*", + ".*qkv_proj.*", ".*gate_proj.*", ".*down_proj.*", ".*up_proj.*", + ".*gate_up_fused_proj.*", ] elif model.base_model_prefix == "opt": target_modules = [ @@ -209,6 +211,13 @@ def prediction_step( # keepdim in order to maintain the same shape as logits if isinstance(logits, (list, tuple)): logits = logits[0] + # all gather logits when enabling tensor_parallel_output + if self.args.tensor_parallel_degree > 1 and self.args.tensor_parallel_output: + hcg = fleet.get_hybrid_communicate_group() + model_parallel_group = hcg.get_model_parallel_group() + gathered_logits = [] + dist.all_gather(gathered_logits, logits, group=model_parallel_group) + logits = paddle.concat(gathered_logits, axis=-1) return (loss, logits.argmax(axis=-1, keepdim=True), labels) loss = None diff --git a/paddlenlp/peft/lora/lora_layers.py b/paddlenlp/peft/lora/lora_layers.py index 7ac40ed0ba66..73120060fe87 100644 --- a/paddlenlp/peft/lora/lora_layers.py +++ b/paddlenlp/peft/lora/lora_layers.py @@ -539,7 +539,7 @@ def forward(self, input: paddle.Tensor): result_mp = F.linear(x=input_mp, weight=self.weight, bias=self.bias, name=self.name) else: res_mp = MC2ColumnParallelCoreLinear.apply(input, self.weight, self.model_parallel_group) - result_mp = res_mp + self.bias + result_mp = (res_mp + self.bias) if self.bias is not None else res_mp if not self.merged: input_a = self.lora_dropout(input) @ self.lora_A diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py index 746b7e252516..f507b5c8b92f 100644 --- a/paddlenlp/trainer/trainer.py +++ b/paddlenlp/trainer/trainer.py @@ -39,6 +39,8 @@ import paddle.distributed as dist import paddle.nn as nn from packaging import version +from paddle import framework +from paddle.base import core from paddle.distributed import fleet from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.hybrid_parallel_optimizer import ( HybridParallelOptimizer, @@ -1257,6 +1259,20 @@ def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_for_eval, logs["learning_rate"] = float("{0:.3e}".format(self._get_learning_rate())) logs["global_step"] = int(self.state.global_step) + divisor = 2**30 + # TODO(@gexiao): replace these codes with unified APIs in Paddle + current_device = framework._current_expected_place_() + if str(current_device) != "Place(cpu)": + device_id = current_device.get_device_id() + current_memory_allocated = core.device_memory_stat_current_value("Allocated", device_id) + current_memory_reserved = core.device_memory_stat_current_value("Reserved", device_id) + max_memory_allocated = core.device_memory_stat_peak_value("Allocated", device_id) + max_memory_reserved = core.device_memory_stat_peak_value("Reserved", device_id) + logs["current_memory_allocated"] = current_memory_allocated / divisor + logs["current_memory_reserved"] = current_memory_reserved / divisor + logs["max_memory_allocated"] = max_memory_allocated / divisor + 
logs["max_memory_reserved"] = max_memory_reserved / divisor + total_train_batch_size = ( self.args.train_batch_size * self.args.gradient_accumulation_steps * self.args.dataset_world_size ) @@ -1614,8 +1630,6 @@ def _load_rng_state(self, checkpoint): random.setstate(checkpoint_rng_state["python"]) np.random.set_state(checkpoint_rng_state["numpy"]) - core = paddle.framework.core - core.default_cpu_generator().set_state(checkpoint_rng_state["cpu"]) if core.is_compiled_with_cuda(): if not len(checkpoint_rng_state["cuda"]) == core.get_cuda_device_count(): diff --git a/paddlenlp/trainer/training_args.py b/paddlenlp/trainer/training_args.py index 2ed9d343ceaa..3118178608d2 100644 --- a/paddlenlp/trainer/training_args.py +++ b/paddlenlp/trainer/training_args.py @@ -787,6 +787,10 @@ class TrainingArguments: default=False, metadata={"help": "whether to run distributed training in auto parallel mode"}, ) + tensor_parallel_output: Optional[bool] = field( + default=False, + metadata={"help": "whether to output logits in distributed status"}, + ) def __post_init__(self): env_local_rank = int(os.environ.get("PADDLE_RANK_IN_NODE", -1)) diff --git a/paddlenlp/transformers/conversion_utils.py b/paddlenlp/transformers/conversion_utils.py index ba5169454d0b..6ea6afaad80a 100644 --- a/paddlenlp/transformers/conversion_utils.py +++ b/paddlenlp/transformers/conversion_utils.py @@ -499,6 +499,118 @@ def splited_qkv_to_tensor_parallel_qkv(weight_list, num_attention_heads): return naive_merged_qkv_to_tensor_parallel_qkv(weight) +def fuse_param_func(): + def fn(fuse_params, is_qkv=False, num_heads=None, num_key_value_heads=None): + """fuse function for fusing weights + + (1) fuse_attention_qkv + q => [q1,q2,q3,q4] + k => [k1,k2,k3,k4] or [k1,k2] for GQA + v => [v1,v2,v3,v4] or [v1,v2] for GQA + fused weight => [q1,k1,v1,q2,k2,v2,q3,k3,v3,q4,k4,v4] + or for GQA [q1,q2,k1,v1,q3,q4,k2,v2] + (2) fuse_attention_ffn + directly fuse weights to 1 parts + [gate_weight], [up_weight] => [gate_weight, up_weight] + + Args: + fuse_params (_type_): to be fused weights + is_qkv (bool, optional): for attention qkv weights. Defaults to False. + num_heads (_type_, optional): query heads. Defaults to None. + num_key_value_heads (_type_, optional): key and value heads. Defaults to None. + + Returns: + _type_: fused weights + """ + concat_fn = np.concatenate + split_fn = np.split + if isinstance(fuse_params[0], paddle.Tensor): + concat_fn = paddle.concat + split_fn = paddle.split + + if is_qkv: + # fuse_attention_qkv + assert num_heads, f"num_heads should be number of heads for Q, but got {num_heads}" + assert ( + num_key_value_heads + ), f"num_key_value_heads should be number of key_value_heads for K and V, but got {num_key_value_heads}" + assert ( + len(fuse_params) == 3 + ), f"fuse_params length is not equal 3, it should be Q K V list. 
but got length {len(fuse_params)}" + num_query_groups = num_heads // num_key_value_heads + q_list = split_fn(fuse_params[0], num_heads, axis=-1) + k_list = split_fn(fuse_params[1], num_key_value_heads, axis=-1) + v_list = split_fn(fuse_params[2], num_key_value_heads, axis=-1) + + qkv_pairs = [] + for i in range(num_key_value_heads): + qkv_pairs += q_list[i * num_query_groups : (i + 1) * num_query_groups] + qkv_pairs.append(k_list[i]) + qkv_pairs.append(v_list[i]) + return concat_fn(qkv_pairs, axis=-1) + else: + # fuse_attention_ffn + return concat_fn(fuse_params, axis=-1) + + return fn + + +def split_param_func(): + def fn(fused_param, split_nums=2, is_qkv=False, num_heads=None, num_key_value_heads=None): + """split function for splitting weights + + (1) fuse_attention_qkv + fused weight => [q1,k1,v1,q2,k2,v2,q3,k3,v3,q4,k4,v4] + or for GQA [q1,q2,k1,v1,q3,q4,k2,v2] + after split + q => [q1,q2,q3,q4] + k => [k1,k2,k3,k4] or [k1,k2] for GQA + v => [v1,v2,v3,v4] or [v1,v2] for GQA + (2) fuse_attention_ffn + directly split weight to 2 parts + [gate_weight, up_weight] => [gate_weight], [up_weight] + + Args: + fused_param (_type_): len(fused_param)=1, only one weight to be splitted + split_nums (int, optional): split_nums. Defaults to 2. + is_qkv (bool, optional): for attention qkv weights. Defaults to False. + num_heads (_type_, optional): query heads. Defaults to None. + num_key_value_heads (_type_, optional): key and value heads. Defaults to None. + + Returns: + _type_: splitted weights + """ + concat_fn = np.concatenate + split_fn = np.split + if isinstance(fused_param, paddle.Tensor): + concat_fn = paddle.concat + split_fn = paddle.split + + if is_qkv: + # fuse_attention_qkv + assert num_heads, f"num_heads should be number of heads for Q, but got {num_heads}" + assert ( + num_key_value_heads + ), f"num_key_value_heads should be number of key_value_heads for K and V, but got {num_key_value_heads}" + num_query_groups = num_heads // num_key_value_heads + q_list, k_list, v_list = [], [], [] + split_heads = split_fn(fused_param, num_heads + 2 * num_key_value_heads, axis=-1) + for i in range(num_key_value_heads): + q_list += split_heads[i * (num_query_groups + 2) : (i + 1) * (num_query_groups + 2) - 2] + k_list.append(split_heads[(i + 1) * (num_query_groups + 2) - 2]) + v_list.append(split_heads[(i + 1) * (num_query_groups + 2) - 1]) + return concat_fn(q_list, axis=-1), concat_fn(k_list, axis=-1), concat_fn(v_list, axis=-1) + else: + # fuse_attention_ffn + return split_fn(fused_param, split_nums, axis=-1) + + return fn + + +def split_or_fuse_func(is_fuse=True): + return fuse_param_func() if is_fuse else split_param_func() + + def get_tensor_parallel_merge_func(tensor_parallel_degree, tensor_parallel_rank, num_attention_heads=None): def fn( x, @@ -1110,6 +1222,7 @@ def convert_tensor_parallel( weight_file (str | None): the weight file path of `model_state.pdparams` file config (PretrainedConfig): the PretrainedConfig instance of model """ + name_action_mappings = cls._get_tensor_parallel_mappings(config) if state_dict is None: with device_guard("cpu"): @@ -1211,6 +1324,140 @@ def _resolve_prefix_keys(state_keys_base, state_keys_real, ignore_error=False): return state_keys_map + @classmethod + def convert_fuse_and_split(cls, config: PretrainedConfig, state_dict, tp_actions=None): + loaded_keys = state_dict.keys() + # collect and convert fuse/split action + fused_and_split_keys = [] + convert_with_same_keys = [] + fuse_actions, resume_keys = cls.get_fuse_or_split_param_convert_actions(config, 
loaded_keys, is_fuse=True) + for keys, action in fuse_actions.items(): + if keys[-1] in keys[:-1]: + assert len(keys) == 2, "only 2 keys can be converted with the same name" + convert_with_same_keys.append(keys[-1]) + origin_states = [state_dict.pop(key) for key in keys[:-1]] + state_dict[keys[-1]] = action(origin_states) + fused_and_split_keys.append(keys[-1]) + logger.debug(f"Fusing parameter: {keys[:-1]} into {keys[-1]}") + + split_actions, _ = cls.get_fuse_or_split_param_convert_actions(config, loaded_keys, is_fuse=False) + for keys, action in split_actions.items(): + if keys[-1] in keys[:-1]: + assert len(keys) == 2, "only 2 keys can be converted with the same name" + convert_with_same_keys.append(keys[-1]) + origin_state = state_dict.pop(keys[-1]) + split_states = action(origin_state) + for key_idx, key in enumerate(keys[:-1]): + state_dict[key] = split_states[key_idx] + fused_and_split_keys.append(key) + logger.debug(f"Splitting parameter: {keys[-1]} into {keys[:-1]}") + + if tp_actions is not None: + for key in fused_and_split_keys: + if key in convert_with_same_keys: + continue + + for name in tp_actions.keys(): + if key.endswith(name): + with device_guard(): + state_dict[key] = paddle.Tensor(tp_actions[name](state_dict.pop(key)), zero_copy=True) + break + + # when shard file split the weight as follows, some weights need to be resumed for next shard file + # shard-001-file: q_weight, k_weight + # shard_002-file: v_weight + resume_state_dict = {k: state_dict[k] for k in resume_keys if k in state_dict} + return state_dict, resume_state_dict + + @classmethod + def get_fuse_or_split_param_convert_actions( + cls, + config: PretrainedConfig, + loaded_state_dict_keys, + is_fuse=True, + ignore_error=False, + ): + name_action_mappings = cls._get_fuse_or_split_param_mappings(config, is_fuse) + state_keys_map = cls._resolve_prefix_keys_for_fuse_and_split( + name_action_mappings.keys(), loaded_state_dict_keys, ignore_error, is_fuse + ) + for k, v in state_keys_map.items(): + name_action_mappings[v] = name_action_mappings.pop(k) + + # filter name_action_mappings with corresponding weights + # fusing: verify all of the keys in name_action_mappings are in loaded_state_dict_keys + # splitting: verify the last key in name_action_mappings is in loaded_state_dict_keys + filter_name_action = {} + resume_keys = [] + if is_fuse: + for k, v in name_action_mappings.items(): + cond = True + if not all(item in loaded_state_dict_keys for item in k[:-1]): + # resume keys for next fuse + resume_keys += k[:-1] + cond = False + if cond: + filter_name_action[k] = v + else: + for k, v in name_action_mappings.items(): + if k[-1] in loaded_state_dict_keys: + filter_name_action[k] = v + + return filter_name_action, resume_keys + + @classmethod + def _get_fuse_or_split_param_mappings(cls, config: PretrainedConfig, is_fuse=True) -> List[StateDictNameMapping]: + """get fused parameter mapping of PretrainedModel + + Args: + config (PretrainedConfig): the configuration of name-mapping + + Raises: + NotImplementedError: + + Returns: + List[StateDictNameMapping]: the name-mappings for tensor_parallel + """ + # raise NotImplementedError( + # f"`_get_fuse_or_split_param_mappings` is not implemented for {cls.__name__}`. 
To implement it, you should " + # f"overwrite this method in the class {cls.__name__} in `{cls.__module__}.py`" + # ) + return {} + + @staticmethod + def _resolve_prefix_keys_for_fuse_and_split(state_keys_base, state_keys_real, ignore_error=False, is_fuse=True): + state_keys_map = {} + + # use the tuple (x1,x2,x3,x4) as one key, and the prefix of x1,x2,x3 is used as a new key x4 or + # the last key x4 is used as new keys x1,x2,x3. And, the tuple also could be (a) (x1, x1) -> convert x1 to x1; + # (b) (x1,x2,x3) -> fuse x1 and x2 to x3; (c) (x1,x2,x3,x4) -> fuse x1, x2 and x3 to x4. + + # is_fuse: True -> fuse, False -> split + # True: (x1,x2,x3,x4) -> [x1,x2,x3] are exist in state_keys_real, x4 is not exist in state_keys_real + # False: (x1,x2,x3,x4) -> [x1,x2,x3] are not exist in state_keys_real, x4 is exist in state_keys_real + + for keys in state_keys_base: + prefix = "" + if is_fuse: + for x in state_keys_real: + for base_key in keys[:-1]: + if x.endswith(base_key): + prefix = x.replace(base_key, "") + break + if prefix != "": + break + else: + base_key = keys[-1] + for x in state_keys_real: + if x.endswith(base_key): + prefix = x.replace(base_key, "") + break + + new_keys = tuple([prefix + key for key in keys]) + state_keys_map[keys] = new_keys + + return state_keys_map + class Converter(ConversionMixin, LogitComparer): """some converters are implemented in ppdiffusers, so if remove it directly, it will make ppdiffusers down. diff --git a/paddlenlp/transformers/gpt/modeling.py b/paddlenlp/transformers/gpt/modeling.py index 8c066431979f..4a018347f80c 100644 --- a/paddlenlp/transformers/gpt/modeling.py +++ b/paddlenlp/transformers/gpt/modeling.py @@ -844,6 +844,49 @@ def get_tensor_parallel_split_mappings(num_layers): return mappings + @classmethod + def _get_fuse_or_split_param_mappings(cls, config: GPTConfig, is_fuse=False): + # return parameter fuse utils + from paddlenlp.transformers.conversion_utils import split_or_fuse_func + + fn = split_or_fuse_func(is_fuse=is_fuse) + + # last key is fused key, other keys are to be fused. 
+ fuse_qkv_keys = ( + "decoder.layers.0.self_attn.q_proj.weight", + "decoder.layers.0.self_attn.k_proj.weight", + "decoder.layers.0.self_attn.v_proj.weight", + "decoder.layers.0.self_attn.qkv_proj.weight", + ) + fuse_qkv_bias_keys = ( + "decoder.layers.0.self_attn.q_proj.bias", + "decoder.layers.0.self_attn.k_proj.bias", + "decoder.layers.0.self_attn.v_proj.bias", + "decoder.layers.0.self_attn.qkv_proj.bias", + ) + num_heads = config.num_attention_heads + num_key_value_heads = getattr(config, "num_key_value_heads", num_heads) + fuse_attention_qkv = getattr(config, "fuse_attention_qkv", False) + + final_actions = {} + if is_fuse: + if fuse_attention_qkv: + for i in range(config.num_hidden_layers): + for keys in [fuse_qkv_keys, fuse_qkv_bias_keys]: + new_keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in keys]) + final_actions[new_keys] = partial( + fn, is_qkv=True, num_heads=num_heads, num_key_value_heads=num_key_value_heads + ) + else: + if not fuse_attention_qkv: + for i in range(config.num_hidden_layers): + for keys in [fuse_qkv_keys, fuse_qkv_bias_keys]: + new_keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in keys]) + final_actions[new_keys] = partial( + fn, split_nums=3, is_qkv=True, num_heads=num_heads, num_key_value_heads=num_key_value_heads + ) + return final_actions + @classmethod def _get_name_mappings(cls, config: GPTConfig) -> list[StateDictNameMapping]: mappings: list[StateDictNameMapping] = [] diff --git a/paddlenlp/transformers/gpt/modeling_pp.py b/paddlenlp/transformers/gpt/modeling_pp.py index cd3dce018378..8b350e6556df 100644 --- a/paddlenlp/transformers/gpt/modeling_pp.py +++ b/paddlenlp/transformers/gpt/modeling_pp.py @@ -161,6 +161,7 @@ class GPTForCausalLMPipe(PipelinePretrainedModel, PipelineLayer): config_class = GPTConfig _get_tensor_parallel_mappings = GPTPretrainedModel._get_tensor_parallel_mappings + _get_fuse_or_split_param_mappings = GPTPretrainedModel._get_fuse_or_split_param_mappings _init_weights = GPTPretrainedModel._init_weights pretrained_init_configuration = GPTPretrainedModel.pretrained_init_configuration diff --git a/paddlenlp/transformers/linear_utils.py b/paddlenlp/transformers/linear_utils.py new file mode 100644 index 000000000000..de1a0f886b79 --- /dev/null +++ b/paddlenlp/transformers/linear_utils.py @@ -0,0 +1,59 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +This file is used for replacing Paddle's native Linear implementations with vendors' customized implementations +""" + +import paddle.distributed.fleet.meta_parallel as mpu +from paddle import nn +from paddle.distributed.fleet.utils import sequence_parallel_utils + +from paddlenlp.transformers.mc2_parallel_linear import ( + MC2ColumnSeqParallelLinear, + MC2RowSeqParallelLinear, +) +from paddlenlp.utils.tools import get_env_device + +Linear = nn.Linear +ColumnParallelLinear = mpu.ColumnParallelLinear +RowParallelLinear = mpu.RowParallelLinear +ColumnSequenceParallelLinear = sequence_parallel_utils.ColumnSequenceParallelLinear +RowSequenceParallelLinear = sequence_parallel_utils.RowSequenceParallelLinear + +if get_env_device() == "npu": + if MC2ColumnSeqParallelLinear is not None and MC2RowSeqParallelLinear is not None: + ColumnSequenceParallelLinear = MC2ColumnSeqParallelLinear + RowSequenceParallelLinear = MC2RowSeqParallelLinear +elif get_env_device() == "xpu": + try: + from paddle_xpu.layers.nn import ColumnParallelLinear as XPUColumnParallelLinear + from paddle_xpu.layers.nn import Linear as XPULinear + from paddle_xpu.layers.nn import RowParallelLinear as XPURowParallelLinear + from paddle_xpu.layers.nn.sequence_parallel import ( + XPUColumnSequenceParallelLinear, + XPURowSequenceParallelLinear, + ) + + Linear = XPULinear + ColumnParallelLinear = XPUColumnParallelLinear + RowParallelLinear = XPURowParallelLinear + ColumnSequenceParallelLinear = XPUColumnSequenceParallelLinear + RowSequenceParallelLinear = XPURowSequenceParallelLinear + except ImportError: + # If paddle_xpu is not installed, just use Paddle's native Linear implementations + pass +else: + # By default, use Paddle's native Linear implementations + pass diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index 38f1d244bdf2..97cf780e2447 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -62,10 +62,6 @@ def swiglu(x, y=None): init_name_mappings, ) from paddlenlp.transformers.long_sequence_strategies import LongSequenceStrategies -from paddlenlp.transformers.mc2_parallel_linear import ( - MC2ColumnSeqParallelLinear, - MC2RowSeqParallelLinear, -) from paddlenlp.transformers.model_outputs import ( BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions, @@ -74,6 +70,8 @@ def swiglu(x, y=None): from paddlenlp.utils.log import logger from paddlenlp.utils.tools import get_env_device +from .. import linear_utils +from ..linear_utils import Linear from ..segment_parallel_utils import ReshardLayer from .configuration import ( LLAMA_PRETRAINED_INIT_CONFIGURATION, @@ -211,6 +209,7 @@ def scaled_dot_product_attention( alibi=None, sequence_parallel=False, reshard_layer=None, + npu_is_casual=False, ): bsz, q_len, num_heads, head_dim = query_states.shape _, kv_seq_len, _, _ = value_states.shape @@ -410,6 +409,15 @@ def forward(self, hidden_states): if self.config.use_fused_rms_norm: if get_env_device() == "npu": return core.eager._run_custom_op("rms_norm_npu", hidden_states, self.weight, self.variance_epsilon)[0] + elif get_env_device() == "xpu": + try: + import paddle_xpu_nn # noqa: F821 + + return paddle_xpu_nn.xpu_rms_norm(hidden_states, self.weight, self.variance_epsilon)[0] + except ImportError: + raise NotImplementedError( + f"Implementation of fused_rms_norm is not available on {get_env_device()}. 
Please install paddle_xpu to use this feature" + ) return rms_norm_fused(hidden_states, self.weight, self.variance_epsilon) if paddle.in_dynamic_mode(): @@ -571,15 +579,11 @@ def __init__(self, config): self.fuse_attention_ffn = config.fuse_attention_ffn if config.sequence_parallel: - if MC2ColumnSeqParallelLinear is not None and MC2RowSeqParallelLinear is not None: - ColumnParallelLinear = MC2ColumnSeqParallelLinear - RowParallelLinear = MC2RowSeqParallelLinear - else: - ColumnParallelLinear = ColumnSequenceParallelLinear - RowParallelLinear = RowSequenceParallelLinear + ColumnParallelLinear = linear_utils.ColumnSequenceParallelLinear + RowParallelLinear = linear_utils.RowSequenceParallelLinear else: - ColumnParallelLinear = fleet.meta_parallel.ColumnParallelLinear - RowParallelLinear = fleet.meta_parallel.RowParallelLinear + ColumnParallelLinear = linear_utils.ColumnParallelLinear + RowParallelLinear = linear_utils.RowParallelLinear if config.tensor_parallel_degree > 1: if config.fuse_attention_ffn: @@ -611,15 +615,29 @@ def __init__(self, config): ) else: if config.fuse_attention_ffn: - self.gate_up_fused_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias_attr=False) + self.gate_up_fused_proj = Linear(self.hidden_size, self.intermediate_size * 2, bias_attr=False) else: - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) + self.gate_proj = Linear(self.hidden_size, self.intermediate_size, bias_attr=False) + self.up_proj = Linear(self.hidden_size, self.intermediate_size, bias_attr=False) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias_attr=False) + self.down_proj = Linear(self.intermediate_size, self.hidden_size, bias_attr=False) def forward(self, x): if self.fuse_attention_ffn: + # FIXME(yangjianbang): use paddle's native swiglu + if get_env_device() == "xpu": + try: + import paddle_xpu_nn # noqa: F821 + + out = self.gate_up_fused_proj(x) + out = paddle_xpu_nn.xpu_swiglu(out, axis=-1, turn=True) + out = self.down_proj(out) + return out + except ImportError: + gate_out, up_out = paddle.chunk(self.gate_up_fused_proj(x), chunks=2, axis=-1) + out = self.down_proj(F.silu(gate_out) * up_out) + return out + x = swiglu(self.gate_up_fused_proj(x)) else: x = swiglu(self.gate_proj(x), self.up_proj(x)) @@ -680,7 +698,7 @@ def __init__(self, config: LlamaConfig, layerwise_recompute: bool = False): ) self.use_fused_rope = config.use_fused_rope - if self.use_fused_rope and get_env_device() != "npu": + if self.use_fused_rope and get_env_device() not in ["npu", "xpu"]: if "gpu" not in paddle.device.get_device() or fused_rotary_position_embedding is None: warnings.warn( "Enable fuse rope in the config, but fuse rope is not available. 
" @@ -689,15 +707,11 @@ def __init__(self, config: LlamaConfig, layerwise_recompute: bool = False): self.use_fused_rope = False if config.sequence_parallel: - if MC2ColumnSeqParallelLinear is not None and MC2RowSeqParallelLinear is not None: - ColumnParallelLinear = MC2ColumnSeqParallelLinear - RowParallelLinear = MC2RowSeqParallelLinear - else: - ColumnParallelLinear = ColumnSequenceParallelLinear - RowParallelLinear = RowSequenceParallelLinear + ColumnParallelLinear = linear_utils.ColumnSequenceParallelLinear + RowParallelLinear = linear_utils.RowSequenceParallelLinear else: - ColumnParallelLinear = fleet.meta_parallel.ColumnParallelLinear - RowParallelLinear = fleet.meta_parallel.RowParallelLinear + ColumnParallelLinear = linear_utils.ColumnParallelLinear + RowParallelLinear = linear_utils.RowParallelLinear if config.tensor_parallel_degree > 1: if self.fuse_attention_qkv: @@ -728,12 +742,12 @@ def __init__(self, config: LlamaConfig, layerwise_recompute: bool = False): gather_output=False, ) else: - self.k_proj = nn.Linear( + self.k_proj = Linear( self.hidden_size, self.config.num_key_value_heads * self.head_dim, bias_attr=False, ) - self.v_proj = nn.Linear( + self.v_proj = Linear( self.hidden_size, self.config.num_key_value_heads * self.head_dim, bias_attr=False, @@ -741,23 +755,23 @@ def __init__(self, config: LlamaConfig, layerwise_recompute: bool = False): else: if self.fuse_attention_qkv: - self.qkv_proj = nn.Linear( + self.qkv_proj = Linear( self.hidden_size, self.hidden_size + 2 * self.config.num_key_value_heads * self.head_dim, bias_attr=False, ) else: - self.q_proj = nn.Linear( + self.q_proj = Linear( self.hidden_size, self.hidden_size, bias_attr=False, ) - self.k_proj = nn.Linear( + self.k_proj = Linear( self.hidden_size, self.config.num_key_value_heads * self.head_dim, bias_attr=False, ) - self.v_proj = nn.Linear( + self.v_proj = Linear( self.hidden_size, self.config.num_key_value_heads * self.head_dim, bias_attr=False, @@ -771,7 +785,7 @@ def __init__(self, config: LlamaConfig, layerwise_recompute: bool = False): input_is_parallel=True, ) else: - self.o_proj = nn.Linear( + self.o_proj = Linear( self.hidden_size, self.hidden_size, bias_attr=False, @@ -835,6 +849,7 @@ def forward( output_attentions: bool = False, use_cache: bool = False, alibi: Optional[paddle.Tensor] = None, + npu_is_casual: bool = False, ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: """Input shape: Batch x Time x Channel""" # [bs, seq_len, num_head * head_dim] -> [seq_len / n, bs, num_head * head_dim] (n is model parallelism) @@ -1062,6 +1077,7 @@ def forward( alibi, self.sequence_parallel, reshard_layer=self.reshard_layer, + npu_is_casual=npu_is_casual, ) if output_attentions: attn_output, attn_weights = outputs @@ -1114,6 +1130,7 @@ def forward( past_key_value: Optional[Tuple[paddle.Tensor]] = None, use_cache: Optional[bool] = False, alibi: Optional[paddle.Tensor] = None, + npu_is_casual: bool = False, ) -> Tuple[paddle.Tensor, Optional[Tuple[paddle.Tensor, paddle.Tensor]]]: """ Args: @@ -1161,6 +1178,7 @@ def forward( output_attentions, use_cache, alibi, + npu_is_casual=npu_is_casual, ) if type(outputs) is tuple: @@ -1293,6 +1311,56 @@ def get_tensor_parallel_split_mappings(num_layers): return mappings + @classmethod + def _get_fuse_or_split_param_mappings(cls, config: LlamaConfig, is_fuse=False): + # return parameter fuse utils + from paddlenlp.transformers.conversion_utils import split_or_fuse_func + + fn = split_or_fuse_func(is_fuse=is_fuse) + + # last key is fused 
key, other keys are to be fused. + fuse_qkv_keys = ( + "layers.0.self_attn.q_proj.weight", + "layers.0.self_attn.k_proj.weight", + "layers.0.self_attn.v_proj.weight", + "layers.0.self_attn.qkv_proj.weight", + ) + + fuse_gate_up_keys = ( + "layers.0.mlp.gate_proj.weight", + "layers.0.mlp.up_proj.weight", + "layers.0.mlp.gate_up_fused_proj.weight", + ) + num_heads = config.num_attention_heads + num_key_value_heads = getattr(config, "num_key_value_heads", num_heads) + fuse_attention_qkv = getattr(config, "fuse_attention_qkv", False) + fuse_attention_ffn = getattr(config, "fuse_attention_ffn", False) + + final_actions = {} + if is_fuse: + if fuse_attention_qkv: + for i in range(config.num_hidden_layers): + keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in fuse_qkv_keys]) + final_actions[keys] = partial( + fn, is_qkv=True, num_heads=num_heads, num_key_value_heads=num_key_value_heads + ) + if fuse_attention_ffn: + for i in range(config.num_hidden_layers): + keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in fuse_gate_up_keys]) + final_actions[keys] = fn + else: + if not fuse_attention_qkv: + for i in range(config.num_hidden_layers): + keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in fuse_qkv_keys]) + final_actions[keys] = partial( + fn, split_nums=3, is_qkv=True, num_heads=num_heads, num_key_value_heads=num_key_value_heads + ) + if not fuse_attention_ffn: + for i in range(config.num_hidden_layers): + keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in fuse_gate_up_keys]) + final_actions[keys] = partial(fn, split_nums=2) + return final_actions + def _init_weights(self, layer): """Initialization hook""" if self.config.tensor_parallel_degree > 1: @@ -1419,6 +1487,11 @@ def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values y = paddle.to_tensor(paddle.finfo(dtype).min, dtype="float16") expanded_attn_mask = expanded_attn_mask.astype("float16") expanded_attn_mask = paddle.where(expanded_attn_mask, x, y).astype(dtype) + elif get_env_device() == "xpu": + x = paddle.to_tensor(0.0, dtype=dtype) + y = paddle.to_tensor(paddle.finfo(dtype).min, dtype=dtype) + expanded_attn_mask = expanded_attn_mask.astype(dtype) + expanded_attn_mask = paddle.where(expanded_attn_mask, x, y).astype(dtype) else: expanded_attn_mask = paddle.where(expanded_attn_mask, 0.0, paddle.finfo(dtype).min).astype(dtype) return expanded_attn_mask @@ -1543,6 +1616,7 @@ def forward( attention_mask = self._prepare_decoder_attention_mask( attention_mask, (batch_size, seq_length), cache_length, inputs_embeds.dtype ) # [bs, 1, seq_len, seq_len] + is_casual = False if self.config.use_flash_attention: if get_env_device() != "npu": is_casual = is_casual_mask(attention_mask) @@ -1587,6 +1661,7 @@ def forward( past_key_value, use_cache, alibi=alibi, + npu_is_casual=is_casual, ) # NOTE: clear outdate cache after it has been used for memory saving @@ -1698,6 +1773,15 @@ def __init__(self, config: LlamaConfig): self.weight.is_distributed = True if (vocab_size != config.vocab_size) else False if self.weight.is_distributed: self.weight.split_axis = 1 + if get_env_device() == "xpu": + try: + from paddle_xpu.layers.nn import ( # noqa: F401 + parallel_matmul as xpu_parallel_matmul, + ) + + self.xpu_parallel_matmul = xpu_parallel_matmul() + except ImportError: + self.xpu_parallel_matmul = None def forward(self, hidden_states, tensor_parallel_output=None): if self.config.sequence_parallel: @@ -1711,7 +1795,12 @@ def forward(self, hidden_states, tensor_parallel_output=None): 
if tensor_parallel_output is None: tensor_parallel_output = self.config.tensor_parallel_output - logits = parallel_matmul(hidden_states, self.weight, tensor_parallel_output=tensor_parallel_output) + if get_env_device() == "xpu" and self.xpu_parallel_matmul is not None: + logits = self.xpu_parallel_matmul( + hidden_states, self.weight, tensor_parallel_output=tensor_parallel_output, training=self.training + ) + else: + logits = parallel_matmul(hidden_states, self.weight, tensor_parallel_output=tensor_parallel_output) return logits diff --git a/paddlenlp/transformers/llama/modeling_pp.py b/paddlenlp/transformers/llama/modeling_pp.py index 73600aa6b420..dd2a91814231 100644 --- a/paddlenlp/transformers/llama/modeling_pp.py +++ b/paddlenlp/transformers/llama/modeling_pp.py @@ -210,6 +210,7 @@ class LlamaForCausalLMPipe(PipelinePretrainedModel, PipelineLayer): config_class = LlamaConfig _get_tensor_parallel_mappings = LlamaPretrainedModel._get_tensor_parallel_mappings + _get_fuse_or_split_param_mappings = LlamaPretrainedModel._get_fuse_or_split_param_mappings _init_weights = LlamaPretrainedModel._init_weights _keys_to_ignore_on_load_unexpected = LlamaPretrainedModel._keys_to_ignore_on_load_unexpected diff --git a/paddlenlp/transformers/model_utils.py b/paddlenlp/transformers/model_utils.py index 9c9af9bbc694..722bde20ee70 100644 --- a/paddlenlp/transformers/model_utils.py +++ b/paddlenlp/transformers/model_utils.py @@ -108,9 +108,6 @@ def unwrap_optimizer(optimizer, optimizer_instances=()): if is_safetensors_available(): - - # from safetensors import safe_open - # from safetensors.numpy import load_file as safe_load_file from safetensors.numpy import save_file as safe_save_file from paddlenlp.utils.safetensors import fast_load_file as safe_load_file @@ -1841,6 +1838,25 @@ def _find_mismatched_keys( del state_dict[checkpoint_key] return mismatched_keys + def _fuse_or_split_keys( + state_dict, config, loaded_keys, pre_tensor_parallel_split=False, resume_state_dict=None + ): + if resume_state_dict is not None: + state_dict.update(resume_state_dict) + + before_fuse_keys = list(state_dict.keys()) + if pre_tensor_parallel_split: + tp_actions = cls.get_tensor_parallel_convert_actions(config, loaded_keys, ignore_error=True) + else: + tp_actions = None + state_dict, resume_state_dict = cls.convert_fuse_and_split(config, state_dict, tp_actions) + after_fuse_keys = list(state_dict.keys()) + + fused_keys = list(set(before_fuse_keys) - set(after_fuse_keys)) + new_keys = list(set(after_fuse_keys) - set(before_fuse_keys)) + + return state_dict, resume_state_dict, fused_keys, new_keys + if state_dict is not None: # DONT Hold tensor parallel here, only hold afer load state dict. 
# Whole checkpoint @@ -1850,6 +1866,16 @@ def _find_mismatched_keys( state_dict = ft_decoding.get_ft_para_conf().fit_partial_model(model_to_load, state_dict) + # have loaded all state_dict, no resume state_dict + state_dict, _, fused_keys, new_keys = _fuse_or_split_keys( + state_dict, + config, + loaded_keys, + pre_tensor_parallel_split=True if config.tensor_parallel_degree > 1 else False, + ) + missing_keys = list(set(missing_keys) - set(new_keys)) + unexpected_keys = list(set(unexpected_keys) - set(fused_keys)) + mismatched_keys = _find_mismatched_keys( state_dict, model_state_dict, @@ -1881,7 +1907,7 @@ def _find_mismatched_keys( error_msgs = [] mismatched_keys = [] - + resume_state_dict = {} if len(resolved_archive_file) > 1: resolved_archive_file = tqdm(resolved_archive_file, desc="Loading checkpoint shards") @@ -1894,13 +1920,42 @@ def _find_mismatched_keys( ): pre_tensor_parallel_split = True assert loaded_keys is not None, "loaded_keys is not None." - tp_actions = cls.get_tensor_parallel_convert_actions(config, loaded_keys) + tp_actions = cls.get_tensor_parallel_convert_actions(config, loaded_keys, ignore_error=True) # Here we use expected_keys to optimize weights loading for pipeline model. Only works for safetensors + filter_dict_keys = set(expected_keys) + fuse_actions, _ = cls.get_fuse_or_split_param_convert_actions(config, loaded_keys, is_fuse=True) + split_actions, _ = cls.get_fuse_or_split_param_convert_actions(config, loaded_keys, is_fuse=False) + for k in list(fuse_actions.keys()): + need_add_except_key = k[-1] in expected_keys + if need_add_except_key: + filter_dict_keys |= set(k[:-1]) + for k in list(split_actions.keys()): + need_add_except_key = False + for item in k[:-1]: + if item in expected_keys: + need_add_except_key = True + break + if need_add_except_key: + filter_dict_keys.add(k[-1]) + + if config.quantization_config.is_weight_quantize(): + filter_dict_keys = None + state_dict = load_state_dict( - shard_file, - tp_actions if pre_tensor_parallel_split else None, - None if config.quantization_config.is_weight_quantize() else set(expected_keys), + shard_file, tp_actions if pre_tensor_parallel_split else None, filter_dict_keys + ) + + # convert for fusing or splitting weights + state_dict, resume_state_dict, fused_keys, new_keys = _fuse_or_split_keys( + state_dict, + config, + loaded_keys, + pre_tensor_parallel_split=pre_tensor_parallel_split, + resume_state_dict=resume_state_dict, ) + missing_keys = list(set(missing_keys) - set(new_keys)) + unexpected_keys = list(set(unexpected_keys) - set(fused_keys)) + if config.quantization_config.is_weight_quantize(): state_dict = convert_to_quantize_state_dict( state_dict, diff --git a/paddlenlp/transformers/opt/configuration.py b/paddlenlp/transformers/opt/configuration.py index 866da043198e..3f6f23c1c65d 100644 --- a/paddlenlp/transformers/opt/configuration.py +++ b/paddlenlp/transformers/opt/configuration.py @@ -146,6 +146,8 @@ def __init__( eos_token_id=2, enable_bias: bool = True, mp_degree: int = 1, + fuse_attention_qkv=False, + fuse_attention_ffn=False, **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) @@ -165,3 +167,6 @@ def __init__( self.enable_bias = enable_bias self.mp_degree = mp_degree + + self.fuse_attention_qkv = fuse_attention_qkv + self.fuse_attention_ffn = fuse_attention_ffn diff --git a/paddlenlp/transformers/opt/modeling.py b/paddlenlp/transformers/opt/modeling.py index c9217f316415..bf1cec55eb16 100644 --- 
a/paddlenlp/transformers/opt/modeling.py +++ b/paddlenlp/transformers/opt/modeling.py @@ -649,6 +649,49 @@ def _get_tensor_parallel_mappings(cls, config: OPTConfig, is_split=True): return actions + @classmethod + def _get_fuse_or_split_param_mappings(cls, config: OPTConfig, is_fuse=False): + # return parameter fuse utils + from paddlenlp.transformers.conversion_utils import split_or_fuse_func + + fn = split_or_fuse_func(is_fuse=is_fuse) + + # last key is fused key, other keys are to be fused. + fuse_qkv_keys = ( + "decoder.layers.0.self_attn.q_proj.weight", + "decoder.layers.0.self_attn.k_proj.weight", + "decoder.layers.0.self_attn.v_proj.weight", + "decoder.layers.0.self_attn.qkv_proj.weight", + ) + fuse_qkv_bias_keys = ( + "decoder.layers.0.self_attn.q_proj.bias", + "decoder.layers.0.self_attn.k_proj.bias", + "decoder.layers.0.self_attn.v_proj.bias", + "decoder.layers.0.self_attn.qkv_proj.bias", + ) + num_heads = config.num_attention_heads + num_key_value_heads = getattr(config, "num_key_value_heads", num_heads) + fuse_attention_qkv = getattr(config, "fuse_attention_qkv", False) + + final_actions = {} + if is_fuse: + if fuse_attention_qkv: + for i in range(config.num_hidden_layers): + for keys in [fuse_qkv_keys, fuse_qkv_bias_keys]: + new_keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in keys]) + final_actions[new_keys] = partial( + fn, is_qkv=True, num_heads=num_heads, num_key_value_heads=num_key_value_heads + ) + else: + if not fuse_attention_qkv: + for i in range(config.num_hidden_layers): + for keys in [fuse_qkv_keys, fuse_qkv_bias_keys]: + new_keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in keys]) + final_actions[new_keys] = partial( + fn, split_nums=3, is_qkv=True, num_heads=num_heads, num_key_value_heads=num_key_value_heads + ) + return final_actions + @classmethod def _get_name_mappings(cls, config: OPTConfig) -> list[StateDictNameMapping]: mappings: list[StateDictNameMapping] = [] diff --git a/tests/transformers/test_conversion_common.py b/tests/transformers/test_conversion_common.py new file mode 100644 index 000000000000..d04929a7c7dd --- /dev/null +++ b/tests/transformers/test_conversion_common.py @@ -0,0 +1,258 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import copy +import glob +import os +import tempfile +import unittest + +import paddle + +input_ids = paddle.to_tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) + + +def prepare_default_config(config): + config = copy.deepcopy(config) + config.hidden_size = 512 + config.num_layers = 2 + config.num_hidden_layers = 2 + config.num_attention_heads = 16 + config.num_key_value_heads = 16 + config.intermediate_size = config.hidden_size + config.word_embed_proj_dim = 512 + return config + + +def prepare_split_config(config): + config = prepare_default_config(config) + config = copy.deepcopy(config) + config.fuse_attention_qkv = False + config.fuse_attention_ffn = False + return config + + +def prepare_fuse_config(config): + config = prepare_default_config(config) + config = copy.deepcopy(config) + config.fuse_attention_qkv = True + config.fuse_attention_ffn = True + return config + + +def common_test_load(model_class, model_first, config_second, tempdir): + model_first.eval() + with paddle.no_grad(): + first = model_first(input_ids)[0] + + model_second = model_class.from_pretrained(tempdir, config=config_second) + model_second.eval() + with paddle.no_grad(): + second = model_second(input_ids)[0] + + assert paddle.allclose(paddle.mean(first), paddle.mean(second), atol=1e-5) + # assert paddle.allclose(first, second, atol=1e-4) + + files = glob.glob(tempdir + "/*") + for f in files: + os.remove(f) + + +def common_test_save_and_load(config_first, config_second, model_class): + model_first = model_class.from_config(config_first) + + with tempfile.TemporaryDirectory() as tempdir: + # test load pdparams: model.pdparams + model_first.save_pretrained(save_dir=tempdir) + common_test_load(model_class, model_first, config_second, tempdir) + + # test load shard pdparams: model-001-0f-008.pdparams + model_first.save_pretrained(tempdir, max_shard_size="5MB") + common_test_load(model_class, model_first, config_second, tempdir) + + # test save safetensors: model.safetensors + model_first.save_pretrained(tempdir, safe_serialization=True) + common_test_load(model_class, model_first, config_second, tempdir) + + # test load shard safetensors: model-001-0f-008.safetensors + model_first.save_pretrained(tempdir, max_shard_size="5MB", safe_serialization=True) + common_test_load(model_class, model_first, config_second, tempdir) + + +def _test_split_to_fuse(config_class, model_class): + config = config_class() + + config_split = prepare_split_config(config) + config_fuse = prepare_fuse_config(config) + + # Test from splitted weights to fused weight + common_test_save_and_load(config_split, config_fuse, model_class) + + +def _test_fuse_to_split(config_class, model_class): + config = config_class() + + config_split = prepare_split_config(config) + config_fuse = prepare_fuse_config(config) + + # Test from fused weight to splitted weights + common_test_save_and_load(config_fuse, config_split, model_class) + + +def _test_fast_ffn(): + from functools import partial + + import paddle + from paddle import nn + + from paddlenlp.transformers import PretrainedModel + from paddlenlp.transformers.configuration_utils import PretrainedConfig + + class TestConfig(PretrainedConfig): + def __init__(self, fast_ffn_state=False, convert_fast_ffn=False): + self.fast_ffn_state = fast_ffn_state + self.convert_fast_ffn = convert_fast_ffn + super().__init__() + + class TestMLP(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.gate_up_fused_proj = 
nn.Linear(self.hidden_size, self.hidden_size * 2, bias_attr=True) + + def forward(self, hidden_state): + hidden_state = self.gate_up_fused_proj(hidden_state) + if self.config.use_fast_ffn: + x, y = paddle.chunk(hidden_state, chunks=2, axis=-1) + else: + x, y = hidden_state[..., ::2], hidden_state[..., 1::2] + + return nn.functional.silu(x) * y + + class TestPretrainedModel(PretrainedModel): + config_class = TestConfig + + @classmethod + def _get_fuse_or_split_param_mappings(cls, config: TestConfig, is_fuse=False): + + # user defined function to get convert param mappings + def convert_fast_ffn_fn(fuse_params, convert_fast_ffn=False): + import numpy as np + + concat_fn = np.concatenate + if isinstance(fuse_params[0], paddle.Tensor): + concat_fn = paddle.concat + + if convert_fast_ffn: + # fast_ffn + first = fuse_params[0][..., ::2] + second = fuse_params[0][..., 1::2] + return concat_fn([first, second], axis=-1) + + fn = convert_fast_ffn_fn + + convert_fast_ffn_keys = ( + "layers.0.gate_up_fused_proj.weight", + "layers.0.gate_up_fused_proj.weight", + ) + convert_fast_ffn_bias_keys = ( + "layers.0.gate_up_fused_proj.bias", + "layers.0.gate_up_fused_proj.bias", + ) + fast_ffn_state = getattr(config, "fast_ffn_state", False) + convert_fast_ffn = getattr(config, "convert_fast_ffn", False) + convert_fast_ffn &= not fast_ffn_state + + final_actions = {} + if is_fuse: + # for_get_fuse_or_split_param_mappings, is_fuse have two conditions, true and false, + # to fit partial fuse or split conditions, is_fuse will called twice(True and False). + # thus, for this func, we only use one condition. + + # use_fast_ffn only in one condition + # convert when use_fast_ffn is False + if convert_fast_ffn: + for i in range(config.num_hidden_layers): + for keys in [convert_fast_ffn_keys, convert_fast_ffn_bias_keys]: + keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in keys]) + final_actions[keys] = partial(fn, convert_fast_ffn=convert_fast_ffn) + return final_actions + + def _init_weights(self, layer): + if isinstance(layer, (nn.Linear, nn.Embedding)): + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value(paddle.tensor.normal(mean=0.0, std=1.0, shape=layer.weight.shape)) + if hasattr(layer, "bias") and isinstance(layer.bias, paddle.Tensor): + layer.bias.set_value(paddle.tensor.normal(mean=0.0, std=1.0, shape=layer.bias.shape)) + + class TestModel(TestPretrainedModel): + def __init__(self, config): + super().__init__(config) + self.layers = nn.LayerList([TestMLP(config=config) for i in range(config.num_hidden_layers)]) + + def forward(self, hidden_state): + for idx, (decoder_layer) in enumerate(self.layers): + hidden_state = decoder_layer(hidden_state) + return hidden_state + + class TestForCausalLM(TestPretrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + self.embedding_layer = nn.Embedding(65535, self.config.hidden_size) + self.test = TestModel(config=config) + + def forward(self, input_ids): + hidden_state = self.embedding_layer(input_ids) + return self.test(hidden_state) + + config = TestConfig() + config = prepare_default_config(config) + config_no_fast_ffn = copy.deepcopy(config) + config_fast_ffn = copy.deepcopy(config) + + config_no_fast_ffn.use_fast_ffn = False + + config_fast_ffn.use_fast_ffn = True + config_fast_ffn.fast_ffn_state = False + config_fast_ffn.convert_fast_ffn = True + + common_test_save_and_load(config_no_fast_ffn, config_fast_ffn, TestForCausalLM) + + +from paddlenlp.transformers import ( + GPTConfig, + 
GPTForCausalLM, + LlamaConfig, + LlamaForCausalLM, + OPTConfig, + OPTForCausalLM, +) + + +class TestFuseOrSplit(unittest.TestCase): + def test_model_split_to_fuse(self): + _test_split_to_fuse(LlamaConfig, LlamaForCausalLM) + _test_split_to_fuse(GPTConfig, GPTForCausalLM) + _test_split_to_fuse(OPTConfig, OPTForCausalLM) + + def test_model_fuse_to_split(self): + _test_fuse_to_split(LlamaConfig, LlamaForCausalLM) + _test_fuse_to_split(GPTConfig, GPTForCausalLM) + _test_fuse_to_split(OPTConfig, OPTForCausalLM) + + def test_model_convert_fast_ffn(self): + _test_fast_ffn() From 82a71775424043042a8c672cd1e9fc09348fd594 Mon Sep 17 00:00:00 2001 From: Siming Dai <908660116@qq.com> Date: Wed, 29 May 2024 16:50:59 +0800 Subject: [PATCH 20/27] quick fix os.path.split (#8508) --- paddlenlp/transformers/model_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlenlp/transformers/model_utils.py b/paddlenlp/transformers/model_utils.py index 722bde20ee70..f56ad381f36e 100644 --- a/paddlenlp/transformers/model_utils.py +++ b/paddlenlp/transformers/model_utils.py @@ -1916,7 +1916,7 @@ def _fuse_or_split_keys( if ( shard_file.endswith(".safetensors") and config.tensor_parallel_degree > 1 - and "tp" not in os.path.spilt(shard_file)[-1] + and "tp" not in os.path.split(shard_file)[-1] ): pre_tensor_parallel_split = True assert loaded_keys is not None, "loaded_keys is not None." From 4d33655aa064a70c878983d5f2e05dc1d30dc2fc Mon Sep 17 00:00:00 2001 From: Ferrebo Date: Mon, 3 Jun 2024 17:44:02 +0800 Subject: [PATCH 21/27] [fea] Cherry-picked MOE updates from develop (#8531) * [fea] moe support (#8498) Co-authored-by: kebo01 * [fix] Broadcast optimizer state using broadcast_dp without shard-reshard. (#8522) --- docs/trainer.md | 4 + paddlenlp/trainer/trainer.py | 104 +++++++++++++++------- paddlenlp/trainer/training_args.py | 30 ++++++- paddlenlp/trainer/utils/helper.py | 59 ++++++++++++ paddlenlp/trainer/utils/reshard/common.py | 22 ++++- paddlenlp/trainer/utils/sharding_io.py | 24 +++-- 6 files changed, 198 insertions(+), 45 deletions(-) diff --git a/docs/trainer.md b/docs/trainer.md index beab064bdf22..23139ed6102d 100644 --- a/docs/trainer.md +++ b/docs/trainer.md @@ -705,4 +705,8 @@ Trainer 是一个简单,但功能完整的 Paddle训练和评估模块,并 Whether use flatten_param_grads method in optimizer, only used on NPU devices.(default:False) + --use_expert_parallel + Whether to enable MoE (Mixture of Experts) expert parallel training. 
+ (default: False) + ``` diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py index f507b5c8b92f..116c3451f95f 100644 --- a/paddlenlp/trainer/trainer.py +++ b/paddlenlp/trainer/trainer.py @@ -143,6 +143,7 @@ from .utils import reshard as reshard_util from .utils.helper import ( # nested_truncate, broadcast_dp_optimizer, + broadcast_moe_optimizer, distributed_concat, distributed_file, distributed_isfile, @@ -565,7 +566,7 @@ def _load_from_checkpoint(self, resume_from_checkpoint=None): ) self.model.set_state_dict(state_dict) else: - if resume_from_checkpoint is not None and self.args.dataset_rank == 0: + if resume_from_checkpoint is not None and (self.args.dataset_rank == 0 or self.args.use_expert_parallel): weights_file = os.path.join( resume_from_checkpoint, _add_variant(weight_name, self.args.weight_name_suffix) @@ -581,7 +582,7 @@ def _load_from_checkpoint(self, resume_from_checkpoint=None): weights_index_file, ] ): - raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint}") + raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint} -- {weights_file}") logger.info(f"Loading model from {resume_from_checkpoint} .") @@ -930,22 +931,17 @@ def _inner_training_loop( self.control = self.callback_handler.on_step_begin(args, self.state, self.control) self.timers and self.timers("forward-backward").start() - dp_enabled = ( - self.args.data_parallel_degree > 1 if self.args.use_hybrid_parallel else args.local_rank != -1 - ) - forbidden_no_sync = False # stage2 and stage3 should not no_sync, because the is no DDP wrapper and no_sync API # hybrid_parallel (tp or pp or sharding stage 1) should not no_sync - if self.args.use_hybrid_parallel: - forbidden_no_sync = True - - availiable_no_sync = dp_enabled and not forbidden_no_sync - + availiable_no_sync = hasattr(model, "no_sync") is_no_sync = ( - ((step_control + 1) % args.gradient_accumulation_steps != 0) - and availiable_no_sync - and args._no_sync_in_gradient_accumulation - ) or (args.recompute and availiable_no_sync) + ( + ((step_control + 1) % args.gradient_accumulation_steps != 0) + and args._no_sync_in_gradient_accumulation + ) + or args.recompute + or args.use_expert_parallel + ) and availiable_no_sync # sharding # stage1. the same as ddp # stage2. manualy collect gradient on dp group @@ -965,6 +961,14 @@ def _inner_training_loop( tr_loss += tr_loss_step + def fused_allreduce_gradients_no_sync(paramlist, hcg): + paramlist = list(paramlist) + nonmoe_list = [p for p in paramlist if not getattr(p, "no_sync", False)] + moelist = [p for p in paramlist if getattr(p, "no_sync", False)] + if moelist and not self.args.use_expert_parallel: + logger.warning("found `no sync` param when `use_expert_parallel=False`") + fused_allreduce_gradients(nonmoe_list, hcg) + if (step_control + 1) % args.gradient_accumulation_steps == 0 or ( # last step in epoch but step is always smaller than gradient_accumulation_steps steps_in_epoch <= args.gradient_accumulation_steps @@ -983,12 +987,12 @@ def _inner_training_loop( # Case 1: Use recompute and dp / sharding stage1, # manualy collect gradient for dp. 
- if args.recompute and availiable_no_sync: - fused_allreduce_gradients(list(model.parameters()), None) + if (args.recompute or args.use_expert_parallel) and availiable_no_sync: + fused_allreduce_gradients_no_sync(list(model.parameters()), None) # Case 2: hack dp with master_grad - if dp_master_grad and not (args.recompute and availiable_no_sync): - fused_allreduce_gradients(list(model.parameters()), None) + elif dp_master_grad: + fused_allreduce_gradients_no_sync(list(model.parameters()), None) # Pipeline parallel mode, handle gradient reduce here to overlap pipeline_parallel_config = ( @@ -1007,8 +1011,7 @@ def _inner_training_loop( self.optimizer._inner_opt.reduce_gradients(list(parameters_list), self.optimizer._hcg) if self.optimizer._dp_enable or getattr(self.optimizer, "_sep_enable", False): - fused_allreduce_gradients(list(parameters_list), self.optimizer._hcg) - + fused_allreduce_gradients_no_sync(list(parameters_list), self.optimizer._hcg) self.timers and self.timers("all-reduce").stop() self.timers and self.timers("optimizer-step").start() @@ -1028,6 +1031,8 @@ def _inner_training_loop( ) optimizer_was_run = True if self.do_grad_scaling: + if args.pipeline_parallel_degree > 1: + assert not self.args.use_expert_parallel, "pipeline moe not work under fp16" scale_before = paddle.assign(self.scaler._scale) self.scaler.step(self.optimizer) self.scaler.update() @@ -2042,7 +2047,6 @@ def training_step(self, model: nn.Layer, inputs: Dict[str, Union[paddle.Tensor, model.train() inputs = self._prepare_inputs(inputs) - with self.autocast_smart_context_manager(): loss = self.compute_loss(model, inputs) @@ -2053,7 +2057,6 @@ def training_step(self, model: nn.Layer, inputs: Dict[str, Union[paddle.Tensor, self.scaler.scale(loss).backward() else: loss.backward() - return loss.detach() def training_pipeline_step(self, model: nn.Layer, inputs: Dict[str, Union[paddle.Tensor, Any]]) -> paddle.Tensor: @@ -2143,6 +2146,26 @@ def save_model(self, output_dir: Optional[str] = None, merge_tensor_parallel: Op # For ckpt integrity paddle.save(self.state.global_step, os.path.join(output_dir, ".model_done")) + def _filter_moe_no_sync_optimizer_params(self): + """ + filter optimizer params which should not sync + """ + state_dict = self.model.state_dict() + optimzier_state_dict = self.optimizer.state_dict() + filter_optimzier_state_dict = OrderedDict() + param_names_in_master_weights = list(optimzier_state_dict["master_weights"].keys()) if self.args.bf16 else [] + filter_optimzier_state_dict["master_weights"] = OrderedDict() + for k, v in state_dict.items(): + if getattr(v, "no_sync", False): + if v.name in param_names_in_master_weights: + filter_optimzier_state_dict["master_weights"][v.name] = optimzier_state_dict["master_weights"][ + v.name + ] + for op_k, op_v in optimzier_state_dict.items(): + if op_k.startswith(v.name): + filter_optimzier_state_dict[op_k] = op_v + return filter_optimzier_state_dict + def _save_checkpoint(self, model, metrics=None): # assert unwrap_model(model) is self.model, "internal model should be a reference to self.model" self.runtime_timer.start("checkpoint saving time") @@ -2165,7 +2188,7 @@ def _save_checkpoint(self, model, metrics=None): optimizer_name = _add_variant(OPTIMIZER_NAME, self.args.optimizer_name_suffix) if self.args.use_hybrid_parallel: - if self.dp_group.rank <= 0: + if self.dp_group.rank <= 0 or self.args.use_expert_parallel: os.makedirs(output_dir, exist_ok=True) logger.info("Saving optimizer files.") if self.args.unified_checkpoint: @@ -2177,12 +2200,18 @@ def 
_save_checkpoint(self, model, metrics=None): safe_serialization=True, ) else: - self._save_ckpt_func( - self.optimizer.state_dict(), - os.path.join(output_dir, optimizer_name), - ) + if self.dp_group.rank > 0: # this should only work for MoE saving + self._save_ckpt_func( + self._filter_moe_no_sync_optimizer_params(), + os.path.join(output_dir, optimizer_name), + ) + else: + self._save_ckpt_func( + self.optimizer.state_dict(), + os.path.join(output_dir, optimizer_name), + ) - if self.args.should_save: + if self.args.should_save or self.args.use_expert_parallel: if not self.args.use_hybrid_parallel: logger.info("Saving optimizer files.") if self.args.unified_checkpoint: @@ -2194,7 +2223,12 @@ def _save_checkpoint(self, model, metrics=None): safe_serialization=True, ) else: - self._save_ckpt_func(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME)) + if self.args.data_parallel_rank > 0 and self.args.use_expert_parallel: + self._save_ckpt_func( + self._filter_moe_no_sync_optimizer_params(), os.path.join(output_dir, OPTIMIZER_NAME) + ) + else: + self._save_ckpt_func(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME)) # FIXME: maybe only save one copy paddle.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME)) @@ -2452,7 +2486,7 @@ def _load_optimizer_and_scheduler(self, checkpoint): logger.info("Loading checkpoint, the next checkpoint will be saved as unified checkpoint") if not use_unified_checkpoint: - if self.args.data_parallel_rank == 0: + if self.args.data_parallel_rank == 0 or self.args.use_expert_parallel: optimizer_name = _add_variant(OPTIMIZER_NAME, self.args.optimizer_name_suffix) path = os.path.join(checkpoint, optimizer_name) if os.path.isfile(path): @@ -2476,7 +2510,13 @@ def _load_optimizer_and_scheduler(self, checkpoint): # broadcast optimizer state in dp group if self.args.local_rank != -1: dist.barrier() - opt_state_dict = broadcast_dp_optimizer(opt_state_dict) + if self.args.use_expert_parallel: + opt_state_dict = broadcast_moe_optimizer( + opt_state_dict, broadcast_dp=not self.args.should_load_sharding_stage1_model + ) + else: + if not self.args.should_load_sharding_stage1_model: + opt_state_dict = broadcast_dp_optimizer(opt_state_dict) if opt_state_dict is not None: # Load in optimizer and scheduler states diff --git a/paddlenlp/trainer/training_args.py b/paddlenlp/trainer/training_args.py index 3118178608d2..f825c308ebb8 100644 --- a/paddlenlp/trainer/training_args.py +++ b/paddlenlp/trainer/training_args.py @@ -791,6 +791,10 @@ class TrainingArguments: default=False, metadata={"help": "whether to output logits in distributed status"}, ) + use_expert_parallel: Optional[bool] = field( + default=False, + metadata={"help": "Enable MoE (Mixture of Experts) expert parallel training"}, + ) def __post_init__(self): env_local_rank = int(os.environ.get("PADDLE_RANK_IN_NODE", -1)) @@ -1117,6 +1121,8 @@ def is_segment_parallel_supported(): order = ["dp", "sharding", "pp", "sep", "mp"] else: order = ["dp", "sharding", "pp", "mp"] + if self.use_expert_parallel: + order = order[1:-1] + ["dp", "mp"] if is_segment_parallel_supported(): hybrid_configs = { @@ -1598,9 +1604,12 @@ def optimizer_name_suffix(self): if self.sharding_parallel_degree > 1: assert self.sharding_parallel_degree < 100, "sharding parallel degree should be less than 100." 
name.append(f"shard{self.sharding_parallel_rank:0>2d}") - + if self.use_expert_parallel: + name.append(f"moe{self.data_parallel_rank:0>2d}") return "_".join(name) else: + if self.use_expert_parallel: + return f"moe{self.data_parallel_rank:0>2d}" return None @property @@ -1613,12 +1622,16 @@ def weight_name_suffix(self): if self.pipeline_parallel_degree > 1: assert self.pipeline_parallel_degree < 100, "tensor parallel rank should be less than 100." name.append(f"pp{self.pipeline_parallel_rank:0>2d}") + if self.use_expert_parallel: + name.append(f"moe{self.data_parallel_rank:0>2d}") return "_".join(name) else: + if self.use_expert_parallel: + return f"moe{self.data_parallel_rank:0>2d}" return None - def sharded_name_suffix(self, shard_id=None, pp_id=None): + def sharded_name_suffix(self, shard_id=None, pp_id=None, moe_id=None): if self.use_hybrid_parallel: name = [] if self.tensor_parallel_degree > 1: @@ -1636,8 +1649,17 @@ def sharded_name_suffix(self, shard_id=None, pp_id=None): assert isinstance(shard_id, int) assert shard_id < 100, "shard_id should be less than 100." name.append(f"shard{shard_id:0>2d}") + if self.use_expert_parallel: + if moe_id is None: + moe_id = self.data_parallel_rank + assert isinstance(moe_id, int) + name.append(f"moe{moe_id:0>2d}") return "_".join(name) else: + if self.use_expert_parallel: + if moe_id is None: + moe_id = self.data_parallel_rank + return self._format_name("moe", moe_id, self.data_parallel_degree) return None @property @@ -1730,9 +1752,9 @@ def should_save_model_state(self): return True elif self.use_hybrid_parallel: # save on dataset rank 0 - return self.sharding_parallel_rank == 0 and self.data_parallel_rank == 0 + return self.sharding_parallel_rank == 0 and (self.data_parallel_rank == 0 or self.use_expert_parallel) else: - return self.process_index == 0 + return self.process_index == 0 or self.use_expert_parallel @property def _no_sync_in_gradient_accumulation(self): diff --git a/paddlenlp/trainer/utils/helper.py b/paddlenlp/trainer/utils/helper.py index 25f593f71e35..8e4c22e908f5 100644 --- a/paddlenlp/trainer/utils/helper.py +++ b/paddlenlp/trainer/utils/helper.py @@ -226,3 +226,62 @@ def broadcast_dp_optimizer(state_dict): state_dict = nested_broadcast_tensor(state_dict, src=src_rank, group=dp_group) return state_dict + + +def broadcast_moe_optimizer(state_dict, broadcast_dp=True): + + try: + hcg = fleet.get_hybrid_communicate_group() + dp_group = hcg.get_data_parallel_group() + src_rank = hcg.get_data_parallel_group_src_rank() + data_parallel_rank = hcg.get_data_parallel_rank() + # Don't broadcast optimizer for dp rank is 1. 
+ if dp_group.nranks <= 1: + return state_dict + except: + dp_group = None + src_rank = 0 + data_parallel_rank = 0 + + def _broadcast_moe_optimizer_state(state_dict): + # boardcast_keys + base_state_dict = {"master_weights": {}} + buf = [ + {i: j.shape for i, j in state_dict.items() if i not in ["master_weights", "LR_Scheduler"]}, + {i: j.shape for i, j in state_dict["master_weights"].items()}, + {"LR_Scheduler": state_dict.get("LR_Scheduler", {})}, + ] + + dist.broadcast_object_list(buf, src=src_rank, group=dp_group) + # logger.info(f"moe-optimizer-gather-keys{buf}") + for k, s in buf[0].items(): + v = state_dict.get(k, paddle.zeros(s, "float32")).cuda() + v.name = k + # k = k.replace("_fp32_master_0", "") + dist.broadcast(v, src=src_rank, group=dp_group) + logger.info(f"broadcast moe optimizer {k} from {src_rank}") + base_state_dict[k] = v.cpu() + for k, s in buf[1].items(): + v = state_dict["master_weights"].get(k, paddle.zeros(s, "float32")).cuda() + v.name = k + dist.broadcast(v, src=src_rank, group=dp_group) + logger.info(f"broadcast moe optimizer-master_weights {k} from {src_rank}") + base_state_dict["master_weights"][k] = v.cpu() + base_state_dict.update(buf[2]) + return base_state_dict + + if broadcast_dp: + base_state_dict = broadcast_dp_optimizer(state_dict) + else: + base_state_dict = _broadcast_moe_optimizer_state(state_dict) + if data_parallel_rank > 0: + master_weight = state_dict.pop("master_weights", {}) + base_state_dict.update(state_dict) + if master_weight: + if "master_weights" in base_state_dict: + base_state_dict["master_weights"].update(master_weight) + else: + base_state_dict["master_weights"] = master_weight + state_dict = base_state_dict + del base_state_dict + return state_dict diff --git a/paddlenlp/trainer/utils/reshard/common.py b/paddlenlp/trainer/utils/reshard/common.py index cc834862e299..66e3c3569916 100644 --- a/paddlenlp/trainer/utils/reshard/common.py +++ b/paddlenlp/trainer/utils/reshard/common.py @@ -266,6 +266,16 @@ def _opt_name_to_tname(tensor_names, opt_names): all_names.extend(opt_names) all_names.sort() pre_t_name = "" + suffix = [ + "_fp32_master_0_beta1_pow_acc_0", + "_fp32_master_0_beta2_pow_acc_0", + "_fp32_master_0_moment1_0", + "_fp32_master_0_moment2_0", + "_beta1_pow_acc_0", + "_beta2_pow_acc_0", + "_moment1_0", + "_moment2_0", + ] opt_to_t = {} for n in all_names: if n in tensor_names: @@ -274,6 +284,16 @@ def _opt_name_to_tname(tensor_names, opt_names): else: assert pre_t_name opt_to_t[n] = pre_t_name + + for t in opt_names: + _find = False + for s in suffix: + if t.endswith(s): + logger.info(f"{t}-{t[:-len(s)]}--{t[:-len(s)] in tensor_names}") + opt_to_t[t] = t[: -len(s)] + _find = True + break + assert _find return opt_to_t if structure_name_mapping is not None: @@ -291,7 +311,7 @@ def _opt_name_to_tname(tensor_names, opt_names): (self._model_weights, model_weights_tmp) = (model_weights_tmp, self._model_weights) for k in list(model_weights_tmp.keys()): t_name = structure_name_mapping[k] - self._model_weights[(k, t_name)] = model_weights_tmp[k].cpu() + self._model_weights[(k, t_name)] = paddle.to_tensor(model_weights_tmp[k]).cpu() del model_weights_tmp[k] # opt diff --git a/paddlenlp/trainer/utils/sharding_io.py b/paddlenlp/trainer/utils/sharding_io.py index 56f4c426ce0a..4fe55d175005 100644 --- a/paddlenlp/trainer/utils/sharding_io.py +++ b/paddlenlp/trainer/utils/sharding_io.py @@ -67,11 +67,14 @@ def filter_sharded_params(state_dict, optimizer, sharding_group): if reshard_util.get_sharding_strategy(optimizer) == 
reshard_util.SHARDING_STRATEGY_V1: optimizer = unwrap_optimizer(optimizer, DygraphShardingOptimizer) for (k, v) in state_dict.items(): - assert v.name in optimizer._param2rank - sharded_rank = optimizer._param2rank[v.name] - if sharded_rank != sharding_rank: - continue - filtered_state_dict[k] = v + if v.name in optimizer._param2rank: + sharded_rank = optimizer._param2rank[v.name] + if sharded_rank != sharding_rank: + continue + filtered_state_dict[k] = v + else: + if sharding_rank == 0: + filtered_state_dict[k] = v else: optimizer = unwrap_optimizer(optimizer, DygraphShardingOptimizerV2) parameters = optimizer._parameter_list @@ -352,7 +355,7 @@ def manipulate_state_dict_and_config(self, model_to_save, merge_tensor_parallel= ) logger.info( "param_names_in_master_weights len:{}, bf16 state_dict len:{}, :{}".format( - len(param_names_in_master_weights), len(state_dict), state_dict + len(param_names_in_master_weights), len(state_dict), state_dict.keys() ) ) return state_dict, config_to_save, weight_name_suffix @@ -444,12 +447,17 @@ def filter_func(name): master_weights = reshard_util.all_gather_state_dict(master_weights, filter_func, self.sharding_group) model_state_dict = self.model.state_dict() + logger.info(f"state-dict-keys: {state_dict.keys()}, nums: {len(state_dict.keys())}") logger.info("before recover, model_state_dict number: {}".format(len(model_state_dict))) for key, param in model_state_dict.items(): if param.name in master_weights: assert param.shape == master_weights[param.name].shape - paddle.assign(master_weights[param.name].cuda(), model_state_dict[key]) - + paddle.assign(paddle.cast(master_weights[param.name].cuda(), paddle.bfloat16), model_state_dict[key]) + elif key in state_dict: + logger.info(f"key: {key} is in state_dict, but not in master_weights") + paddle.assign(state_dict[key], model_state_dict[key]) + else: + logger.info(f"key: {key} is not in state_dict and master_weights") logger.info("after recover, casted model_state_dict number: {}".format(len(model_state_dict))) state_dict.update(model_state_dict) return state_dict From 6757ff9d436baa20284b411648b6029b3b377e2c Mon Sep 17 00:00:00 2001 From: Siming Dai <908660116@qq.com> Date: Mon, 3 Jun 2024 20:13:07 +0800 Subject: [PATCH 22/27] [LLM] relocate tensor_parallel_output to avoid conflict (#8419) (#8533) Co-authored-by: Tian <121000916+SylarTiaNII@users.noreply.github.com> --- llm/finetune_generation.py | 5 +++++ llm/utils.py | 2 +- paddlenlp/trainer/training_args.py | 4 ---- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/llm/finetune_generation.py b/llm/finetune_generation.py index c8fed17165af..6e4123b02df2 100644 --- a/llm/finetune_generation.py +++ b/llm/finetune_generation.py @@ -16,6 +16,7 @@ import sys from dataclasses import dataclass, field from functools import partial +from typing import Optional import paddle from argument import ( @@ -66,6 +67,10 @@ class FinetuneArguments(TrainingArguments): default=0, metadata={"help": "The steps use to control the learing rate."}, ) + tensor_parallel_output: Optional[bool] = field( + default=False, + metadata={"help": "whether to output logits in distributed status"}, + ) def read_local_dataset(path): diff --git a/llm/utils.py b/llm/utils.py index 6688357bd67b..3075943877df 100644 --- a/llm/utils.py +++ b/llm/utils.py @@ -212,7 +212,7 @@ def prediction_step( if isinstance(logits, (list, tuple)): logits = logits[0] # all gather logits when enabling tensor_parallel_output - if self.args.tensor_parallel_degree > 1 and self.args.tensor_parallel_output: 
+ if self.args.tensor_parallel_degree > 1 and getattr(self.args, "tensor_parallel_output", False): hcg = fleet.get_hybrid_communicate_group() model_parallel_group = hcg.get_model_parallel_group() gathered_logits = [] diff --git a/paddlenlp/trainer/training_args.py b/paddlenlp/trainer/training_args.py index f825c308ebb8..423d77d6f510 100644 --- a/paddlenlp/trainer/training_args.py +++ b/paddlenlp/trainer/training_args.py @@ -787,10 +787,6 @@ class TrainingArguments: default=False, metadata={"help": "whether to run distributed training in auto parallel mode"}, ) - tensor_parallel_output: Optional[bool] = field( - default=False, - metadata={"help": "whether to output logits in distributed status"}, - ) use_expert_parallel: Optional[bool] = field( default=False, metadata={"help": "Enable MoE (Mixture of Experts) expert parallel training"}, From 7c8d713de8475c807f53818eafe4c160e4fab1f0 Mon Sep 17 00:00:00 2001 From: Siming Dai <908660116@qq.com> Date: Wed, 5 Jun 2024 17:16:26 +0800 Subject: [PATCH 23/27] Update sequence_parallel for predict (#8547) --- paddlenlp/trainer/trainer.py | 6 +++++- paddlenlp/transformers/linear_utils.py | 14 +++++++++++--- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py index 116c3451f95f..dfc47354c493 100644 --- a/paddlenlp/trainer/trainer.py +++ b/paddlenlp/trainer/trainer.py @@ -40,7 +40,11 @@ import paddle.nn as nn from packaging import version from paddle import framework -from paddle.base import core + +try: + from paddle.base import core +except: + core = None from paddle.distributed import fleet from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.hybrid_parallel_optimizer import ( HybridParallelOptimizer, diff --git a/paddlenlp/transformers/linear_utils.py b/paddlenlp/transformers/linear_utils.py index de1a0f886b79..469e7c45985e 100644 --- a/paddlenlp/transformers/linear_utils.py +++ b/paddlenlp/transformers/linear_utils.py @@ -18,7 +18,11 @@ import paddle.distributed.fleet.meta_parallel as mpu from paddle import nn -from paddle.distributed.fleet.utils import sequence_parallel_utils + +try: + from paddle.distributed.fleet.utils import sequence_parallel_utils +except: + sequence_parallel_utils = None from paddlenlp.transformers.mc2_parallel_linear import ( MC2ColumnSeqParallelLinear, @@ -29,8 +33,12 @@ Linear = nn.Linear ColumnParallelLinear = mpu.ColumnParallelLinear RowParallelLinear = mpu.RowParallelLinear -ColumnSequenceParallelLinear = sequence_parallel_utils.ColumnSequenceParallelLinear -RowSequenceParallelLinear = sequence_parallel_utils.RowSequenceParallelLinear +try: + ColumnSequenceParallelLinear = sequence_parallel_utils.ColumnSequenceParallelLinear + RowSequenceParallelLinear = sequence_parallel_utils.RowSequenceParallelLinear +except: + ColumnSequenceParallelLinear = None + RowSequenceParallelLinear = None if get_env_device() == "npu": if MC2ColumnSeqParallelLinear is not None and MC2RowSeqParallelLinear is not None: From c628f129483384cf87a8d219bb5728490ae638bd Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Fri, 7 Jun 2024 17:55:54 +0800 Subject: [PATCH 24/27] Cp/fix (#8569) * [Safetensors] Fix fast safe open slice. 
(#8512) * [FIX DDP] fix ddp (#8549) --- paddlenlp/trainer/trainer.py | 12 +- paddlenlp/trainer/training_args.py | 2 +- paddlenlp/utils/safetensors.py | 12 +- pyproject.toml | 4 +- tests/trainer/test_lora_unified_checkpoint.py | 47 ++++---- tests/trainer/test_unified_checkpoint.py | 109 ++++++++++-------- tests/transformers/test_safetensors.py | 14 ++- 7 files changed, 107 insertions(+), 93 deletions(-) diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py index dfc47354c493..b42e596e97e4 100644 --- a/paddlenlp/trainer/trainer.py +++ b/paddlenlp/trainer/trainer.py @@ -1771,16 +1771,8 @@ def _wrap_model(self, model, training=True): in_sep_parallel_mode = self.args.sep_parallel_degree > 1 # Multi-gpu training - if ( - self.args.world_size > 1 - and not self.args.use_hybrid_parallel - or not ( - in_pipeline_parallel_mode - or in_sharding_parallel_mode - or in_tensor_parallel_mode - or in_sep_parallel_mode - ) - ): + if self.args.world_size > 1 and (not self.args.use_hybrid_parallel): + # MOE use DDP to broadcaset parameters. model = paddle.DataParallel(model) # Distributed training (should be after fp16 initialization) diff --git a/paddlenlp/trainer/training_args.py b/paddlenlp/trainer/training_args.py index 423d77d6f510..b31e55d7b4f0 100644 --- a/paddlenlp/trainer/training_args.py +++ b/paddlenlp/trainer/training_args.py @@ -1406,7 +1406,7 @@ def is_segment_parallel_supported(): if world_size > 1: if not paddle.distributed.parallel.parallel_helper._is_parallel_ctx_initialized(): if self.unified_checkpoint: - self.use_hybrid_parallel = True + # DP use hybrid group strategy = fleet.DistributedStrategy() fleet.init(is_collective=True, strategy=strategy) else: diff --git a/paddlenlp/utils/safetensors.py b/paddlenlp/utils/safetensors.py index 422a7d09961c..c273d0d973c2 100644 --- a/paddlenlp/utils/safetensors.py +++ b/paddlenlp/utils/safetensors.py @@ -157,16 +157,16 @@ def __getitem__(self, index): out_start, out_stop, out_step = copy.deepcopy((self.start, self.stop, self.step)) for i, (start, stop, step, slice_) in enumerate(zip(self.start, self.stop, self.step, index)): - out_start[i] = slice_.start or 0 - out_step[i] = slice_.step or 1 - out_stop[i] = slice_.stop or stop - start + out_start[i] = slice_.start if slice_.start is not None else 0 + out_step[i] = slice_.step if slice_.step is not None else 1 + out_stop[i] = slice_.stop if slice_.stop is not None else stop - start out_stop[i] = min(stop, out_stop[i]) target_shape = [] - for x, y, z in zip(out_start, out_stop, out_step): + for x, y, z, sli in zip(out_start, out_stop, out_step, index): assert z == 1, "only support step = 1" - if y - x > 1: - target_shape.append(int(y - x)) + if y - x > 1 or sli.step is None: + target_shape.append(max(int(y - x), 0)) if len(target_shape) == 0: if self.shape == [1]: diff --git a/pyproject.toml b/pyproject.toml index 715323d09e37..858508037fce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ exclude = ['.flake8'] [tool.pytest.ini_options] minversion = "6.0" -addopts = "-ra -q --ignore model_zoo/gpt-3/" +addopts = "-ra -q --dist loadgroup" pythonpath = ["."] testpaths = [ "tests/data", @@ -28,7 +28,7 @@ testpaths = [ "tests/prompt", # "tests/taskflow", TODO (paddle 2.5.1 breaks this test suite, debug later) "tests/utils", - "model_zoo", + # "model_zoo", ] python_files = [ "test.py", diff --git a/tests/trainer/test_lora_unified_checkpoint.py b/tests/trainer/test_lora_unified_checkpoint.py index 98d5516d2388..0abfc257d4f7 100644 --- 
a/tests/trainer/test_lora_unified_checkpoint.py +++ b/tests/trainer/test_lora_unified_checkpoint.py @@ -149,7 +149,7 @@ def __test__(cls): def setUp(self): """ - 1. update runfrist and rerun to run defined different config + 1. update runfirst and rerun to run defined different config 2. update need_allclose to True if you want to check the result 3. update rtol to the relative value you want to check """ @@ -169,7 +169,7 @@ def setUp(self): self.run_lora_file = "llm/finetune_generation.py" - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n1c8(self.run_lora_file, **train_args) def rerun(self, train_args): @@ -181,7 +181,7 @@ def testTP4PP2(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["TP4PP2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -196,7 +196,7 @@ def testTP2Sharding4(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["TP2Sharding4"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -213,7 +213,7 @@ def testTP8(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["TP8"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -227,7 +227,7 @@ def testTP4DP2(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["TP4DP2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -242,7 +242,7 @@ def testTP4Sharding2(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["TP4Sharding2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -257,7 +257,7 @@ def testTP2PP4(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["TP2PP4"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -272,7 +272,7 @@ def testPP8(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["PP8"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -287,7 +287,7 @@ def testPP4DP2(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["PP4DP2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -302,7 +302,7 @@ def testPP4Sharding2(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["PP4Sharding2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -317,7 +317,7 @@ def testSharding8S1(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["Sharding8S1"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -332,7 +332,7 @@ def testSharding8S2(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["Sharding8S2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -347,7 +347,7 @@ def testSharding4S1DP2(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["Sharding4S1DP2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -362,7 +362,7 @@ def testSharding4S2DP2(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["Sharding4S2DP2"] - self.runfrist(train_args) + 
self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -377,7 +377,7 @@ def testSharding2S1DP4(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["Sharding2S1DP4"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -392,7 +392,7 @@ def testSharding2S2DP4(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["Sharding2S2DP4"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -407,7 +407,7 @@ def testDP8(self): remove_ckpt(lora_arguments["output_dir"]) train_args = self.configs["DP8"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -416,19 +416,21 @@ def testDP8(self): np.testing.assert_allclose(res[0], res[1], self.rtol) +@pytest.mark.skipif(True, reason="Skip for None CE") class TestUnifiedCheckpointOnN2C4(TestUnifiedCheckpointBase): def setUp(self): super().setUp() self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n2c4(self.run_lora_file, **train_args) def rerun(self, train_args): self.run_n2c4(self.run_lora_file, **train_args) +@pytest.mark.skipif(True, reason="Skip for None CE") class TestUnifiedCheckpointOnN1C8CheckpointCompatible(TestUnifiedCheckpointBase): def setUp(self): super().setUp() @@ -436,7 +438,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["unified_checkpoint"] = 0 self.run_n1c8(self.run_lora_file, **train_args) @@ -445,6 +447,7 @@ def rerun(self, train_args): self.run_n1c8(self.run_lora_file, **train_args) +@pytest.mark.skipif(True, reason="Skip for None CE") class TestPaddleCheckpointOnN1C8Reset(TestUnifiedCheckpointBase): def setUp(self): super().setUp() @@ -452,7 +455,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["unified_checkpoint"] = 0 self.run_n1c8(self.run_lora_file, **train_args) @@ -469,7 +472,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["unified_checkpoint"] = 0 self.run_n2c4(self.run_lora_file, **train_args) diff --git a/tests/trainer/test_unified_checkpoint.py b/tests/trainer/test_unified_checkpoint.py index f8cc0ed7bfac..5ce99b36ff19 100644 --- a/tests/trainer/test_unified_checkpoint.py +++ b/tests/trainer/test_unified_checkpoint.py @@ -175,7 +175,7 @@ def __test__(cls): def setUp(self): """ - 1. update runfrist and rerun to run defined diffrent config + 1. update runfirst and rerun to run defined diffrent config 2. update need_allclose to True if you want to check the result 3. 
update rtol to the relative value you want to check """ @@ -194,7 +194,7 @@ def setUp(self): self.run_pretrain_file = "llm/llama/run_pretrain.py" - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) def rerun(self, train_args): @@ -206,7 +206,7 @@ def testTP4PP2(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["TP4PP2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -221,7 +221,7 @@ def testTP2Sharding4(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["TP2Sharding4"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -238,7 +238,7 @@ def testTP8(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["TP8"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -252,7 +252,7 @@ def testTP4DP2(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["TP4DP2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -267,7 +267,7 @@ def testTP4Sharding2(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["TP4Sharding2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -282,7 +282,7 @@ def testTP2PP4(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["TP2PP4"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -297,7 +297,7 @@ def testPP8(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["PP8"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -312,7 +312,7 @@ def testPP4DP2(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["PP4DP2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -327,7 +327,7 @@ def testPP4Sharding2(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["PP4Sharding2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -342,7 +342,7 @@ def testSharding8S1(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["Sharding8S1"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -357,7 +357,7 @@ def testSharding8S2(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["Sharding8S2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -372,7 +372,7 @@ def testSharding4S1DP2(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["Sharding4S1DP2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -387,7 +387,7 @@ def testSharding4S2DP2(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["Sharding4S2DP2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -402,7 +402,7 @@ def testSharding2S1DP4(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["Sharding2S1DP4"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ 
-417,7 +417,7 @@ def testSharding2S2DP4(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["Sharding2S2DP4"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -432,7 +432,7 @@ def testDP8(self): remove_ckpt(pretrain_arguments["output_dir"]) train_args = self.configs["DP8"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -441,13 +441,14 @@ def testDP8(self): np.testing.assert_allclose(res[0], res[1], self.rtol) +@pytest.mark.skipif(True, reason="Skip for None CE") class TestUnifiedCheckpointOnN2C4(TestUnifiedCheckpointBase): def setUp(self): super().setUp() self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n2c4(self.run_pretrain_file, **train_args) def rerun(self, train_args): @@ -463,7 +464,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) def rerun(self, train_args): @@ -485,7 +486,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n2c4(self.run_pretrain_file, **train_args) def rerun(self, train_args): @@ -507,7 +508,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) move_checkpoint_N1C8_to_N2C4() @@ -529,7 +530,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n2c4(self.run_pretrain_file, **train_args) move_checkpoint_N2C4_to_N1C8() @@ -557,7 +558,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) def rerun(self, train_args): @@ -576,7 +577,7 @@ def setUp(self): self.need_allclose = False - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["fp16_opt_level"] = "O1" self.run_n1c8(self.run_pretrain_file, **train_args) @@ -585,6 +586,7 @@ def rerun(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) +@pytest.mark.skipif(True, reason="Skip for None CE") class TestUnifiedCheckpointOnN1C8MasterWeightCompatibleO2ToO1(TestUnifiedCheckpointBase): def setUp(self): super().setUp() @@ -596,7 +598,7 @@ def setUp(self): self.need_allclose = False - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["fp16_opt_level"] = "O2" self.run_n1c8(self.run_pretrain_file, **train_args) @@ -605,6 +607,7 @@ def rerun(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) +@pytest.mark.skipif(True, reason="Skip for None CE") class TestUnifiedCheckpointOnN1C8CheckpointCompatible(TestUnifiedCheckpointBase): def setUp(self): super().setUp() @@ -612,7 +615,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["unified_checkpoint"] = 0 self.run_n1c8(self.run_pretrain_file, **train_args) @@ -621,6 +624,7 @@ def rerun(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) +@pytest.mark.skipif(True, reason="Skip for None CE") class 
TestPaddleCheckpointOnN1C8Reset(TestUnifiedCheckpointBase): def setUp(self): super().setUp() @@ -628,7 +632,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["unified_checkpoint"] = 0 self.run_n1c8(self.run_pretrain_file, **train_args) @@ -637,6 +641,7 @@ def rerun(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) +@pytest.mark.skipif(True, reason="Skip for None CE") class TestPaddleCheckpointOnN1C2Reset(TestMultipleGpus): def setUp(self): self.configs = get_pretrain_arguments(pretrain_arguments) @@ -653,7 +658,7 @@ def setUp(self): self.run_pretrain_file = "llm/llama/run_pretrain.py" - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["unified_checkpoint"] = 0 self.run_n1c2(self.run_pretrain_file, **train_args) @@ -669,7 +674,7 @@ def testTP2(self): train_args = self.configs["TP2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -678,6 +683,7 @@ def testTP2(self): np.testing.assert_allclose(res[0], res[1], self.rtol) +@pytest.mark.skipif(True, reason="Skip for None CE") class TestUnifiedCheckpointOnN1C2Reset(TestMultipleGpus): def setUp(self): self.configs = get_pretrain_arguments(pretrain_arguments) @@ -714,7 +720,7 @@ def setUp(self): "training_args.bin", ] - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["unified_checkpoint"] = 1 self.run_n1c2(self.run_pretrain_file, **train_args) @@ -730,7 +736,7 @@ def testTP2(self): train_args = self.configs["TP2"] - self.runfrist(train_args) + self.runfirst(train_args) self.rerun(train_args) if self.need_allclose: @@ -748,7 +754,7 @@ def testFileLists(self): base_ckpt_path = os.path.join(pretrain_arguments["output_dir"], "checkpoint-%d" % save_steps) train_args = self.configs["TP2"] - self.runfrist(train_args) + self.runfirst(train_args) assert sorted(self.filelists) == sorted(os.listdir(base_ckpt_path)) self.rerun(train_args) @@ -761,7 +767,7 @@ def testFileLists(self): remove_logs() remove_ckpt(pretrain_arguments["output_dir"]) train_args["unified_checkpoint_config"] = "skip_save_model_weight" - self.runfrist(train_args) + self.runfirst(train_args) unsave_filelists = [ "master_weights-00001-of-00002.safetensors", "master_weights-00002-of-00002.safetensors", @@ -788,7 +794,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) def rerun(self, train_args): @@ -809,7 +815,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n2c4(self.run_pretrain_file, **train_args) def rerun(self, train_args): @@ -828,7 +834,7 @@ def setUp(self): self.need_allclose = False - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["fp16_opt_level"] = "O1" self.run_n2c4(self.run_pretrain_file, **train_args) @@ -849,7 +855,7 @@ def setUp(self): self.need_allclose = False - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["fp16_opt_level"] = "O2" self.run_n2c4(self.run_pretrain_file, **train_args) @@ -866,7 +872,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["unified_checkpoint"] = 0 self.run_n2c4(self.run_pretrain_file, **train_args) @@ -886,7 +892,7 @@ def setUp(self): 
self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n2c4(self.run_pretrain_file, **train_args) def rerun(self, train_args): @@ -909,7 +915,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) move_checkpoint_N1C8_to_N2C4() @@ -937,7 +943,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["fp16_opt_level"] = "O1" self.run_n1c8(self.run_pretrain_file, **train_args) move_checkpoint_N1C8_to_N2C4() @@ -967,7 +973,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["fp16_opt_level"] = "O2" self.run_n1c8(self.run_pretrain_file, **train_args) move_checkpoint_N1C8_to_N2C4() @@ -995,7 +1001,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) move_checkpoint_N1C8_to_N2C4() @@ -1023,7 +1029,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n2c4(self.run_pretrain_file, **train_args) move_checkpoint_N2C4_to_N1C8() @@ -1051,7 +1057,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["fp16_opt_level"] = "O1" self.run_n2c4(self.run_pretrain_file, **train_args) move_checkpoint_N2C4_to_N1C8() @@ -1081,7 +1087,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): train_args["fp16_opt_level"] = "O2" self.run_n2c4(self.run_pretrain_file, **train_args) move_checkpoint_N2C4_to_N1C8() @@ -1109,7 +1115,7 @@ def setUp(self): self.rtol = 1e-4 self.k = MAX_CONVERT_CONFIGS # max: 16, min: 1 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n2c4(self.run_pretrain_file, **train_args) move_checkpoint_N2C4_to_N1C8() @@ -1123,6 +1129,7 @@ def rerun(self, train_args): np.testing.assert_allclose(res[0], res[-1], rtol=self.rtol) +@pytest.mark.skipif(True, reason="Skip for None CE") class TestUnifiedCheckpointOnN1C8EnableAll(TestUnifiedCheckpointBase): def setUp(self): super().setUp() @@ -1133,7 +1140,7 @@ def setUp(self): self.need_allclose = True self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n1c8(self.run_pretrain_file, **train_args) def rerun(self, train_args): @@ -1153,7 +1160,7 @@ def setUp(self): self.need_allclose = False self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n1c8(self.run_pretrain_file, log_dir="log_uc", **train_args) def rerun(self, train_args): @@ -1172,7 +1179,7 @@ def setUp(self): self.need_allclose = False self.rtol = 1e-7 - def runfrist(self, train_args): + def runfirst(self, train_args): self.run_n1c8(self.run_pretrain_file, log_dir="log_pd", **train_args) def rerun(self, train_args): diff --git a/tests/transformers/test_safetensors.py b/tests/transformers/test_safetensors.py index 3c143e26a0b5..85b291e42349 100644 --- a/tests/transformers/test_safetensors.py +++ 
b/tests/transformers/test_safetensors.py @@ -28,7 +28,14 @@ class FastSafetensors(unittest.TestCase): def setUp(self): super().setUp() self.weigth_map = {} - tensors = [([10, 10], "float32"), ([8], "float16"), ([5, 5, 5], "int32")] + tensors = [ + ([10, 1, 10], "float32"), + ([1, 1, 10], "float32"), + ([1, 1, 1, 10], "float32"), + ([10, 10], "float32"), + ([8], "float16"), + ([5, 5, 5], "int32"), + ] count = 0 for shape, dtype in tensors: self.weigth_map[f"weight_{count}"] = (np.random.random(shape) * 100).astype(dtype) @@ -53,5 +60,10 @@ def test_safe_open(self): with fast_safe_open(path, framework="np") as f: for key in f.keys(): safe_slice = f.get_slice(key) + # np.testing.assert_equal(self.weigth_map[key][2:1, ...], safe_slice[2:1, ...]) + np.testing.assert_equal(self.weigth_map[key][0, ...], safe_slice[0, ...]) + np.testing.assert_equal(self.weigth_map[key][0:1, ...], safe_slice[0:1, ...]) + np.testing.assert_equal(self.weigth_map[key][..., 2:], safe_slice[..., 2:]) + np.testing.assert_equal(self.weigth_map[key][..., 1], safe_slice[..., 1]) np.testing.assert_equal(self.weigth_map[key][:2, ...], safe_slice[:2, ...]) np.testing.assert_equal(self.weigth_map[key][..., :4], safe_slice[..., :4]) From 5b027c8ae5260342fd58ced5162ebafa4766cb40 Mon Sep 17 00:00:00 2001 From: Siming Dai <908660116@qq.com> Date: Fri, 7 Jun 2024 18:00:11 +0800 Subject: [PATCH 25/27] Don't save moe_group (#8570) --- paddlenlp/transformers/configuration_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddlenlp/transformers/configuration_utils.py b/paddlenlp/transformers/configuration_utils.py index 4bda24695a48..b3e255f30535 100644 --- a/paddlenlp/transformers/configuration_utils.py +++ b/paddlenlp/transformers/configuration_utils.py @@ -903,6 +903,8 @@ def to_dict(self) -> Dict[str, Any]: output["model_type"] = self.__class__.model_type if "_auto_class" in output: del output["_auto_class"] + if "moe_group" in output: + del output["moe_group"] output["quantization_config"] = self.quantization_config.to_dict() From db99efd4dc99047922aae9842be66ab4538f93bf Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Thu, 20 Jun 2024 15:41:29 +0800 Subject: [PATCH 26/27] release 2.8.1 (#8636) --- paddlenlp/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlenlp/__init__.py b/paddlenlp/__init__.py index e3cd7e1c5f75..9e3fef0146bb 100644 --- a/paddlenlp/__init__.py +++ b/paddlenlp/__init__.py @@ -18,7 +18,7 @@ PADDLENLP_STABLE_VERSION = "PADDLENLP_STABLE_VERSION" -__version__ = "2.8.0.post" +__version__ = "2.8.1.post" if os.getenv(PADDLENLP_STABLE_VERSION): __version__ = __version__.replace(".post", "") diff --git a/setup.py b/setup.py index 0723cfc28cb4..372d86776293 100644 --- a/setup.py +++ b/setup.py @@ -109,7 +109,7 @@ def show(): f.write(content) -__version__ = "2.8.0.post" +__version__ = "2.8.1.post" if os.getenv(PADDLENLP_STABLE_VERSION): __version__ = __version__.replace(".post", "") From ad271a648b0da4049f7d3f720f5f4b4244d7d333 Mon Sep 17 00:00:00 2001 From: Siming Dai <908660116@qq.com> Date: Wed, 3 Jul 2024 11:03:03 +0800 Subject: [PATCH 27/27] [Safetensors] Fix safetensors shape (#8702) * Update sequence_parallel for predict * Do not save moe_group * Fix safetensors reading --- paddlenlp/utils/safetensors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlenlp/utils/safetensors.py b/paddlenlp/utils/safetensors.py index c273d0d973c2..54256023db7d 100644 --- a/paddlenlp/utils/safetensors.py +++ b/paddlenlp/utils/safetensors.py @@ 
-177,7 +177,7 @@ def __getitem__(self, index): span = self.bits for i, (start, stop, step) in enumerate(zip(out_start[::-1], out_stop[::-1], out_step[::-1])): if len(indices) == 0: - if start == 0 and stop == self.shape[i]: + if start == 0 and stop == self.shape[::-1][i]: pass # We haven't started to slice yet, just increase the span else: @@ -194,7 +194,7 @@ def __getitem__(self, index): newindices.append((old_start + offset, old_stop + offset)) indices = newindices assert len(indices) == capacity, f"error {capacity} {len(indices)}" - span *= self.shape[-(i + 1)] + span *= self.shape[::-1][i] if len(indices) == 0: indices.append((0, self.nbytes))
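The two hunks above are the core of the shape fix: `__getitem__` walks `start`/`stop`/`step` from the innermost dimension outwards (they are all iterated with `[::-1]`), so the early-exit check has to compare `stop` against the dimension size taken in that same reversed order, `self.shape[::-1][i]`, rather than the forward `self.shape[i]`. The second hunk rewrites the effectively equivalent `self.shape[-(i + 1)]` into the same `[::-1][i]` form for consistency. The reversed order matters because, for a C-contiguous (row-major) buffer, the byte span covered by one index step grows from the innermost dimension outwards. A minimal sketch of that invariant, assuming plain NumPy; `trailing_spans` is an illustrative helper written for this note, not a function in paddlenlp.utils.safetensors:

    import numpy as np

    def trailing_spans(shape, itemsize):
        # Byte span covered by one index step in each dimension of a
        # C-contiguous (row-major) tensor, accumulated from the innermost
        # dimension outwards -- the same back-to-front walk the patched
        # __getitem__ performs.
        spans = []
        span = itemsize
        for dim in shape[::-1]:   # the shape must be reversed to match the loop order
            spans.append(span)
            span *= dim           # i.e. shape[::-1][i], not the forward shape[i]
        return spans[::-1]

    x = np.zeros((2, 3, 4), dtype=np.float32)   # itemsize is 4 bytes
    assert trailing_spans(x.shape, x.itemsize) == list(x.strides)   # [48, 16, 4]

Multiplying by the forward shape in that accumulation would give [24, 8, 4] for this example, and the full-dimension check would compare against the wrong axis whenever the reversed shape differs from the forward one; that mismatch appears to be what the earlier test additions (shapes such as (1, 1, 10) and (1, 1, 1, 10)) are intended to catch.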