From a57e222722bc2559f32d34bfa1f126520adaff37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E6=B1=80?= Date: Tue, 24 Oct 2023 11:47:37 +0800 Subject: [PATCH 01/24] mistral model init commit --- paddlenlp/transformers/mistral/__init__.py | 13 + .../transformers/mistral/configuration.py | 147 ++ paddlenlp/transformers/mistral/modeling.py | 1244 +++++++++++++++++ 3 files changed, 1404 insertions(+) create mode 100644 paddlenlp/transformers/mistral/__init__.py create mode 100644 paddlenlp/transformers/mistral/configuration.py create mode 100644 paddlenlp/transformers/mistral/modeling.py diff --git a/paddlenlp/transformers/mistral/__init__.py b/paddlenlp/transformers/mistral/__init__.py new file mode 100644 index 000000000000..595add0aed9e --- /dev/null +++ b/paddlenlp/transformers/mistral/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlenlp/transformers/mistral/configuration.py b/paddlenlp/transformers/mistral/configuration.py new file mode 100644 index 000000000000..7cab5dc66ca9 --- /dev/null +++ b/paddlenlp/transformers/mistral/configuration.py @@ -0,0 +1,147 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Mistral model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + +logger = logging.get_logger(__name__) + +MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "mistralai/Mistral-7B-v0.1": "https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json", + "mistralai/Mistral-7B-Instruct-v0.1": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/config.json", +} + + +class MistralConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate an + Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the Mistral-7B-v0.1 or Mistral-7B-Instruct-v0.1. + + [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) + [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. 
+ + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the Mistral model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`MistralModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 14336): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 8): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to `4096*32`): + The maximum sequence length that this model might ever be used with. Mistral's sliding window attention + allows sequence of up to 4096*32 tokens. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + The id of the padding token. + bos_token_id (`int`, *optional*, defaults to 1): + The id of the "beginning-of-sequence" token. + eos_token_id (`int`, *optional*, defaults to 2): + The id of the "end-of-sequence" token. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + sliding_window (`int`, *optional*, defaults to 4096): + Sliding window attention window size. If not specified, will default to `4096`. 
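A quick way to see what the `num_key_value_heads` setting above means in practice: the sketch below is illustrative only (the helper `kv_head_for` is not part of this patch) and maps each query head to the key/value head it shares, covering the MHA, GQA and MQA cases described for that argument. The standard configuration example follows.

```python
def kv_head_for(q_head: int, num_attention_heads: int, num_key_value_heads: int) -> int:
    """Illustrative helper: which key/value head a given query head shares under GQA."""
    group_size = num_attention_heads // num_key_value_heads
    return q_head // group_size

# MHA: num_key_value_heads == num_attention_heads, every query head has its own KV head.
assert kv_head_for(5, 32, 32) == 5
# GQA (Mistral-7B default): 32 query heads share 8 KV heads, in groups of 4.
assert kv_head_for(5, 32, 8) == 1
# MQA: num_key_value_heads == 1, all query heads share a single KV head.
assert kv_head_for(5, 32, 1) == 0
```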
+ + + ```python + >>> from transformers import MistralModel, MistralConfig + + >>> # Initializing a Mistral 7B style configuration + >>> configuration = MistralConfig() + + >>> # Initializing a model from the Mistral 7B style configuration + >>> model = MistralModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "mistral" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=32000, + hidden_size=4096, + intermediate_size=14336, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=8, + hidden_act="silu", + max_position_embeddings=4096 * 32, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + rope_theta=10000.0, + sliding_window=4096, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.sliding_window = sliding_window + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/paddlenlp/transformers/mistral/modeling.py b/paddlenlp/transformers/mistral/modeling.py new file mode 100644 index 000000000000..d2ce294942c0 --- /dev/null +++ b/paddlenlp/transformers/mistral/modeling.py @@ -0,0 +1,1244 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import inspect +import math +from functools import partial +from typing import List, Optional, Tuple, Union + +import paddle +import paddle.nn.functional as F +from paddle import nn +from paddle.distributed.fleet.utils import recompute +from paddle.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from paddlenlp.transformers.conversion_utils import ( + StateDictNameMapping, + init_name_mappings, +) +from paddlenlp.utils.log import logger + +from ..activations import ACT2FN +from ..modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, +) +from ..modeling_utils import PretrainedModel + +# from ...utils import ( +# add_start_docstrings, +# add_start_docstrings_to_model_forward, +# is_flash_attn_2_available, +# logging, +# replace_return_docstrings, +# ) +from .configuration import MistralConfig + + +def is_flash_attn_2_available(): + return False + + +if is_flash_attn_2_available(): + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) + +_CONFIG_FOR_DOC = "MistralConfig" + + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(padding_mask): + seqlens_in_batch = padding_mask.sum(dim=-1, dtype=paddle.int32) + indices = paddle.nonzero(padding_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(paddle.cumsum(seqlens_in_batch, dim=0, dtype=paddle.paddle.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +def _make_sliding_window_causal_mask( + input_ids_shape: paddle.Size, + dtype: paddle.dtype, + device: paddle.device, + past_key_values_length: int = 0, + sliding_window: int = 4096, +): + """ + Make causal mask used for sliding window attention + """ + bsz, tgt_len = input_ids_shape + + tensor = paddle.full( + (tgt_len, tgt_len), + fill_value=1, + device=device, + ) + mask = paddle.tril(tensor, diagonal=0) + # make the mask banded to account for sliding window + mask = paddle.triu(mask, diagonal=-sliding_window) + mask = paddle.log(mask).to(dtype) + + if past_key_values_length > 0: + mask = paddle.cat([paddle.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) + return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) + + +# Copied from transformers.models.bart.modeling_bart._expand_mask +def _expand_mask(mask: paddle.Tensor, dtype: paddle.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
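To make the banding in `_make_sliding_window_causal_mask` above concrete, here is a small standalone sketch (toy sizes, not part of the patch) that reproduces the tril/triu/log construction: after `paddle.log`, positions inside the window become 0.0 and everything else becomes `-inf`, which is what gets added to the attention scores.

```python
import paddle

# Toy sliding-window causal mask: 6 tokens, window of 3 previous tokens.
tgt_len, sliding_window = 6, 3

band = paddle.tril(paddle.full([tgt_len, tgt_len], 1.0), diagonal=0)  # causal lower triangle
band = paddle.triu(band, diagonal=-sliding_window)                    # drop anything older than the window
mask = paddle.log(band)                                               # 1 -> 0.0, 0 -> -inf

# Row i is 0.0 for columns max(0, i - sliding_window) .. i and -inf elsewhere,
# so token i attends to itself and at most `sliding_window` earlier tokens.
print(mask.numpy())
```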
+ """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.to(paddle.bool), paddle.finfo(dtype).min) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral +class MistralRMSNorm(nn.Layer): + def __init__(self, hidden_size, eps=1e-6): + """ + MistralRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(paddle.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(paddle.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * paddle.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral +class MistralRotaryEmbedding(nn.Layer): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (paddle.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `paddle.jit.trace` work. + self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=paddle.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = paddle.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = paddle.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = paddle.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return paddle.cat((-x2, x1), dim=-1) + + +# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids): + cos = cos[position_ids].unsqueeze(1) # [seq_len, dim] -> [batch_size, 1, seq_len, head_dim] + sin = sin[position_ids].unsqueeze(1) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class MistralMLP(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, 
self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) -> paddle.Tensor: + """ + This is the equivalent of paddle.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class MistralAttention(nn.Layer): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". + """ + + def __init__(self, config: MistralConfig): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + + self.rotary_emb = MistralRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int): + return tensor.reshape(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.LongTensor] = None, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + padding_mask: Optional[paddle.Tensor] = None, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.reshape(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.reshape(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.reshape(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not 
None: + # reuse k, v, self_attention + key_states = paddle.cat([past_key_value[0], key_states], dim=2) + value_states = paddle.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = paddle.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=paddle.float32).to(query_states.dtype) + attn_output = paddle.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class MistralFlashAttention2(MistralAttention): + """ + Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.LongTensor] = None, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + padding_mask: Optional[paddle.LongTensor] = None, + ): + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.reshape(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.reshape(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.reshape(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + + # Because the input can be padded, the absolute sequence length depends on the max position id. 
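Both attention paths rotate queries and keys with `apply_rotary_pos_emb` before any caching. The standalone sketch below (illustrative, toy sizes, not from the patch) builds the cos/sin tables the same way `MistralRotaryEmbedding` does and applies the `rotate_half` formula to a single sequence, checking that position 0 is left unchanged. The flash-attention path then continues below by deriving the rotary sequence length from the maximum position id.

```python
import paddle

def rotate_half(x):
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return paddle.concat((-x2, x1), axis=-1)

head_dim, seq_len, base = 8, 4, 10000.0
inv_freq = 1.0 / (base ** (paddle.arange(0, head_dim, 2).astype("float32") / head_dim))
t = paddle.arange(seq_len, dtype="float32")
freqs = paddle.einsum("i,j->ij", t, inv_freq)
emb = paddle.concat((freqs, freqs), axis=-1)   # [seq_len, head_dim]
cos, sin = emb.cos(), emb.sin()

q = paddle.randn([seq_len, head_dim])
q_rot = q * cos + rotate_half(q) * sin         # same formula as apply_rotary_pos_emb

# Position 0 has cos = 1 and sin = 0, so the first token is unchanged by the rotation.
assert paddle.allclose(q_rot[0], q[0])
```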
+ rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 + cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + use_sliding_windows = ( + _flash_supports_window_size + and hasattr(self.config, "sliding_window") is not None + and kv_seq_len > self.config.sliding_window + ) + + if not _flash_supports_window_size: + logger.warning_once( + "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" + " make sure to upgrade flash-attn library." + ) + + if past_key_value is not None: + # Activate slicing cache only if the config has a value `sliding_windows` attribute + if hasattr(self.config, "sliding_window") and kv_seq_len > self.config.sliding_window: + slicing_tokens = kv_seq_len - self.config.sliding_window + + past_key = past_key_value[0] + past_value = past_key_value[1] + + past_key = past_key[:, :, slicing_tokens:, :].contiguous() + past_value = past_value[:, :, slicing_tokens:, :].contiguous() + + if past_key.shape[-2] != self.config.sliding_window - 1: + raise ValueError( + f"past key much have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" + ) + + past_key_value = (past_key, past_value) + + if padding_mask is not None: + padding_mask = padding_mask[:, slicing_tokens:] + padding_mask = paddle.cat([padding_mask, paddle.ones_like(padding_mask[:, -1:])], dim=-1) + + key_states = paddle.cat([past_key_value[0], key_states], dim=2) + value_states = paddle.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + # TODO: Mistral does not have dropout in the config?? + # It is recommended to use dropout with FA according to the docs + # when training. + dropout_rate = 0.0 # if not self.training else self.attn_dropout + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in float16 just to be sure everything works as expected. + input_dtype = query_states.dtype + if input_dtype == paddle.float32: + # Handle the case where the model is quantized + if hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." 
+ ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + padding_mask, + q_len, + dropout=dropout_rate, + use_sliding_windows=use_sliding_windows, + ) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + padding_mask, + query_length, + dropout=0.0, + softmax_scale=None, + use_sliding_windows=False, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`paddle.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`paddle.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`paddle.Tensor`): + Input value states to be passed to Flash Attention API + padding_mask (`paddle.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`int`, *optional*): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + use_sliding_windows (`bool`, *optional*): + Whether to activate sliding window attention. 
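The unpadding path described above feeds flash attention with flattened tokens plus variable-length metadata. The toy sketch below is illustrative only; it mirrors `_get_unpad_data` defined earlier rather than calling flash-attn, and shows what that metadata looks like for a small left-padded batch.

```python
import paddle

# Two left-padded sequences of length 5: real lengths 3 and 5.
padding_mask = paddle.to_tensor([[0, 0, 1, 1, 1],
                                 [1, 1, 1, 1, 1]], dtype="int32")

seqlens_in_batch = padding_mask.sum(axis=-1, dtype="int32")     # [3, 5]
indices = paddle.nonzero(padding_mask.flatten()).flatten()      # flat positions of the 8 real tokens
cu_seqlens = paddle.concat(
    [paddle.zeros([1], dtype="int32"), paddle.cumsum(seqlens_in_batch, axis=0)]
)                                                               # [0, 3, 8]
max_seqlen_in_batch = int(seqlens_in_batch.max())               # 5

# flash_attn_varlen_func then consumes the gathered (total_tokens, num_heads, head_dim)
# tensors together with cu_seqlens_q/k and max_seqlen_q/k instead of a padded batch.
```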
+ """ + # Contains at least one padding token in the sequence + if padding_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, padding_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + if not use_sliding_windows: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=True, + ) + else: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=True, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + if not use_sliding_windows: + attn_output = flash_attn_func( + query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=True + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=True, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + return attn_output + + def _upad_input(self, query_layer, key_layer, value_layer, padding_mask, query_length): + batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + # On the first iteration we need to properly re-create the padding mask + # by slicing it on the proper place + if kv_seq_len != padding_mask.shape[-1]: + padding_mask_num_tokens = padding_mask.shape[-1] + padding_mask = padding_mask[:, padding_mask_num_tokens - kv_seq_len :] + + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(padding_mask) + + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = paddle.arange( + batch_size + 1, dtype=paddle.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. 
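Why left padding matters for the slice that follows: with left padding the real tokens are right-aligned, so the last `query_length` columns of the mask always line up with the tokens currently being queried. A toy illustration (values invented for the example):

```python
# Decoding 2 new tokens on top of a left-padded history of length 6.
padding_mask = [[0, 0, 1, 1, 1, 1],   # 4 real tokens
                [1, 1, 1, 1, 1, 1]]   # 6 real tokens
query_length = 2

# Right-aligned tokens mean the trailing columns are always the current queries.
assert [row[-query_length:] for row in padding_mask] == [[1, 1], [1, 1]]
```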
+ padding_mask = padding_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, padding_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +class MistralDecoderLayer(nn.Layer): + def __init__(self, config: MistralConfig): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = ( + MistralAttention(config=config) + if not getattr(config, "_flash_attn_2_enabled", False) + else MistralFlashAttention2(config) + ) + self.mlp = MistralMLP(config) + self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.LongTensor] = None, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + padding_mask: Optional[paddle.Tensor] = None, + ) -> Tuple[paddle.FloatTensor, Optional[Tuple[paddle.FloatTensor, paddle.FloatTensor]]]: + """ + Args: + hidden_states (`paddle.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`paddle.FloatTensor`, *optional*): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). 
+ past_key_value (`Tuple(paddle.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + padding_mask=padding_mask, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class MistralPreTrainedModel(PretrainedModel): + config_class = MistralConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["MistralDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + + @classmethod + def _get_name_mappings(cls, config: MistralConfig) -> list[StateDictNameMapping]: + mappings: list[StateDictNameMapping] = [] + model_mappings = [ + ["embed_tokens.weight"], + ["norm.weight"], + ] + for layer_index in range(config.num_hidden_layers): + layer_mappings = [ + [f"layers.{layer_index}.self_attn.q_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.k_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.v_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.o_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.rotary_emb.inv_freq"], + [f"layers.{layer_index}.mlp.gate_proj.weight", None, "transpose"], + [f"layers.{layer_index}.mlp.down_proj.weight", None, "transpose"], + [f"layers.{layer_index}.mlp.up_proj.weight", None, "transpose"], + [f"layers.{layer_index}.input_layernorm.weight"], + [f"layers.{layer_index}.post_attention_layernorm.weight"], + ] + model_mappings.extend(layer_mappings) + + init_name_mappings(mappings=model_mappings) + # base-model prefix "LlamaModel" + if "LlamaModel" not in config.architectures: + for mapping in model_mappings: + mapping[0] = "model." + mapping[0] + mapping[1] = "llama." 
+ mapping[1] + model_mappings.append(["lm_head.weight", "lm_head.weight", "transpose"]) + + mappings = [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(model_mappings)] + return mappings + + @classmethod + def _get_tensor_parallel_mappings(cls, config: MistralConfig, is_split=True): + + from paddlenlp.transformers.conversion_utils import split_or_merge_func + + fn = split_or_merge_func( + is_split=is_split, + tensor_parallel_degree=config.tensor_parallel_degree, + tensor_parallel_rank=config.tensor_parallel_rank, + num_attention_heads=config.num_attention_heads, + ) + + def get_tensor_parallel_split_mappings(num_layers): + final_actions = {} + + base_actions = { + "lm_head.weight": partial(fn, is_column=True), + # Row Linear + "embed_tokens.weight": partial(fn, is_column=False), + "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False), + "layers.0.mlp.down_proj.weight": partial(fn, is_column=False), + } + + # Column Linear + if config.fuse_attention_qkv: + base_actions["layers.0.self_attn.qkv_proj.weight"] = partial(fn, is_column=True) + else: + base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True) + # if we have enough num_key_value_heads to split, then split it. + if config.num_key_value_heads % config.tensor_parallel_degree == 0: + base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) + + if config.fuse_attention_ffn: + base_actions["layers.0.mlp.gate_up_fused_proj.weight"] = partial( + fn, is_column=True, is_naive_2fuse=True + ) + else: + base_actions["layers.0.mlp.gate_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.mlp.up_proj.weight"] = partial(fn, is_column=True) + + for key, action in base_actions.items(): + if "layers.0." in key: + for i in range(num_layers): + final_actions[key.replace("layers.0.", f"layers.{i}.")] = action + final_actions[key] = action + + return final_actions + + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers) + + return mappings + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, MistralModel): + module.gradient_checkpointing = value + + +class MistralModel(MistralPreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`MistralDecoderLayer`] + + Args: + config: MistralConfig + """ + + def __init__(self, config: MistralConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.LayerList([MistralDecoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.norm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + def _prepare_decoder_attention_mask( + self, attention_mask, input_shape, inputs_embeds, past_key_values_length, sliding_window + ): + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_sliding_window_causal_mask( + input_shape, + inputs_embeds.dtype, + device=inputs_embeds.device, + past_key_values_length=past_key_values_length, + sliding_window=sliding_window, + ) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( + inputs_embeds.device + ) + combined_attention_mask = ( + expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask + ) + + return combined_attention_mask + + def forward( + self, + input_ids: paddle.LongTensor = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.LongTensor] = None, + past_key_values: Optional[List[paddle.FloatTensor]] = None, + inputs_embeds: Optional[paddle.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + seq_length_with_past = seq_length + past_key_values_length = 0 + + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + seq_length_with_past = seq_length_with_past + past_key_values_length + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = paddle.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=paddle.long, device=device + ) + position_ids = position_ids.unsqueeze(0).reshape(-1, seq_length) + else: + 
position_ids = position_ids.reshape(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + padding_mask = None + + # embed positions + if attention_mask is None: + attention_mask = paddle.ones( + (batch_size, seq_length_with_past), dtype=paddle.bool, device=inputs_embeds.device + ) + elif 0 in attention_mask: + padding_mask = attention_mask + + if ( + padding_mask is not None + and hasattr(self.config, "_flash_attn_2_enabled") + and self.config._flash_attn_2_enabled + and past_key_values is not None + ): + is_padding_right = padding_mask[:, -1].sum().item() != batch_size + if is_padding_right: + raise ValueError( + "You are attempting to perform batched generation with padding_side='right'" + " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to " + " call `tokenizer.padding_side = 'left'` before tokenizing the input. " + ) + + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + + hidden_states = inputs_embeds + + if self.enable_recompute and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + has_gradient = not hidden_states.stop_gradient + if self.enable_recompute and has_gradient: + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, past_key_value, output_attentions, padding_mask=padding_mask) + + return custom_forward + + layer_outputs = recompute( + create_custom_forward(decoder_layer), + hidden_states, + attention_mask, + position_ids, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + padding_mask=padding_mask, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class MistralForCausalLM(MistralPreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = MistralModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): 
+ return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + def forward( + self, + input_ids: paddle.LongTensor = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.LongTensor] = None, + past_key_values: Optional[List[paddle.FloatTensor]] = None, + inputs_embeds: Optional[paddle.FloatTensor] = None, + labels: Optional[paddle.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`paddle.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, MistralForCausalLM + + >>> model = MistralForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
+ ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.reshape(-1, self.config.vocab_size) + shift_labels = shift_labels.reshape(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + # Omit tokens covered by past_key_values + if past_key_values: + past_length = past_key_values[0][0].shape[2] + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 + + input_ids = input_ids[:, remove_prefix_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +class MistralForSequenceClassification(MistralPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = MistralModel(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + 
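A side note on the causal LM loss computed in `MistralForCausalLM.forward` above: the "shift so that tokens < n predict n" step can be seen on toy tensors (illustrative sizes only), where the logits at position t are scored against the token at position t + 1.

```python
import paddle
from paddle.nn import CrossEntropyLoss

# Toy check of the shifted LM loss: logits at position t predict the token at t + 1.
vocab_size, seq_len = 11, 5
logits = paddle.randn([1, seq_len, vocab_size])
labels = paddle.randint(0, vocab_size, [1, seq_len])

shift_logits = logits[..., :-1, :].reshape([-1, vocab_size])   # positions 0 .. seq_len - 2
shift_labels = labels[..., 1:].reshape([-1])                   # tokens    1 .. seq_len - 1
loss = CrossEntropyLoss()(shift_logits, shift_labels)
```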
self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def forward( + self, + input_ids: paddle.LongTensor = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.LongTensor] = None, + past_key_values: Optional[List[paddle.FloatTensor]] = None, + inputs_embeds: Optional[paddle.FloatTensor] = None, + labels: Optional[paddle.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`paddle.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + sequence_lengths = (paddle.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1).to( + logits.device + ) + else: + sequence_lengths = -1 + + pooled_logits = logits[paddle.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == paddle.long or labels.dtype == paddle.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.reshape(-1, self.num_labels), labels.reshape(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) From e250362d2e31ae07653a1b7c3a4f801ad96d0589 Mon Sep 17 00:00:00 2001 From: Ting Date: Tue, 24 
Oct 2023 20:26:12 +0800 Subject: [PATCH 02/24] code save --- paddlenlp/transformers/mistral/__init__.py | 2 + .../transformers/mistral/configuration.py | 5 +- paddlenlp/transformers/mistral/modeling.py | 429 +++++++++--------- 3 files changed, 210 insertions(+), 226 deletions(-) diff --git a/paddlenlp/transformers/mistral/__init__.py b/paddlenlp/transformers/mistral/__init__.py index 595add0aed9e..2e538f5c59f3 100644 --- a/paddlenlp/transformers/mistral/__init__.py +++ b/paddlenlp/transformers/mistral/__init__.py @@ -11,3 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from .modeling import MistralForCausalLM +from .configuration import MistralConfig diff --git a/paddlenlp/transformers/mistral/configuration.py b/paddlenlp/transformers/mistral/configuration.py index 7cab5dc66ca9..2340e62ca3a9 100644 --- a/paddlenlp/transformers/mistral/configuration.py +++ b/paddlenlp/transformers/mistral/configuration.py @@ -13,10 +13,7 @@ # limitations under the License. """ Mistral model configuration""" -from ...configuration_utils import PretrainedConfig -from ...utils import logging - -logger = logging.get_logger(__name__) +from ..configuration_utils import PretrainedConfig MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP = { "mistralai/Mistral-7B-v0.1": "https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json", diff --git a/paddlenlp/transformers/mistral/modeling.py b/paddlenlp/transformers/mistral/modeling.py index d2ce294942c0..934bbac788b0 100644 --- a/paddlenlp/transformers/mistral/modeling.py +++ b/paddlenlp/transformers/mistral/modeling.py @@ -21,6 +21,7 @@ from paddle import nn from paddle.distributed.fleet.utils import recompute from paddle.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +#import paddle.distributed.fleet.meta_parallel as mpu from paddlenlp.transformers.conversion_utils import ( StateDictNameMapping, @@ -29,12 +30,13 @@ from paddlenlp.utils.log import logger from ..activations import ACT2FN -from ..modeling_outputs import ( +from ..model_outputs import ( BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast, + CausalLMOutputWithCrossAttentions, ) -from ..modeling_utils import PretrainedModel +from ..model_utils import PretrainedModel # from ...utils import ( # add_start_docstrings, @@ -61,10 +63,10 @@ def is_flash_attn_2_available(): # Copied from transformers.models.llama.modeling_llama._get_unpad_data def _get_unpad_data(padding_mask): - seqlens_in_batch = padding_mask.sum(dim=-1, dtype=paddle.int32) + seqlens_in_batch = padding_mask.sum(axis=-1, dtype=paddle.int32) indices = paddle.nonzero(padding_mask.flatten(), as_tuple=False).flatten() max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(paddle.cumsum(seqlens_in_batch, dim=0, dtype=paddle.paddle.int32), (1, 0)) + cu_seqlens = F.pad(paddle.cumsum(seqlens_in_batch, axis=0, dtype=paddle.paddle.int32), (1, 0)) return ( indices, cu_seqlens, @@ -73,9 +75,8 @@ def _get_unpad_data(padding_mask): def _make_sliding_window_causal_mask( - input_ids_shape: paddle.Size, + input_ids_shape: paddle.shape, dtype: paddle.dtype, - device: paddle.device, past_key_values_length: int = 0, sliding_window: int = 4096, ): @@ -87,16 +88,15 @@ def _make_sliding_window_causal_mask( tensor = paddle.full( (tgt_len, tgt_len), fill_value=1, - device=device, ) mask = paddle.tril(tensor, diagonal=0) # make the mask banded to account for sliding window mask 
= paddle.triu(mask, diagonal=-sliding_window) - mask = paddle.log(mask).to(dtype) + mask = paddle.log(mask).astype(dtype) if past_key_values_length > 0: - mask = paddle.cat([paddle.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) - return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) + mask = paddle.concat([paddle.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], axis=-1) + return mask[None, None, :, :].expand([bsz, 1, tgt_len, tgt_len + past_key_values_length]) # Copied from transformers.models.bart.modeling_bart._expand_mask @@ -104,14 +104,14 @@ def _expand_mask(mask: paddle.Tensor, dtype: paddle.dtype, tgt_len: Optional[int """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. """ - bsz, src_len = mask.size() + bsz, src_len = mask.shape tgt_len = tgt_len if tgt_len is not None else src_len - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + expanded_mask = mask[:, None, None, :].expand([bsz, 1, tgt_len, src_len]).astype(dtype) inverted_mask = 1.0 - expanded_mask - return inverted_mask.masked_fill(inverted_mask.to(paddle.bool), paddle.finfo(dtype).min) + return paddle.where(inverted_mask > 0.5, paddle.full_like(inverted_mask, paddle.finfo(dtype).min), inverted_mask) # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral @@ -121,51 +121,54 @@ def __init__(self, hidden_size, eps=1e-6): MistralRMSNorm is equivalent to T5LayerNorm """ super().__init__() - self.weight = nn.Parameter(paddle.ones(hidden_size)) + self.weight = paddle.create_parameter( + shape=[hidden_size], + dtype=paddle.get_default_dtype(), + default_initializer=nn.initializer.Constant(1.0), + ) self.variance_epsilon = eps def forward(self, hidden_states): input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(paddle.float32) + hidden_states = hidden_states.astype(paddle.float32) variance = hidden_states.pow(2).mean(-1, keepdim=True) hidden_states = hidden_states * paddle.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) + return self.weight * hidden_states.astype(input_dtype) # Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral class MistralRotaryEmbedding(nn.Layer): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + def __init__(self, dim, max_position_embeddings=2048, base=10000): super().__init__() self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base - inv_freq = 1.0 / (self.base ** (paddle.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) + self.inv_freq = 1.0 / (self.base ** (paddle.arange(0, self.dim, 2).astype('float32') / self.dim)) # Build here to make `paddle.jit.trace` work. 
self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=paddle.get_default_dtype() + seq_len=max_position_embeddings, dtype=paddle.get_default_dtype() ) - def _set_cos_sin_cache(self, seq_len, device, dtype): + def _set_cos_sin_cache(self, seq_len, dtype): self.max_seq_len_cached = seq_len - t = paddle.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + t = paddle.arange(self.max_seq_len_cached, dtype=self.inv_freq.dtype) freqs = paddle.einsum("i,j->ij", t, self.inv_freq) # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = paddle.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + emb = paddle.concat((freqs, freqs), axis=-1) + self.cos_cached = emb.cos().astype(dtype) + self.sin_cached = emb.sin().astype(dtype) def forward(self, x, seq_len=None): # x: [bs, num_attention_heads, seq_len, head_size] if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + self._set_cos_sin_cache(seq_len=seq_len, dtype=x.dtype) return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), + self.cos_cached[:seq_len].astype(dtype=x.dtype), + self.sin_cached[:seq_len].astype(dtype=x.dtype), ) @@ -174,7 +177,7 @@ def rotate_half(x): """Rotates half the hidden dims of the input.""" x1 = x[..., : x.shape[-1] // 2] x2 = x[..., x.shape[-1] // 2 :] - return paddle.cat((-x2, x1), dim=-1) + return paddle.concat((-x2, x1), axis=-1) # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb @@ -192,9 +195,9 @@ def __init__(self, config): self.config = config self.hidden_size = config.hidden_size self.intermediate_size = config.intermediate_size - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias_attr=False) self.act_fn = ACT2FN[config.hidden_act] def forward(self, x): @@ -203,14 +206,14 @@ def forward(self, x): def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) -> paddle.Tensor: """ - This is the equivalent of paddle.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + This is the equivalent of paddle.repeat_interleave(x, axis=1, repeats=n_rep). 
The hidden states go from (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) """ batch, num_key_value_heads, slen, head_dim = hidden_states.shape if n_rep == 1: return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + hidden_states = hidden_states[:, :, None, :, :].expand([batch, num_key_value_heads, n_rep, slen, head_dim]) + return hidden_states.reshape([batch, num_key_value_heads * n_rep, slen, head_dim]) class MistralAttention(nn.Layer): @@ -235,10 +238,10 @@ def __init__(self, config: MistralConfig): f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" f" and `num_heads`: {self.num_heads})." ) - self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) - self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias_attr=False) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias_attr=False) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias_attr=False) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias_attr=False) self.rotary_emb = MistralRotaryEmbedding( self.head_dim, @@ -247,27 +250,27 @@ def __init__(self, config: MistralConfig): ) def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int): - return tensor.reshape(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + return tensor.reshape([bsz, seq_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) def forward( self, hidden_states: paddle.Tensor, attention_mask: Optional[paddle.Tensor] = None, - position_ids: Optional[paddle.LongTensor] = None, + position_ids: Optional[paddle.Tensor] = None, past_key_value: Optional[Tuple[paddle.Tensor]] = None, output_attentions: bool = False, use_cache: bool = False, padding_mask: Optional[paddle.Tensor] = None, ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: - bsz, q_len, _ = hidden_states.size() + bsz, q_len, _ = hidden_states.shape query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) - query_states = query_states.reshape(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.reshape(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.reshape(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + query_states = query_states.reshape([bsz, q_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) + key_states = key_states.reshape([bsz, q_len, self.num_key_value_heads, self.head_dim]).transpose([0, 2, 1, 3]) + value_states = value_states.reshape([bsz, q_len, self.num_key_value_heads, self.head_dim]).transpose([0, 2, 1, 3]) kv_seq_len = key_states.shape[-2] if past_key_value is not None: @@ -277,8 +280,8 @@ def forward( if past_key_value is not None: # reuse k, v, self_attention - key_states = paddle.cat([past_key_value[0], key_states], dim=2) - value_states = paddle.cat([past_key_value[1], 
value_states], dim=2) + key_states = paddle.concat([past_key_value[0], key_states], axis=2) + value_states = paddle.concat([past_key_value[1], value_states], axis=2) past_key_value = (key_states, value_states) if use_cache else None @@ -286,34 +289,34 @@ def forward( key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - attn_weights = paddle.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + attn_weights = paddle.matmul(query_states, key_states.transpose([0, 1, 3, 2])) / math.sqrt(self.head_dim) - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + if attn_weights.shape != [bsz, self.num_heads, q_len, kv_seq_len]: raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" + f"Attention weights should be of size {[bsz, self.num_heads, q_len, kv_seq_len]}, but is" + f" {attn_weights.shape}" ) if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + if attention_mask.shape != [bsz, 1, q_len, kv_seq_len]: raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + f"Attention mask should be of size {[bsz, 1, q_len, kv_seq_len]}, but is {attention_mask.shape}" ) attn_weights = attn_weights + attention_mask # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=paddle.float32).to(query_states.dtype) + attn_weights = nn.functional.softmax(attn_weights, axis=-1, dtype=paddle.float32).astype(query_states.dtype) attn_output = paddle.matmul(attn_weights, value_states) - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + if attn_output.shape != [bsz, self.num_heads, q_len, self.head_dim]: raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" + f"`attn_output` should be of size {[bsz, self.num_heads, q_len, self.head_dim]}, but is" + f" {attn_output.shape}" ) - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + attn_output = attn_output.transpose([0, 2, 1, 3]) + attn_output = attn_output.reshape([bsz, q_len, self.hidden_size]) attn_output = self.o_proj(attn_output) @@ -334,21 +337,21 @@ def forward( self, hidden_states: paddle.Tensor, attention_mask: Optional[paddle.Tensor] = None, - position_ids: Optional[paddle.LongTensor] = None, + position_ids: Optional[paddle.Tensor] = None, past_key_value: Optional[Tuple[paddle.Tensor]] = None, output_attentions: bool = False, use_cache: bool = False, - padding_mask: Optional[paddle.LongTensor] = None, + padding_mask: Optional[paddle.Tensor] = None, ): - bsz, q_len, _ = hidden_states.size() + bsz, q_len, _ = hidden_states.shape query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) - query_states = query_states.reshape(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.reshape(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.reshape(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + query_states = query_states.reshape([bsz, q_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) + key_states = key_states.reshape([bsz, q_len, self.num_key_value_heads, self.head_dim]).transpose([0, 2, 1, 
3]) + value_states = value_states.reshape([bsz, q_len, self.num_key_value_heads, self.head_dim]).transpose([0, 2, 1, 3]) kv_seq_len = key_states.shape[-2] if past_key_value is not None: @@ -380,8 +383,8 @@ def forward( past_key = past_key_value[0] past_value = past_key_value[1] - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() + past_key = past_key[:, :, slicing_tokens:, :] + past_value = past_value[:, :, slicing_tokens:, :] if past_key.shape[-2] != self.config.sliding_window - 1: raise ValueError( @@ -393,10 +396,10 @@ def forward( if padding_mask is not None: padding_mask = padding_mask[:, slicing_tokens:] - padding_mask = paddle.cat([padding_mask, paddle.ones_like(padding_mask[:, -1:])], dim=-1) + padding_mask = paddle.concat([padding_mask, paddle.ones_like(padding_mask[:, -1:])], axis=-1) - key_states = paddle.cat([past_key_value[0], key_states], dim=2) - value_states = paddle.cat([past_key_value[1], value_states], dim=2) + key_states = paddle.concat([past_key_value[0], key_states], axis=2) + value_states = paddle.concat([past_key_value[1], value_states], axis=2) past_key_value = (key_states, value_states) if use_cache else None @@ -409,31 +412,10 @@ def forward( # when training. dropout_rate = 0.0 # if not self.training else self.attn_dropout - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in float16 just to be sure everything works as expected. - input_dtype = query_states.dtype - if input_dtype == paddle.float32: - # Handle the case where the model is quantized - if hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." 
- ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - # Reashape to the expected shape for Flash Attention - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) + query_states = query_states.transpose([0, 2, 1, 3]) + key_states = key_states.transpose([0, 2, 1, 3]) + value_states = value_states.transpose([0, 2, 1, 3]) attn_output = self._flash_attention_forward( query_states, @@ -445,7 +427,7 @@ def forward( use_sliding_windows=use_sliding_windows, ) - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = attn_output.reshape([bsz, q_len, self.hidden_size]) attn_output = self.o_proj(attn_output) if not output_attentions: @@ -553,12 +535,12 @@ def _upad_input(self, query_layer, key_layer, value_layer, padding_mask, query_l indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(padding_mask) - key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + key_layer = index_first_axis(key_layer.reshape([batch_size * kv_seq_len, num_heads, head_dim]), indices_k) + value_layer = index_first_axis(value_layer.reshape([batch_size * kv_seq_len, num_heads, head_dim]), indices_k) if query_length == kv_seq_len: query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k + query_layer.reshape([batch_size * kv_seq_len, num_heads, head_dim]), indices_k ) cu_seqlens_q = cu_seqlens_k max_seqlen_in_batch_q = max_seqlen_in_batch_k @@ -566,7 +548,7 @@ def _upad_input(self, query_layer, key_layer, value_layer, padding_mask, query_l elif query_length == 1: max_seqlen_in_batch_q = 1 cu_seqlens_q = paddle.arange( - batch_size + 1, dtype=paddle.int32, device=query_layer.device + batch_size + 1, dtype=paddle.int32 ) # There is a memcpy here, that is very bad. indices_q = cu_seqlens_q[:-1] query_layer = query_layer.squeeze(1) @@ -602,16 +584,16 @@ def forward( self, hidden_states: paddle.Tensor, attention_mask: Optional[paddle.Tensor] = None, - position_ids: Optional[paddle.LongTensor] = None, + position_ids: Optional[paddle.Tensor] = None, past_key_value: Optional[Tuple[paddle.Tensor]] = None, output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, padding_mask: Optional[paddle.Tensor] = None, - ) -> Tuple[paddle.FloatTensor, Optional[Tuple[paddle.FloatTensor, paddle.FloatTensor]]]: + ) -> Tuple[paddle.Tensor, Optional[Tuple[paddle.Tensor, paddle.Tensor]]]: """ Args: - hidden_states (`paddle.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`paddle.FloatTensor`, *optional*): attention mask of size + hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`paddle.Tensor`, *optional*): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under @@ -619,7 +601,7 @@ def forward( use_cache (`bool`, *optional*): If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see `past_key_values`). 
- past_key_value (`Tuple(paddle.FloatTensor)`, *optional*): cached past key and value projection states + past_key_value (`Tuple(paddle.Tensor)`, *optional*): cached past key and value projection states """ residual = hidden_states @@ -676,7 +658,6 @@ def _get_name_mappings(cls, config: MistralConfig) -> list[StateDictNameMapping] [f"layers.{layer_index}.self_attn.k_proj.weight", None, "transpose"], [f"layers.{layer_index}.self_attn.v_proj.weight", None, "transpose"], [f"layers.{layer_index}.self_attn.o_proj.weight", None, "transpose"], - [f"layers.{layer_index}.self_attn.rotary_emb.inv_freq"], [f"layers.{layer_index}.mlp.gate_proj.weight", None, "transpose"], [f"layers.{layer_index}.mlp.down_proj.weight", None, "transpose"], [f"layers.{layer_index}.mlp.up_proj.weight", None, "transpose"], @@ -687,10 +668,11 @@ def _get_name_mappings(cls, config: MistralConfig) -> list[StateDictNameMapping] init_name_mappings(mappings=model_mappings) # base-model prefix "LlamaModel" - if "LlamaModel" not in config.architectures: - for mapping in model_mappings: + for mapping in model_mappings: mapping[0] = "model." + mapping[0] - mapping[1] = "llama." + mapping[1] + mapping[1] = "model." + mapping[1] + + if "MistralModel" not in config.architectures: model_mappings.append(["lm_head.weight", "lm_head.weight", "transpose"]) mappings = [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(model_mappings)] @@ -749,21 +731,42 @@ def get_tensor_parallel_split_mappings(num_layers): return mappings - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, MistralModel): - module.gradient_checkpointing = value - + def _init_weights(self, layer): + """Initialization hook""" + if isinstance( + layer, + ( + nn.Linear, + nn.Embedding, + #mpu.VocabParallelEmbedding, + #mpu.ColumnParallelLinear, + #mpu.RowParallelLinear, + #ColumnSequenceParallelLinear, + #RowSequenceParallelLinear, + ), + ): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. 
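            # Editor's note: weights are redrawn from a normal distribution with standard
            # deviation `config.initializer_range`; the 1/sqrt(2 * num_hidden_layers)
            # scaling applied to `down_proj` / `o_proj` further below is the usual
            # depth-dependent rescaling of residual-branch output projections.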
+ if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range + if hasattr(self.config, "initializer_range") + else self.llama.config.initializer_range, + shape=layer.weight.shape, + ) + ) + # Layer.apply is DFS https://github.com/PaddlePaddle/Paddle/blob/a6f5021fcc58b21f4414bae6bf4731ef6971582c/python/paddle/nn/layer/layers.py#L527-L530 + # sublayer is init first + # scale RowParallelLinear weight + with paddle.no_grad(): + if isinstance(layer, MistralMLP): + factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) + layer.down_proj.weight.scale_(factor) + if isinstance(layer, MistralAttention): + factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) + layer.o_proj.weight.scale_(factor) class MistralModel(MistralPreTrainedModel): """ @@ -782,9 +785,7 @@ def __init__(self, config: MistralConfig): self.layers = nn.LayerList([MistralDecoderLayer(config) for _ in range(config.num_hidden_layers)]) self.norm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() + self.enable_recompute = False def get_input_embeddings(self): return self.embed_tokens @@ -802,16 +803,13 @@ def _prepare_decoder_attention_mask( combined_attention_mask = _make_sliding_window_causal_mask( input_shape, inputs_embeds.dtype, - device=inputs_embeds.device, past_key_values_length=past_key_values_length, sliding_window=sliding_window, ) if attention_mask is not None: # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( - inputs_embeds.device - ) + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) combined_attention_mask = ( expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask ) @@ -820,11 +818,11 @@ def _prepare_decoder_attention_mask( def forward( self, - input_ids: paddle.LongTensor = None, + input_ids: paddle.Tensor = None, attention_mask: Optional[paddle.Tensor] = None, - position_ids: Optional[paddle.LongTensor] = None, - past_key_values: Optional[List[paddle.FloatTensor]] = None, - inputs_embeds: Optional[paddle.FloatTensor] = None, + position_ids: Optional[paddle.Tensor] = None, + past_key_values: Optional[List[paddle.Tensor]] = None, + inputs_embeds: Optional[paddle.Tensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, @@ -856,13 +854,12 @@ def forward( seq_length_with_past = seq_length_with_past + past_key_values_length if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device position_ids = paddle.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=paddle.long, device=device + past_key_values_length, seq_length + past_key_values_length, dtype=paddle.long ) - position_ids = position_ids.unsqueeze(0).reshape(-1, seq_length) + position_ids = position_ids.unsqueeze(0).reshape([-1, seq_length]) else: - position_ids = position_ids.reshape(-1, seq_length).long() + position_ids = position_ids.reshape([-1, seq_length]).astype('int64') if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) @@ -872,9 +869,9 @@ def forward( # embed positions if attention_mask is None: attention_mask = paddle.ones( - (batch_size, seq_length_with_past), dtype=paddle.bool, 
device=inputs_embeds.device + (batch_size, seq_length_with_past), dtype=paddle.bool ) - elif 0 in attention_mask: + elif paddle.any(attention_mask == 0): padding_mask = attention_mask if ( @@ -978,10 +975,7 @@ def __init__(self, config): super().__init__(config) self.model = MistralModel(config) self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias_attr=False) def get_input_embeddings(self): return self.model.embed_tokens @@ -1001,14 +995,62 @@ def set_decoder(self, decoder): def get_decoder(self): return self.model + def prepare_inputs_for_generation( + self, input_ids, use_cache=False, past_key_values=None, inputs_embeds=None, **kwargs + ): + batch_size, seq_length = input_ids.shape + position_ids = kwargs.get("position_ids", paddle.arange(seq_length).expand((batch_size, seq_length))) + attention_mask = kwargs.get("attention_mask", None) + if past_key_values: + input_ids = input_ids[:, -1].unsqueeze(axis=-1) + position_ids = position_ids[:, -1].unsqueeze(-1) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=False): + # update cache + if isinstance(outputs, tuple) and len(outputs) > 1 and not isinstance(outputs[1], paddle.Tensor): + model_kwargs["past_key_values"] = outputs[1] + + if isinstance(outputs, CausalLMOutputWithCrossAttentions) and "past_key_values" in outputs: + model_kwargs["past_key_values"] = outputs.past_key_values + + # update position_ids + if "position_ids" in model_kwargs and model_kwargs["position_ids"] is not None: + position_ids = model_kwargs["position_ids"] + model_kwargs["position_ids"] = paddle.concat([position_ids, position_ids[..., -1:] + 1], axis=-1) + + if not is_encoder_decoder and "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + model_kwargs["attention_mask"] = paddle.concat( + [attention_mask, paddle.ones([attention_mask.shape[0], 1], dtype=attention_mask.dtype)], axis=-1 + ) + + return model_kwargs + def forward( self, - input_ids: paddle.LongTensor = None, + input_ids: paddle.Tensor = None, attention_mask: Optional[paddle.Tensor] = None, - position_ids: Optional[paddle.LongTensor] = None, - past_key_values: Optional[List[paddle.FloatTensor]] = None, - inputs_embeds: Optional[paddle.FloatTensor] = None, - labels: Optional[paddle.LongTensor] = None, + position_ids: Optional[paddle.Tensor] = None, + past_key_values: Optional[List[paddle.Tensor]] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + labels: Optional[paddle.Tensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, @@ -1016,7 +1058,7 @@ def forward( ) -> Union[Tuple, CausalLMOutputWithPast]: r""" Args: - labels (`paddle.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + labels (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the 
masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. @@ -1061,19 +1103,18 @@ def forward( hidden_states = outputs[0] logits = self.lm_head(hidden_states) - logits = logits.float() + logits = logits.astype('float32') loss = None if labels is not None: # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() + shift_logits = logits[..., :-1, :] + shift_labels = labels[..., 1:] # Flatten the tokens loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.reshape(-1, self.config.vocab_size) - shift_labels = shift_labels.reshape(-1) + shift_logits = shift_logits.reshape([-1, self.config.vocab_size]) + shift_labels = shift_labels.reshape([-1]) # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) loss = loss_fct(shift_logits, shift_labels) if not return_dict: @@ -1088,65 +1129,12 @@ def forward( attentions=outputs.attentions, ) - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - # Omit tokens covered by past_key_values - if past_key_values: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - class MistralForSequenceClassification(MistralPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.model = MistralModel(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() + self.score = nn.Linear(config.hidden_size, self.num_labels, bias_attr=False) def get_input_embeddings(self): return self.model.embed_tokens @@ -1156,19 +1144,19 @@ def set_input_embeddings(self, value): def forward( self, - input_ids: paddle.LongTensor = None, + input_ids: paddle.Tensor = None, attention_mask: Optional[paddle.Tensor] = None, - position_ids: Optional[paddle.LongTensor] = None, - past_key_values: 
Optional[List[paddle.FloatTensor]] = None, - inputs_embeds: Optional[paddle.FloatTensor] = None, - labels: Optional[paddle.LongTensor] = None, + position_ids: Optional[paddle.Tensor] = None, + past_key_values: Optional[List[paddle.Tensor]] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + labels: Optional[paddle.Tensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, SequenceClassifierOutputWithPast]: r""" - labels (`paddle.LongTensor` of shape `(batch_size,)`, *optional*): + labels (`paddle.Tensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). @@ -1200,17 +1188,14 @@ def forward( sequence_lengths = -1 else: if input_ids is not None: - sequence_lengths = (paddle.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1).to( - logits.device - ) + sequence_lengths = (paddle.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1) else: sequence_lengths = -1 - pooled_logits = logits[paddle.arange(batch_size, device=logits.device), sequence_lengths] + pooled_logits = logits[paddle.arange(batch_size), sequence_lengths] loss = None if labels is not None: - labels = labels.to(logits.device) if self.config.problem_type is None: if self.num_labels == 1: self.config.problem_type = "regression" @@ -1227,7 +1212,7 @@ def forward( loss = loss_fct(pooled_logits, labels) elif self.config.problem_type == "single_label_classification": loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.reshape(-1, self.num_labels), labels.reshape(-1)) + loss = loss_fct(pooled_logits.reshape([-1, self.num_labels]), labels.reshape([-1])) elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(pooled_logits, labels) From 6029e665914a17cf626294144309b1373724bda2 Mon Sep 17 00:00:00 2001 From: Ting Date: Thu, 26 Oct 2023 17:03:37 +0800 Subject: [PATCH 03/24] add dump code --- paddlenlp/generation/utils.py | 1 + paddlenlp/transformers/mistral/modeling.py | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/paddlenlp/generation/utils.py b/paddlenlp/generation/utils.py index 9b025e7ee878..f9dfb70ac0cd 100644 --- a/paddlenlp/generation/utils.py +++ b/paddlenlp/generation/utils.py @@ -915,6 +915,7 @@ def generate( stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + print(generation_config.decode_strategy) if generation_config.decode_strategy == "greedy_search": if generation_config.num_return_sequences > 1: raise ValueError( diff --git a/paddlenlp/transformers/mistral/modeling.py b/paddlenlp/transformers/mistral/modeling.py index 934bbac788b0..09929889a740 100644 --- a/paddlenlp/transformers/mistral/modeling.py +++ b/paddlenlp/transformers/mistral/modeling.py @@ -28,6 +28,7 @@ init_name_mappings, ) from paddlenlp.utils.log import logger +i=0 from ..activations import ACT2FN from ..model_outputs import ( @@ -263,10 +264,15 @@ def forward( padding_mask: Optional[paddle.Tensor] = None, ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: bsz, q_len, _ = hidden_states.shape + global i + import numpy as np; np.save('hs_{}'.format(i), 
hidden_states.astype('float32').numpy()) query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) + import numpy as np; np.save('q_{}'.format(i), query_states.astype('float32').numpy()) + import numpy as np; np.save('k_{}'.format(i), key_states.astype('float32').numpy()) + import numpy as np; np.save('v_{}'.format(i), value_states.astype('float32').numpy()) query_states = query_states.reshape([bsz, q_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) key_states = key_states.reshape([bsz, q_len, self.num_key_value_heads, self.head_dim]).transpose([0, 2, 1, 3]) @@ -290,6 +296,7 @@ def forward( value_states = repeat_kv(value_states, self.num_key_value_groups) attn_weights = paddle.matmul(query_states, key_states.transpose([0, 1, 3, 2])) / math.sqrt(self.head_dim) + import numpy as np; np.save('aw_{}'.format(i), attn_weights.astype('float32').numpy()) if attn_weights.shape != [bsz, self.num_heads, q_len, kv_seq_len]: raise ValueError( @@ -319,6 +326,8 @@ def forward( attn_output = attn_output.reshape([bsz, q_len, self.hidden_size]) attn_output = self.o_proj(attn_output) + import numpy as np; np.save('ao_{}'.format(i), attn_output.astype('float32').numpy()) + i += 1 if not output_attentions: attn_weights = None @@ -1104,6 +1113,7 @@ def forward( hidden_states = outputs[0] logits = self.lm_head(hidden_states) logits = logits.astype('float32') + import numpy as np; np.save('l', logits.astype('float32').numpy()) loss = None if labels is not None: From b1c3cf8c4eb98c9078e5a80b85ef7987a7ca8e4e Mon Sep 17 00:00:00 2001 From: Ting Liu Date: Wed, 1 Nov 2023 17:54:54 +0800 Subject: [PATCH 04/24] mistral support lora,prefix,multi-gpu --- llm/data.py | 2 +- llm/mistral/lora_argument.json | 32 +++ llm/mistral/pt_argument.json | 30 +++ llm/mistral/sft_argument.json | 29 +++ llm/utils.py | 10 +- paddlenlp/transformers/__init__.py | 2 + paddlenlp/transformers/auto/modeling.py | 1 + paddlenlp/transformers/mistral/modeling.py | 233 ++++++++++++++++----- 8 files changed, 282 insertions(+), 57 deletions(-) create mode 100644 llm/mistral/lora_argument.json create mode 100644 llm/mistral/pt_argument.json create mode 100644 llm/mistral/sft_argument.json diff --git a/llm/data.py b/llm/data.py index 3a39643c096b..5fcf4ecb7770 100644 --- a/llm/data.py +++ b/llm/data.py @@ -24,7 +24,7 @@ def get_convert_example(model): if base_model_prefix == "chatglm": return convert_example_chatglm - elif base_model_prefix in ["chatglm_v2", "llama", "bloom", "opt", "qwen"]: + elif base_model_prefix in ["chatglm_v2", "llama", "bloom", "opt", "qwen", "mistral"]: return convert_example_common else: raise ValueError( diff --git a/llm/mistral/lora_argument.json b/llm/mistral/lora_argument.json new file mode 100644 index 000000000000..f0c5d0175817 --- /dev/null +++ b/llm/mistral/lora_argument.json @@ -0,0 +1,32 @@ +{ + "model_name_or_path": "/root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/5e9c98b96d071dce59368012254c55b0ec6f8658/", + "dataset_name_or_path": "./data1", + "output_dir": "./checkpoints/mistral_lora_ckpts", + "per_device_train_batch_size": 4, + "gradient_accumulation_steps": 4, + "per_device_eval_batch_size": 8, + "eval_accumulation_steps":16, + "num_train_epochs": 3, + "learning_rate": 3e-04, + "warmup_steps": 30, + "logging_steps": 1, + "evaluation_strategy": "steps", + "save_strategy": "steps", + "eval_steps": 20, + "save_steps": 60, + "src_length": 1024, + "max_length": 2048, + "fp16": true, + "fp16_opt_level": "O2", + 
"do_train": true, + "do_eval": true, + "disable_tqdm": true, + "recompute": true, + "load_best_model_at_end": true, + "eval_with_do_generation": false, + "metric_for_best_model": "accuracy", + "save_total_limit": 1, + "tensor_parallel_degree": 1, + "pipeline_parallel_degree": 1, + "lora": true + } diff --git a/llm/mistral/pt_argument.json b/llm/mistral/pt_argument.json new file mode 100644 index 000000000000..eceb0f5768d1 --- /dev/null +++ b/llm/mistral/pt_argument.json @@ -0,0 +1,30 @@ +{ + "model_name_or_path": "/root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/5e9c98b96d071dce59368012254c55b0ec6f8658/", + "dataset_name_or_path": "./data", + "output_dir": "./checkpoints/mistral_pt_ckpts", + "per_device_train_batch_size": 4, + "gradient_accumulation_steps": 4, + "per_device_eval_batch_size": 8, + "eval_accumulation_steps":16, + "num_train_epochs": 3, + "learning_rate": 3e-02, + "warmup_steps": 30, + "logging_steps": 1, + "evaluation_strategy": "epoch", + "save_strategy": "epoch", + "src_length": 1024, + "max_length": 2048, + "fp16": true, + "fp16_opt_level": "O2", + "do_train": true, + "do_eval": true, + "disable_tqdm": true, + "load_best_model_at_end": true, + "eval_with_do_generation": false, + "metric_for_best_model": "accuracy", + "recompute": true, + "save_total_limit": 1, + "tensor_parallel_degree": 1, + "pipeline_parallel_degree": 1, + "prefix_tuning": true + } diff --git a/llm/mistral/sft_argument.json b/llm/mistral/sft_argument.json new file mode 100644 index 000000000000..fca6739b4af3 --- /dev/null +++ b/llm/mistral/sft_argument.json @@ -0,0 +1,29 @@ +{ + "model_name_or_path": "/root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/5e9c98b96d071dce59368012254c55b0ec6f8658/", + "dataset_name_or_path": "./data", + "output_dir": "./checkpoints/mistral_sft_ckpts", + "per_device_train_batch_size": 4, + "gradient_accumulation_steps": 4, + "per_device_eval_batch_size": 8, + "eval_accumulation_steps":16, + "num_train_epochs": 3, + "learning_rate": 3e-05, + "warmup_steps": 30, + "logging_steps": 1, + "evaluation_strategy": "epoch", + "save_strategy": "epoch", + "src_length": 1024, + "max_length": 2048, + "bf16": true, + "fp16_opt_level": "O2", + "do_train": true, + "do_eval": true, + "disable_tqdm": true, + "load_best_model_at_end": true, + "eval_with_do_generation": false, + "metric_for_best_model": "accuracy", + "recompute": true, + "save_total_limit": 1, + "tensor_parallel_degree": 4, + "pipeline_parallel_degree": 1 + } diff --git a/llm/utils.py b/llm/utils.py index a32a5acb4e59..89b805193343 100644 --- a/llm/utils.py +++ b/llm/utils.py @@ -86,6 +86,14 @@ def get_prefix_tuning_params(model): hidden_size = model.config.hidden_size postprocess_past_key_value = qwen_postprocess_past_key_value multi_query_group_num = None + elif model.base_model_prefix == "mistral": + from paddlenlp.peft.prefix import llama_postprocess_past_key_value + + num_attention_heads = model.config.num_attention_heads + num_hidden_layers = model.config.num_hidden_layers + hidden_size = model.config.hidden_size + postprocess_past_key_value = llama_postprocess_past_key_value + multi_query_group_num = model.config.num_attention_heads // model.config.num_key_value_heads else: raise ValueError(f"Unknown base_model_prefix: {model.base_model_prefix}. 
") return dict( @@ -112,7 +120,7 @@ def get_lora_target_modules(model): ] elif model.base_model_prefix == "bloom": target_modules = [".*query_key_value.*", ".*dense.*", ".*dense_h_to_4h.*", ".*dense_4h_to_h.*"] - elif model.base_model_prefix == "llama" or isinstance(model, LlamaForCausalLMPipe): + elif model.base_model_prefix == "llama" or model.base_model_prefix == "mistral" or isinstance(model, LlamaForCausalLMPipe): target_modules = [ ".*q_proj.*", ".*v_proj.*", diff --git a/paddlenlp/transformers/__init__.py b/paddlenlp/transformers/__init__.py index 5697dc8d6052..33e4c29fec0e 100644 --- a/paddlenlp/transformers/__init__.py +++ b/paddlenlp/transformers/__init__.py @@ -282,6 +282,8 @@ from .qwen.modeling import * from .qwen.configuration import * from .qwen.tokenizer import * +from .mistral.modeling import * +from .mistral.configuration import * # For faster tokenizer from ..utils.import_utils import is_fast_tokenizer_available diff --git a/paddlenlp/transformers/auto/modeling.py b/paddlenlp/transformers/auto/modeling.py index 6e07d112f02d..f6164a4e7431 100644 --- a/paddlenlp/transformers/auto/modeling.py +++ b/paddlenlp/transformers/auto/modeling.py @@ -127,6 +127,7 @@ ("Blip", "blip"), ("Bloom", "bloom"), ("QWen", "qwen"), + ("Mistral", "mistral"), ] ) diff --git a/paddlenlp/transformers/mistral/modeling.py b/paddlenlp/transformers/mistral/modeling.py index 09929889a740..6ab27753fcdc 100644 --- a/paddlenlp/transformers/mistral/modeling.py +++ b/paddlenlp/transformers/mistral/modeling.py @@ -20,15 +20,16 @@ import paddle.nn.functional as F from paddle import nn from paddle.distributed.fleet.utils import recompute +import paddle.distributed.fleet.meta_parallel as mpu +from paddle.distributed import fleet from paddle.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -#import paddle.distributed.fleet.meta_parallel as mpu from paddlenlp.transformers.conversion_utils import ( StateDictNameMapping, init_name_mappings, ) from paddlenlp.utils.log import logger -i=0 +#i=0 from ..activations import ACT2FN from ..model_outputs import ( @@ -100,7 +101,6 @@ def _make_sliding_window_causal_mask( return mask[None, None, :, :].expand([bsz, 1, tgt_len, tgt_len + past_key_values_length]) -# Copied from transformers.models.bart.modeling_bart._expand_mask def _expand_mask(mask: paddle.Tensor, dtype: paddle.dtype, tgt_len: Optional[int] = None): """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
@@ -115,7 +115,6 @@ def _expand_mask(mask: paddle.Tensor, dtype: paddle.dtype, tgt_len: Optional[int return paddle.where(inverted_mask > 0.5, paddle.full_like(inverted_mask, paddle.finfo(dtype).min), inverted_mask) -# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral class MistralRMSNorm(nn.Layer): def __init__(self, hidden_size, eps=1e-6): """ @@ -173,7 +172,6 @@ def forward(self, x, seq_len=None): ) -# Copied from transformers.models.llama.modeling_llama.rotate_half def rotate_half(x): """Rotates half the hidden dims of the input.""" x1 = x[..., : x.shape[-1] // 2] @@ -181,7 +179,6 @@ def rotate_half(x): return paddle.concat((-x2, x1), axis=-1) -# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb def apply_rotary_pos_emb(q, k, cos, sin, position_ids): cos = cos[position_ids].unsqueeze(1) # [seq_len, dim] -> [batch_size, 1, seq_len, head_dim] sin = sin[position_ids].unsqueeze(1) @@ -196,9 +193,31 @@ def __init__(self, config): self.config = config self.hidden_size = config.hidden_size self.intermediate_size = config.intermediate_size - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias_attr=False) + if config.tensor_parallel_degree > 1: + self.gate_proj = mpu.ColumnParallelLinear( + self.hidden_size, + self.intermediate_size, + gather_output=False, + has_bias=False, + ) + self.up_proj = mpu.ColumnParallelLinear( + self.hidden_size, + self.intermediate_size, + gather_output=False, + has_bias=False, + ) + + self.down_proj = mpu.RowParallelLinear( + self.intermediate_size, + self.hidden_size, + input_is_parallel=True, + has_bias=False, + ) + else: + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias_attr=False) + self.act_fn = ACT2FN[config.hidden_act] def forward(self, x): @@ -239,10 +258,63 @@ def __init__(self, config: MistralConfig): f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" f" and `num_heads`: {self.num_heads})." ) - self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias_attr=False) - self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias_attr=False) - self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias_attr=False) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias_attr=False) + if config.tensor_parallel_degree > 1: + if self.num_key_value_heads % config.tensor_parallel_degree != 0: + raise ValueError( + f"num_key_value_heads must be divisible by tensor_parallel_degree (got `num_key_value_heads`: {self.num_key_value_heads}" + f" and `tensor_parallel_degree`: {config.tensor_parallel_degree})." 
+ ) + + self.q_proj = mpu.ColumnParallelLinear( + self.hidden_size, + self.num_heads * self.head_dim, + has_bias=False, + gather_output=False, + ) + self.k_proj = mpu.ColumnParallelLinear( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + has_bias=False, + gather_output=False, + ) + self.v_proj = mpu.ColumnParallelLinear( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + has_bias=False, + gather_output=False, + ) + else: + self.q_proj = nn.Linear( + self.hidden_size, + self.num_heads * self.head_dim, + bias_attr=False, + ) + self.k_proj = nn.Linear( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + bias_attr=False, + ) + self.v_proj = nn.Linear( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + bias_attr=False, + ) + + if config.tensor_parallel_degree > 1: + self.o_proj = mpu.RowParallelLinear( + self.num_heads * self.head_dim, + self.hidden_size, + has_bias=False, + input_is_parallel=True, + ) + self.num_heads = self.num_heads // config.tensor_parallel_degree + self.num_key_value_heads = self.num_key_value_heads // config.tensor_parallel_degree + else: + self.o_proj = nn.Linear( + self.num_heads * self.head_dim, + self.hidden_size, + bias_attr=False, + ) self.rotary_emb = MistralRotaryEmbedding( self.head_dim, @@ -264,15 +336,15 @@ def forward( padding_mask: Optional[paddle.Tensor] = None, ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: bsz, q_len, _ = hidden_states.shape - global i - import numpy as np; np.save('hs_{}'.format(i), hidden_states.astype('float32').numpy()) + #global i + #import numpy as np; np.save('hs_{}'.format(i), hidden_states.astype('float32').numpy()) query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) - import numpy as np; np.save('q_{}'.format(i), query_states.astype('float32').numpy()) - import numpy as np; np.save('k_{}'.format(i), key_states.astype('float32').numpy()) - import numpy as np; np.save('v_{}'.format(i), value_states.astype('float32').numpy()) + #import numpy as np; np.save('q_{}'.format(i), query_states.astype('float32').numpy()) + #import numpy as np; np.save('k_{}'.format(i), key_states.astype('float32').numpy()) + #import numpy as np; np.save('v_{}'.format(i), value_states.astype('float32').numpy()) query_states = query_states.reshape([bsz, q_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) key_states = key_states.reshape([bsz, q_len, self.num_key_value_heads, self.head_dim]).transpose([0, 2, 1, 3]) @@ -296,7 +368,7 @@ def forward( value_states = repeat_kv(value_states, self.num_key_value_groups) attn_weights = paddle.matmul(query_states, key_states.transpose([0, 1, 3, 2])) / math.sqrt(self.head_dim) - import numpy as np; np.save('aw_{}'.format(i), attn_weights.astype('float32').numpy()) + #import numpy as np; np.save('aw_{}'.format(i), attn_weights.astype('float32').numpy()) if attn_weights.shape != [bsz, self.num_heads, q_len, kv_seq_len]: raise ValueError( @@ -323,11 +395,11 @@ def forward( ) attn_output = attn_output.transpose([0, 2, 1, 3]) - attn_output = attn_output.reshape([bsz, q_len, self.hidden_size]) + attn_output = attn_output.reshape([bsz, q_len, self.num_heads * self.head_dim]) attn_output = self.o_proj(attn_output) - import numpy as np; np.save('ao_{}'.format(i), attn_output.astype('float32').numpy()) - i += 1 + #import numpy as np; np.save('ao_{}'.format(i), attn_output.astype('float32').numpy()) + #i += 1 if not output_attentions: attn_weights = None 
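# Editor's note on the tensor-parallel attention rewiring above (illustrative numbers
# only): the q/k/v projections are ColumnParallelLinear, so each rank produces only its
# own shard of heads, while o_proj is RowParallelLinear, so the local matmul is followed
# by an all-reduce over the model-parallel group. That is why `num_heads` and
# `num_key_value_heads` are divided by `tensor_parallel_degree` once the projections are
# built, and why `attn_output` is reshaped to `self.num_heads * self.head_dim` (the local
# width) rather than `hidden_size`. For example, assuming hidden_size=4096, num_heads=32,
# head_dim=128 and tensor_parallel_degree=4:
#   q_proj weight per rank: [4096, 32 * 128 / 4] = [4096, 1024]   (column shard)
#   o_proj weight per rank: [32 * 128 / 4, 4096] = [1024, 4096]   (row shard, output all-reduced)
#   local num_heads       : 32 / 4 = 8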
@@ -648,7 +720,7 @@ def forward( class MistralPreTrainedModel(PretrainedModel): config_class = MistralConfig - base_model_prefix = "model" + base_model_prefix = "mistral" supports_gradient_checkpointing = True _no_split_modules = ["MistralDecoderLayer"] _skip_keys_device_placement = "past_key_values" @@ -711,22 +783,14 @@ def get_tensor_parallel_split_mappings(num_layers): } # Column Linear - if config.fuse_attention_qkv: - base_actions["layers.0.self_attn.qkv_proj.weight"] = partial(fn, is_column=True) - else: - base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True) - # if we have enough num_key_value_heads to split, then split it. - if config.num_key_value_heads % config.tensor_parallel_degree == 0: - base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True) - base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) - - if config.fuse_attention_ffn: - base_actions["layers.0.mlp.gate_up_fused_proj.weight"] = partial( - fn, is_column=True, is_naive_2fuse=True - ) - else: - base_actions["layers.0.mlp.gate_proj.weight"] = partial(fn, is_column=True) - base_actions["layers.0.mlp.up_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True) + # if we have enough num_key_value_heads to split, then split it. + if config.num_key_value_heads % config.tensor_parallel_degree == 0: + base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) + + base_actions["layers.0.mlp.gate_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.mlp.up_proj.weight"] = partial(fn, is_column=True) for key, action in base_actions.items(): if "layers.0." 
in key: @@ -747,11 +811,9 @@ def _init_weights(self, layer): ( nn.Linear, nn.Embedding, - #mpu.VocabParallelEmbedding, - #mpu.ColumnParallelLinear, - #mpu.RowParallelLinear, - #ColumnSequenceParallelLinear, - #RowSequenceParallelLinear, + mpu.VocabParallelEmbedding, + mpu.ColumnParallelLinear, + mpu.RowParallelLinear, ), ): # In the dygraph mode, use the `set_value` to reset the parameter directly, @@ -790,7 +852,18 @@ def __init__(self, config: MistralConfig): self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + if config.tensor_parallel_degree > 1: + self.embed_tokens = mpu.VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + weight_attr=paddle.ParamAttr(initializer=nn.initializer.XavierNormal()), + ) + else: + self.embed_tokens = nn.Embedding( + config.vocab_size, + config.hidden_size, + self.padding_idx, + ) self.layers = nn.LayerList([MistralDecoderLayer(config) for _ in range(config.num_hidden_layers)]) self.norm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -976,6 +1049,60 @@ def custom_forward(*inputs): attentions=all_self_attns, ) +def parallel_matmul(x: paddle.Tensor, y: paddle.Tensor, tensor_parallel_output=True): + is_fleet_init = True + tensor_parallel_degree = 1 + try: + hcg = fleet.get_hybrid_communicate_group() + model_parallel_group = hcg.get_model_parallel_group() + tensor_parallel_degree = hcg.get_model_parallel_world_size() + except: + is_fleet_init = False + + if paddle.in_dynamic_mode(): + y_is_distributed = y.is_distributed + else: + y_is_distributed = tensor_parallel_degree > 1 + + if is_fleet_init and tensor_parallel_degree > 1 and y_is_distributed: + # if not running under distributed.launch, it will raise AttributeError: 'Fleet' object has no attribute '_hcg' + input_parallel = paddle.distributed.collective._c_identity(x, group=model_parallel_group) + logits = paddle.matmul(input_parallel, y, transpose_y=False) + + if tensor_parallel_output: + return logits + + return paddle.distributed.collective._c_concat(logits, group=model_parallel_group) + + else: + logits = paddle.matmul(x, y, transpose_y=False) + return logits + + +class MistralLMHead(nn.Layer): + def __init__(self, config: MistralConfig): + super(MistralLMHead, self).__init__() + self.config = config + if config.tensor_parallel_degree > 1: + vocab_size = config.vocab_size // config.tensor_parallel_degree + else: + vocab_size = config.vocab_size + + self.weight = self.create_parameter( + shape=[config.hidden_size, vocab_size], + dtype=paddle.get_default_dtype(), + ) + # Must set distributed attr for Tensor Parallel ! 
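        # Editor's note: when tensor parallelism is enabled the head weight has shape
        # [hidden_size, vocab_size // tensor_parallel_degree], i.e. it is sharded along the
        # vocabulary axis; `is_distributed` and `split_axis` mark that axis as the sharded
        # dimension, and `parallel_matmul` above either keeps the sharded logits
        # (`tensor_parallel_output=True`) or gathers them across ranks.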
+ self.weight.is_distributed = True if (vocab_size != config.vocab_size) else False + if self.weight.is_distributed: + self.weight.split_axis = 1 + + def forward(self, hidden_states, tensor_parallel_output=None): + if tensor_parallel_output is None: + tensor_parallel_output = self.config.tensor_parallel_output + + logits = parallel_matmul(hidden_states, self.weight, tensor_parallel_output=tensor_parallel_output) + return logits class MistralForCausalLM(MistralPreTrainedModel): _tied_weights_keys = ["lm_head.weight"] @@ -984,7 +1111,8 @@ def __init__(self, config): super().__init__(config) self.model = MistralModel(config) self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias_attr=False) + #self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias_attr=False) + self.lm_head = MistralLMHead(config) def get_input_embeddings(self): return self.model.embed_tokens @@ -1113,19 +1241,14 @@ def forward( hidden_states = outputs[0] logits = self.lm_head(hidden_states) logits = logits.astype('float32') - import numpy as np; np.save('l', logits.astype('float32').numpy()) + #import numpy as np; np.save('l', logits.astype('float32').numpy()) loss = None if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :] - shift_labels = labels[..., 1:] - # Flatten the tokens loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.reshape([-1, self.config.vocab_size]) - shift_labels = shift_labels.reshape([-1]) - # Enable model parallelism - loss = loss_fct(shift_logits, shift_labels) + #logits = logits.reshape([-1, self.config.vocab_size]) + #labels = labels.reshape([-1]) + loss = loss_fct(logits, labels) if not return_dict: output = (logits,) + outputs[1:] From 251552fa6d6ddcb87900af1099ba5e48a1382c5e Mon Sep 17 00:00:00 2001 From: Ting Liu Date: Wed, 1 Nov 2023 19:22:06 +0800 Subject: [PATCH 05/24] bug fix --- paddlenlp/transformers/mistral/modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlenlp/transformers/mistral/modeling.py b/paddlenlp/transformers/mistral/modeling.py index 6ab27753fcdc..97552d7ed22b 100644 --- a/paddlenlp/transformers/mistral/modeling.py +++ b/paddlenlp/transformers/mistral/modeling.py @@ -97,7 +97,7 @@ def _make_sliding_window_causal_mask( mask = paddle.log(mask).astype(dtype) if past_key_values_length > 0: - mask = paddle.concat([paddle.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], axis=-1) + mask = paddle.concat([paddle.zeros([tgt_len, past_key_values_length], dtype=dtype), mask], axis=-1) return mask[None, None, :, :].expand([bsz, 1, tgt_len, tgt_len + past_key_values_length]) From 5c1db6f5096ef4844f39b5f9a31c9d7fdd4a0f8c Mon Sep 17 00:00:00 2001 From: Ting Liu Date: Fri, 10 Nov 2023 10:44:51 +0800 Subject: [PATCH 06/24] add flash attention --- paddlenlp/transformers/mistral/modeling.py | 526 +++++++++++---------- 1 file changed, 270 insertions(+), 256 deletions(-) diff --git a/paddlenlp/transformers/mistral/modeling.py b/paddlenlp/transformers/mistral/modeling.py index 97552d7ed22b..8e90be58111a 100644 --- a/paddlenlp/transformers/mistral/modeling.py +++ b/paddlenlp/transformers/mistral/modeling.py @@ -252,6 +252,7 @@ def __init__(self, config: MistralConfig): self.num_key_value_groups = self.num_heads // self.num_key_value_heads self.max_position_embeddings = config.max_position_embeddings self.rope_theta = config.rope_theta + self.use_flash_attention = getattr(config, "_flash_attn_2_enabled", False) if (self.head_dim * 
self.num_heads) != self.hidden_size: raise ValueError( @@ -367,26 +368,39 @@ def forward( key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - attn_weights = paddle.matmul(query_states, key_states.transpose([0, 1, 3, 2])) / math.sqrt(self.head_dim) - #import numpy as np; np.save('aw_{}'.format(i), attn_weights.astype('float32').numpy()) + if not self.use_flash_attention: + attn_weights = paddle.matmul(query_states, key_states.transpose([0, 1, 3, 2])) / math.sqrt(self.head_dim) + #import numpy as np; np.save('aw_{}'.format(i), attn_weights.astype('float32').numpy()) - if attn_weights.shape != [bsz, self.num_heads, q_len, kv_seq_len]: - raise ValueError( - f"Attention weights should be of size {[bsz, self.num_heads, q_len, kv_seq_len]}, but is" - f" {attn_weights.shape}" - ) - - if attention_mask is not None: - if attention_mask.shape != [bsz, 1, q_len, kv_seq_len]: + if attn_weights.shape != [bsz, self.num_heads, q_len, kv_seq_len]: raise ValueError( - f"Attention mask should be of size {[bsz, 1, q_len, kv_seq_len]}, but is {attention_mask.shape}" + f"Attention weights should be of size {[bsz, self.num_heads, q_len, kv_seq_len]}, but is" + f" {attn_weights.shape}" ) - attn_weights = attn_weights + attention_mask + if attention_mask is not None: + if attention_mask.shape != [bsz, 1, q_len, kv_seq_len]: + raise ValueError( + f"Attention mask should be of size {[bsz, 1, q_len, kv_seq_len]}, but is {attention_mask.shape}" + ) - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, axis=-1, dtype=paddle.float32).astype(query_states.dtype) - attn_output = paddle.matmul(attn_weights, value_states) + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, axis=-1, dtype=paddle.float32).astype(query_states.dtype) + attn_output = paddle.matmul(attn_weights, value_states) + else: + query_states = query_states.transpose([0,2,1,3]) + key_states = key_states.transpose([0,2,1,3]) + value_states = value_states.transpose([0,2,1,3]) + attn_output = F.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + is_causal=attention_mask is None, + ) + attn_output = attn_output.transpose([0,2,1,3]) if attn_output.shape != [bsz, self.num_heads, q_len, self.head_dim]: raise ValueError( @@ -407,245 +421,245 @@ def forward( return attn_output, attn_weights, past_key_value -class MistralFlashAttention2(MistralAttention): - """ - Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays - untouched. The only required change would be on the forward pass where it needs to correctly call the public API of - flash attention and deal with padding tokens in case the input contains any of them. 
- """ - - def forward( - self, - hidden_states: paddle.Tensor, - attention_mask: Optional[paddle.Tensor] = None, - position_ids: Optional[paddle.Tensor] = None, - past_key_value: Optional[Tuple[paddle.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - padding_mask: Optional[paddle.Tensor] = None, - ): - bsz, q_len, _ = hidden_states.shape - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.reshape([bsz, q_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) - key_states = key_states.reshape([bsz, q_len, self.num_key_value_heads, self.head_dim]).transpose([0, 2, 1, 3]) - value_states = value_states.reshape([bsz, q_len, self.num_key_value_heads, self.head_dim]).transpose([0, 2, 1, 3]) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - - # Because the input can be padded, the absolute sequence length depends on the max position id. - rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 - cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - use_sliding_windows = ( - _flash_supports_window_size - and hasattr(self.config, "sliding_window") is not None - and kv_seq_len > self.config.sliding_window - ) - - if not _flash_supports_window_size: - logger.warning_once( - "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" - " make sure to upgrade flash-attn library." - ) - - if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - if hasattr(self.config, "sliding_window") and kv_seq_len > self.config.sliding_window: - slicing_tokens = kv_seq_len - self.config.sliding_window - - past_key = past_key_value[0] - past_value = past_key_value[1] - - past_key = past_key[:, :, slicing_tokens:, :] - past_value = past_value[:, :, slicing_tokens:, :] - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key much have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - past_key_value = (past_key, past_value) - - if padding_mask is not None: - padding_mask = padding_mask[:, slicing_tokens:] - padding_mask = paddle.concat([padding_mask, paddle.ones_like(padding_mask[:, -1:])], axis=-1) - - key_states = paddle.concat([past_key_value[0], key_states], axis=2) - value_states = paddle.concat([past_key_value[1], value_states], axis=2) - - past_key_value = (key_states, value_states) if use_cache else None - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - # TODO: Mistral does not have dropout in the config?? - # It is recommended to use dropout with FA according to the docs - # when training. 
- dropout_rate = 0.0 # if not self.training else self.attn_dropout - - # Reashape to the expected shape for Flash Attention - query_states = query_states.transpose([0, 2, 1, 3]) - key_states = key_states.transpose([0, 2, 1, 3]) - value_states = value_states.transpose([0, 2, 1, 3]) - - attn_output = self._flash_attention_forward( - query_states, - key_states, - value_states, - padding_mask, - q_len, - dropout=dropout_rate, - use_sliding_windows=use_sliding_windows, - ) - - attn_output = attn_output.reshape([bsz, q_len, self.hidden_size]) - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - def _flash_attention_forward( - self, - query_states, - key_states, - value_states, - padding_mask, - query_length, - dropout=0.0, - softmax_scale=None, - use_sliding_windows=False, - ): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. - - Args: - query_states (`paddle.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`paddle.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`paddle.Tensor`): - Input value states to be passed to Flash Attention API - padding_mask (`paddle.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`int`, *optional*): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) - use_sliding_windows (`bool`, *optional*): - Whether to activate sliding window attention. 
- """ - # Contains at least one padding token in the sequence - if padding_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, padding_mask, query_length - ) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - if not use_sliding_windows: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=True, - ) - else: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=True, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) - else: - if not use_sliding_windows: - attn_output = flash_attn_func( - query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=True - ) - else: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=True, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - return attn_output - - def _upad_input(self, query_layer, key_layer, value_layer, padding_mask, query_length): - batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape - - # On the first iteration we need to properly re-create the padding mask - # by slicing it on the proper place - if kv_seq_len != padding_mask.shape[-1]: - padding_mask_num_tokens = padding_mask.shape[-1] - padding_mask = padding_mask[:, padding_mask_num_tokens - kv_seq_len :] - - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(padding_mask) - - key_layer = index_first_axis(key_layer.reshape([batch_size * kv_seq_len, num_heads, head_dim]), indices_k) - value_layer = index_first_axis(value_layer.reshape([batch_size * kv_seq_len, num_heads, head_dim]), indices_k) - - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape([batch_size * kv_seq_len, num_heads, head_dim]), indices_k - ) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = paddle.arange( - batch_size + 1, dtype=paddle.int32 - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. - padding_mask = padding_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, padding_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) +#class MistralFlashAttention2(MistralAttention): +# """ +# Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays +# untouched. 
The only required change would be on the forward pass where it needs to correctly call the public API of +# flash attention and deal with padding tokens in case the input contains any of them. +# """ +# +# def forward( +# self, +# hidden_states: paddle.Tensor, +# attention_mask: Optional[paddle.Tensor] = None, +# position_ids: Optional[paddle.Tensor] = None, +# past_key_value: Optional[Tuple[paddle.Tensor]] = None, +# output_attentions: bool = False, +# use_cache: bool = False, +# padding_mask: Optional[paddle.Tensor] = None, +# ): +# bsz, q_len, _ = hidden_states.shape +# +# query_states = self.q_proj(hidden_states) +# key_states = self.k_proj(hidden_states) +# value_states = self.v_proj(hidden_states) +# +# query_states = query_states.reshape([bsz, q_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) +# key_states = key_states.reshape([bsz, q_len, self.num_key_value_heads, self.head_dim]).transpose([0, 2, 1, 3]) +# value_states = value_states.reshape([bsz, q_len, self.num_key_value_heads, self.head_dim]).transpose([0, 2, 1, 3]) +# +# kv_seq_len = key_states.shape[-2] +# if past_key_value is not None: +# kv_seq_len += past_key_value[0].shape[-2] +# +# # Because the input can be padded, the absolute sequence length depends on the max position id. +# rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 +# cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) +# +# query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) +# +# use_sliding_windows = ( +# _flash_supports_window_size +# and hasattr(self.config, "sliding_window") is not None +# and kv_seq_len > self.config.sliding_window +# ) +# +# if not _flash_supports_window_size: +# logger.warning_once( +# "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" +# " make sure to upgrade flash-attn library." +# ) +# +# if past_key_value is not None: +# # Activate slicing cache only if the config has a value `sliding_windows` attribute +# if hasattr(self.config, "sliding_window") and kv_seq_len > self.config.sliding_window: +# slicing_tokens = kv_seq_len - self.config.sliding_window +# +# past_key = past_key_value[0] +# past_value = past_key_value[1] +# +# past_key = past_key[:, :, slicing_tokens:, :] +# past_value = past_value[:, :, slicing_tokens:, :] +# +# if past_key.shape[-2] != self.config.sliding_window - 1: +# raise ValueError( +# f"past key much have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" +# f" {past_key.shape}" +# ) +# +# past_key_value = (past_key, past_value) +# +# if padding_mask is not None: +# padding_mask = padding_mask[:, slicing_tokens:] +# padding_mask = paddle.concat([padding_mask, paddle.ones_like(padding_mask[:, -1:])], axis=-1) +# +# key_states = paddle.concat([past_key_value[0], key_states], axis=2) +# value_states = paddle.concat([past_key_value[1], value_states], axis=2) +# +# past_key_value = (key_states, value_states) if use_cache else None +# +# # repeat k/v heads if n_kv_heads < n_heads +# key_states = repeat_kv(key_states, self.num_key_value_groups) +# value_states = repeat_kv(value_states, self.num_key_value_groups) +# +# # TODO: Mistral does not have dropout in the config?? +# # It is recommended to use dropout with FA according to the docs +# # when training. 
+# dropout_rate = 0.0 # if not self.training else self.attn_dropout +# +# # Reashape to the expected shape for Flash Attention +# query_states = query_states.transpose([0, 2, 1, 3]) +# key_states = key_states.transpose([0, 2, 1, 3]) +# value_states = value_states.transpose([0, 2, 1, 3]) +# +# attn_output = self._flash_attention_forward( +# query_states, +# key_states, +# value_states, +# padding_mask, +# q_len, +# dropout=dropout_rate, +# use_sliding_windows=use_sliding_windows, +# ) +# +# attn_output = attn_output.reshape([bsz, q_len, self.hidden_size]) +# attn_output = self.o_proj(attn_output) +# +# if not output_attentions: +# attn_weights = None +# +# return attn_output, attn_weights, past_key_value + + #def _flash_attention_forward( + # self, + # query_states, + # key_states, + # value_states, + # padding_mask, + # query_length, + # dropout=0.0, + # softmax_scale=None, + # use_sliding_windows=False, + #): + # """ + # calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + # first unpad the input, then computes the attention scores and pad the final attention scores. + + # args: + # query_states (`paddle.Tensor`): + # Input query states to be passed to Flash Attention API + # key_states (`paddle.Tensor`): + # Input key states to be passed to Flash Attention API + # value_states (`paddle.Tensor`): + # Input value states to be passed to Flash Attention API + # padding_mask (`paddle.Tensor`): + # The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + # position of padding tokens and 1 for the position of non-padding tokens. + # dropout (`int`, *optional*): + # Attention dropout + # softmax_scale (`float`, *optional*): + # The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + # use_sliding_windows (`bool`, *optional*): + # Whether to activate sliding window attention. 
+ # """ + # # Contains at least one padding token in the sequence + # if padding_mask is not None: + # batch_size = query_states.shape[0] + # query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + # query_states, key_states, value_states, padding_mask, query_length + # ) + + # cu_seqlens_q, cu_seqlens_k = cu_seq_lens + # max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + # if not use_sliding_windows: + # attn_output_unpad = flash_attn_varlen_func( + # query_states, + # key_states, + # value_states, + # cu_seqlens_q=cu_seqlens_q, + # cu_seqlens_k=cu_seqlens_k, + # max_seqlen_q=max_seqlen_in_batch_q, + # max_seqlen_k=max_seqlen_in_batch_k, + # dropout_p=dropout, + # softmax_scale=softmax_scale, + # causal=True, + # ) + # else: + # attn_output_unpad = flash_attn_varlen_func( + # query_states, + # key_states, + # value_states, + # cu_seqlens_q=cu_seqlens_q, + # cu_seqlens_k=cu_seqlens_k, + # max_seqlen_q=max_seqlen_in_batch_q, + # max_seqlen_k=max_seqlen_in_batch_k, + # dropout_p=dropout, + # softmax_scale=softmax_scale, + # causal=True, + # window_size=(self.config.sliding_window, self.config.sliding_window), + # ) + + # attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + # else: + # if not use_sliding_windows: + # attn_output = flash_attn_func( + # query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=True + # ) + # else: + # attn_output = flash_attn_func( + # query_states, + # key_states, + # value_states, + # dropout, + # softmax_scale=softmax_scale, + # causal=True, + # window_size=(self.config.sliding_window, self.config.sliding_window), + # ) + + # return attn_output + + #def _upad_input(self, query_layer, key_layer, value_layer, padding_mask, query_length): + # batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + # # On the first iteration we need to properly re-create the padding mask + # # by slicing it on the proper place + # if kv_seq_len != padding_mask.shape[-1]: + # padding_mask_num_tokens = padding_mask.shape[-1] + # padding_mask = padding_mask[:, padding_mask_num_tokens - kv_seq_len :] + + # indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(padding_mask) + + # key_layer = index_first_axis(key_layer.reshape([batch_size * kv_seq_len, num_heads, head_dim]), indices_k) + # value_layer = index_first_axis(value_layer.reshape([batch_size * kv_seq_len, num_heads, head_dim]), indices_k) + + # if query_length == kv_seq_len: + # query_layer = index_first_axis( + # query_layer.reshape([batch_size * kv_seq_len, num_heads, head_dim]), indices_k + # ) + # cu_seqlens_q = cu_seqlens_k + # max_seqlen_in_batch_q = max_seqlen_in_batch_k + # indices_q = indices_k + # elif query_length == 1: + # max_seqlen_in_batch_q = 1 + # cu_seqlens_q = paddle.arange( + # batch_size + 1, dtype=paddle.int32 + # ) # There is a memcpy here, that is very bad. + # indices_q = cu_seqlens_q[:-1] + # query_layer = query_layer.squeeze(1) + # else: + # # The -q_len: slice assumes left padding. 
+ # padding_mask = padding_mask[:, -query_length:] + # query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, padding_mask) + + # return ( + # query_layer, + # key_layer, + # value_layer, + # indices_q, + # (cu_seqlens_q, cu_seqlens_k), + # (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + # ) class MistralDecoderLayer(nn.Layer): @@ -654,8 +668,8 @@ def __init__(self, config: MistralConfig): self.hidden_size = config.hidden_size self.self_attn = ( MistralAttention(config=config) - if not getattr(config, "_flash_attn_2_enabled", False) - else MistralFlashAttention2(config) + #if not getattr(config, "_flash_attn_2_enabled", False) + #else MistralFlashAttention2(config) ) self.mlp = MistralMLP(config) self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) From a5d3603e64b3a28c8cb3ca875c993626c703d0d9 Mon Sep 17 00:00:00 2001 From: Ting Liu Date: Mon, 13 Nov 2023 17:34:52 +0800 Subject: [PATCH 07/24] mistral sliding windows attention (attention_mask version) --- paddlenlp/transformers/mistral/__init__.py | 2 +- paddlenlp/transformers/mistral/modeling.py | 389 +++++---------------- 2 files changed, 87 insertions(+), 304 deletions(-) diff --git a/paddlenlp/transformers/mistral/__init__.py b/paddlenlp/transformers/mistral/__init__.py index 2e538f5c59f3..0b41cc3d8c54 100644 --- a/paddlenlp/transformers/mistral/__init__.py +++ b/paddlenlp/transformers/mistral/__init__.py @@ -11,5 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .modeling import MistralForCausalLM from .configuration import MistralConfig +from .modeling import MistralForCausalLM diff --git a/paddlenlp/transformers/mistral/modeling.py b/paddlenlp/transformers/mistral/modeling.py index 8e90be58111a..ec83e184ac12 100644 --- a/paddlenlp/transformers/mistral/modeling.py +++ b/paddlenlp/transformers/mistral/modeling.py @@ -11,17 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import inspect import math from functools import partial from typing import List, Optional, Tuple, Union import paddle +import paddle.distributed.fleet.meta_parallel as mpu import paddle.nn.functional as F from paddle import nn -from paddle.distributed.fleet.utils import recompute -import paddle.distributed.fleet.meta_parallel as mpu from paddle.distributed import fleet +from paddle.distributed.fleet.utils import recompute from paddle.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from paddlenlp.transformers.conversion_utils import ( @@ -29,41 +28,18 @@ init_name_mappings, ) from paddlenlp.utils.log import logger -#i=0 from ..activations import ACT2FN from ..model_outputs import ( BaseModelOutputWithPast, + CausalLMOutputWithCrossAttentions, CausalLMOutputWithPast, SequenceClassifierOutputWithPast, - CausalLMOutputWithCrossAttentions, ) from ..model_utils import PretrainedModel - -# from ...utils import ( -# add_start_docstrings, -# add_start_docstrings_to_model_forward, -# is_flash_attn_2_available, -# logging, -# replace_return_docstrings, -# ) from .configuration import MistralConfig -def is_flash_attn_2_available(): - return False - - -if is_flash_attn_2_available(): - from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa - - _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) - -_CONFIG_FOR_DOC = "MistralConfig" - - -# Copied from transformers.models.llama.modeling_llama._get_unpad_data def _get_unpad_data(padding_mask): seqlens_in_batch = padding_mask.sum(axis=-1, dtype=paddle.int32) indices = paddle.nonzero(padding_mask.flatten(), as_tuple=False).flatten() @@ -76,7 +52,7 @@ def _get_unpad_data(padding_mask): ) -def _make_sliding_window_causal_mask( +def _make_causal_mask( input_ids_shape: paddle.shape, dtype: paddle.dtype, past_key_values_length: int = 0, @@ -101,6 +77,40 @@ def _make_sliding_window_causal_mask( return mask[None, None, :, :].expand([bsz, 1, tgt_len, tgt_len + past_key_values_length]) +def _make_sliding_window_causal_mask( + input_ids_shape: paddle.shape, + dtype: paddle.dtype, + past_key_values_length: int = 0, + sliding_window: int = 4096, +): + """ + Make causal mask used for sliding window attention + """ + bsz, tgt_len = input_ids_shape + mask = paddle.full( + (tgt_len, tgt_len), + fill_value=0.0, + dtype="float32", + ) + if past_key_values_length > 0: + mask = paddle.concat([paddle.zeros([tgt_len, past_key_values_length], dtype="float32"), mask], axis=-1) + + # make sliding window mask + # note that: this computation of SWA only modify the mask + # to imitate sliding window which has same time complexity + # with normal attention calculation, just for test. + for qidx in range(tgt_len): + q_causal_start = past_key_values_length + qidx - sliding_window + q_causal_end = q_causal_start + sliding_window + q_causal_start = max(0, q_causal_start) + # paddle do not support index operation on bfloat16 tensor temporary + mask[qidx, q_causal_start : q_causal_end + 1] = 1.0 + + mask = paddle.log(mask).astype(dtype) + + return mask[None, None, :, :].expand([bsz, 1, tgt_len, tgt_len + past_key_values_length]) + + def _expand_mask(mask: paddle.Tensor, dtype: paddle.dtype, tgt_len: Optional[int] = None): """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
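A note on the hunk above: the comment block inside `_make_sliding_window_causal_mask` says this version only rewrites the attention mask to imitate sliding-window attention, so it keeps the time and memory cost of full attention and is intended for correctness testing. As a reading aid, here is a minimal sketch of the same banding rule, using NumPy and toy sizes chosen purely for illustration (not part of the patch): the query at absolute position `past_key_values_length + qidx` may attend to itself and to at most `sliding_window` earlier key positions, and the real code then takes `paddle.log` of the 0/1 band so that disallowed positions become a large negative additive bias.

# Illustrative sketch only: reproduces the band pattern built by the loop in
# _make_sliding_window_causal_mask, with toy sizes and NumPy instead of paddle.
import numpy as np

def sliding_window_band(tgt_len, past_len, window):
    # 1.0 marks key positions a query may attend to; the modeling code later
    # applies log() so the zeros turn into -inf in the additive mask.
    mask = np.zeros((tgt_len, past_len + tgt_len), dtype="float32")
    for qidx in range(tgt_len):
        start = past_len + qidx - window      # same arithmetic as the patch
        end = start + window                  # inclusive end == current position
        start = max(0, start)
        mask[qidx, start : end + 1] = 1.0
    return mask

if __name__ == "__main__":
    # 4 new tokens, 3 cached tokens, window of 2: every row sees itself plus
    # at most the two previous key positions.
    print(sliding_window_band(tgt_len=4, past_len=3, window=2))
    # [[0. 1. 1. 1. 0. 0. 0.]
    #  [0. 0. 1. 1. 1. 0. 0.]
    #  [0. 0. 0. 1. 1. 1. 0.]
    #  [0. 0. 0. 0. 1. 1. 1.]]

This mirrors how the non-flash attention path simply adds the mask to the raw attention scores before the softmax.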
@@ -144,12 +154,10 @@ def __init__(self, dim, max_position_embeddings=2048, base=10000): self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base - self.inv_freq = 1.0 / (self.base ** (paddle.arange(0, self.dim, 2).astype('float32') / self.dim)) + self.inv_freq = 1.0 / (self.base ** (paddle.arange(0, self.dim, 2).astype("float32") / self.dim)) # Build here to make `paddle.jit.trace` work. - self._set_cos_sin_cache( - seq_len=max_position_embeddings, dtype=paddle.get_default_dtype() - ) + self._set_cos_sin_cache(seq_len=max_position_embeddings, dtype=paddle.get_default_dtype()) def _set_cos_sin_cache(self, seq_len, dtype): self.max_seq_len_cached = seq_len @@ -337,19 +345,21 @@ def forward( padding_mask: Optional[paddle.Tensor] = None, ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: bsz, q_len, _ = hidden_states.shape - #global i - #import numpy as np; np.save('hs_{}'.format(i), hidden_states.astype('float32').numpy()) + # global i + # import numpy as np; np.save('hs_{}'.format(i), hidden_states.astype('float32').numpy()) query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) - #import numpy as np; np.save('q_{}'.format(i), query_states.astype('float32').numpy()) - #import numpy as np; np.save('k_{}'.format(i), key_states.astype('float32').numpy()) - #import numpy as np; np.save('v_{}'.format(i), value_states.astype('float32').numpy()) + # import numpy as np; np.save('q_{}'.format(i), query_states.astype('float32').numpy()) + # import numpy as np; np.save('k_{}'.format(i), key_states.astype('float32').numpy()) + # import numpy as np; np.save('v_{}'.format(i), value_states.astype('float32').numpy()) query_states = query_states.reshape([bsz, q_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) key_states = key_states.reshape([bsz, q_len, self.num_key_value_heads, self.head_dim]).transpose([0, 2, 1, 3]) - value_states = value_states.reshape([bsz, q_len, self.num_key_value_heads, self.head_dim]).transpose([0, 2, 1, 3]) + value_states = value_states.reshape([bsz, q_len, self.num_key_value_heads, self.head_dim]).transpose( + [0, 2, 1, 3] + ) kv_seq_len = key_states.shape[-2] if past_key_value is not None: @@ -370,7 +380,7 @@ def forward( if not self.use_flash_attention: attn_weights = paddle.matmul(query_states, key_states.transpose([0, 1, 3, 2])) / math.sqrt(self.head_dim) - #import numpy as np; np.save('aw_{}'.format(i), attn_weights.astype('float32').numpy()) + # import numpy as np; np.save('aw_{}'.format(i), attn_weights.astype('float32').numpy()) if attn_weights.shape != [bsz, self.num_heads, q_len, kv_seq_len]: raise ValueError( @@ -387,12 +397,15 @@ def forward( attn_weights = attn_weights + attention_mask # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, axis=-1, dtype=paddle.float32).astype(query_states.dtype) + attn_weights = nn.functional.softmax(attn_weights, axis=-1, dtype=paddle.float32).astype( + query_states.dtype + ) attn_output = paddle.matmul(attn_weights, value_states) else: - query_states = query_states.transpose([0,2,1,3]) - key_states = key_states.transpose([0,2,1,3]) - value_states = value_states.transpose([0,2,1,3]) + query_states = query_states.transpose([0, 2, 1, 3]) + key_states = key_states.transpose([0, 2, 1, 3]) + value_states = value_states.transpose([0, 2, 1, 3]) + # print(attention_mask) attn_output = F.scaled_dot_product_attention( query_states, key_states, @@ -400,7 +413,7 @@ def forward( 
attn_mask=attention_mask, is_causal=attention_mask is None, ) - attn_output = attn_output.transpose([0,2,1,3]) + attn_output = attn_output.transpose([0, 2, 1, 3]) if attn_output.shape != [bsz, self.num_heads, q_len, self.head_dim]: raise ValueError( @@ -412,8 +425,8 @@ def forward( attn_output = attn_output.reshape([bsz, q_len, self.num_heads * self.head_dim]) attn_output = self.o_proj(attn_output) - #import numpy as np; np.save('ao_{}'.format(i), attn_output.astype('float32').numpy()) - #i += 1 + # import numpy as np; np.save('ao_{}'.format(i), attn_output.astype('float32').numpy()) + # i += 1 if not output_attentions: attn_weights = None @@ -421,255 +434,14 @@ def forward( return attn_output, attn_weights, past_key_value -#class MistralFlashAttention2(MistralAttention): -# """ -# Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays -# untouched. The only required change would be on the forward pass where it needs to correctly call the public API of -# flash attention and deal with padding tokens in case the input contains any of them. -# """ -# -# def forward( -# self, -# hidden_states: paddle.Tensor, -# attention_mask: Optional[paddle.Tensor] = None, -# position_ids: Optional[paddle.Tensor] = None, -# past_key_value: Optional[Tuple[paddle.Tensor]] = None, -# output_attentions: bool = False, -# use_cache: bool = False, -# padding_mask: Optional[paddle.Tensor] = None, -# ): -# bsz, q_len, _ = hidden_states.shape -# -# query_states = self.q_proj(hidden_states) -# key_states = self.k_proj(hidden_states) -# value_states = self.v_proj(hidden_states) -# -# query_states = query_states.reshape([bsz, q_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) -# key_states = key_states.reshape([bsz, q_len, self.num_key_value_heads, self.head_dim]).transpose([0, 2, 1, 3]) -# value_states = value_states.reshape([bsz, q_len, self.num_key_value_heads, self.head_dim]).transpose([0, 2, 1, 3]) -# -# kv_seq_len = key_states.shape[-2] -# if past_key_value is not None: -# kv_seq_len += past_key_value[0].shape[-2] -# -# # Because the input can be padded, the absolute sequence length depends on the max position id. -# rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 -# cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) -# -# query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) -# -# use_sliding_windows = ( -# _flash_supports_window_size -# and hasattr(self.config, "sliding_window") is not None -# and kv_seq_len > self.config.sliding_window -# ) -# -# if not _flash_supports_window_size: -# logger.warning_once( -# "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" -# " make sure to upgrade flash-attn library." 
-# ) -# -# if past_key_value is not None: -# # Activate slicing cache only if the config has a value `sliding_windows` attribute -# if hasattr(self.config, "sliding_window") and kv_seq_len > self.config.sliding_window: -# slicing_tokens = kv_seq_len - self.config.sliding_window -# -# past_key = past_key_value[0] -# past_value = past_key_value[1] -# -# past_key = past_key[:, :, slicing_tokens:, :] -# past_value = past_value[:, :, slicing_tokens:, :] -# -# if past_key.shape[-2] != self.config.sliding_window - 1: -# raise ValueError( -# f"past key much have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" -# f" {past_key.shape}" -# ) -# -# past_key_value = (past_key, past_value) -# -# if padding_mask is not None: -# padding_mask = padding_mask[:, slicing_tokens:] -# padding_mask = paddle.concat([padding_mask, paddle.ones_like(padding_mask[:, -1:])], axis=-1) -# -# key_states = paddle.concat([past_key_value[0], key_states], axis=2) -# value_states = paddle.concat([past_key_value[1], value_states], axis=2) -# -# past_key_value = (key_states, value_states) if use_cache else None -# -# # repeat k/v heads if n_kv_heads < n_heads -# key_states = repeat_kv(key_states, self.num_key_value_groups) -# value_states = repeat_kv(value_states, self.num_key_value_groups) -# -# # TODO: Mistral does not have dropout in the config?? -# # It is recommended to use dropout with FA according to the docs -# # when training. -# dropout_rate = 0.0 # if not self.training else self.attn_dropout -# -# # Reashape to the expected shape for Flash Attention -# query_states = query_states.transpose([0, 2, 1, 3]) -# key_states = key_states.transpose([0, 2, 1, 3]) -# value_states = value_states.transpose([0, 2, 1, 3]) -# -# attn_output = self._flash_attention_forward( -# query_states, -# key_states, -# value_states, -# padding_mask, -# q_len, -# dropout=dropout_rate, -# use_sliding_windows=use_sliding_windows, -# ) -# -# attn_output = attn_output.reshape([bsz, q_len, self.hidden_size]) -# attn_output = self.o_proj(attn_output) -# -# if not output_attentions: -# attn_weights = None -# -# return attn_output, attn_weights, past_key_value - - #def _flash_attention_forward( - # self, - # query_states, - # key_states, - # value_states, - # padding_mask, - # query_length, - # dropout=0.0, - # softmax_scale=None, - # use_sliding_windows=False, - #): - # """ - # calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - # first unpad the input, then computes the attention scores and pad the final attention scores. - - # args: - # query_states (`paddle.Tensor`): - # Input query states to be passed to Flash Attention API - # key_states (`paddle.Tensor`): - # Input key states to be passed to Flash Attention API - # value_states (`paddle.Tensor`): - # Input value states to be passed to Flash Attention API - # padding_mask (`paddle.Tensor`): - # The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - # position of padding tokens and 1 for the position of non-padding tokens. - # dropout (`int`, *optional*): - # Attention dropout - # softmax_scale (`float`, *optional*): - # The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) - # use_sliding_windows (`bool`, *optional*): - # Whether to activate sliding window attention. 
- # """ - # # Contains at least one padding token in the sequence - # if padding_mask is not None: - # batch_size = query_states.shape[0] - # query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - # query_states, key_states, value_states, padding_mask, query_length - # ) - - # cu_seqlens_q, cu_seqlens_k = cu_seq_lens - # max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - # if not use_sliding_windows: - # attn_output_unpad = flash_attn_varlen_func( - # query_states, - # key_states, - # value_states, - # cu_seqlens_q=cu_seqlens_q, - # cu_seqlens_k=cu_seqlens_k, - # max_seqlen_q=max_seqlen_in_batch_q, - # max_seqlen_k=max_seqlen_in_batch_k, - # dropout_p=dropout, - # softmax_scale=softmax_scale, - # causal=True, - # ) - # else: - # attn_output_unpad = flash_attn_varlen_func( - # query_states, - # key_states, - # value_states, - # cu_seqlens_q=cu_seqlens_q, - # cu_seqlens_k=cu_seqlens_k, - # max_seqlen_q=max_seqlen_in_batch_q, - # max_seqlen_k=max_seqlen_in_batch_k, - # dropout_p=dropout, - # softmax_scale=softmax_scale, - # causal=True, - # window_size=(self.config.sliding_window, self.config.sliding_window), - # ) - - # attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) - # else: - # if not use_sliding_windows: - # attn_output = flash_attn_func( - # query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=True - # ) - # else: - # attn_output = flash_attn_func( - # query_states, - # key_states, - # value_states, - # dropout, - # softmax_scale=softmax_scale, - # causal=True, - # window_size=(self.config.sliding_window, self.config.sliding_window), - # ) - - # return attn_output - - #def _upad_input(self, query_layer, key_layer, value_layer, padding_mask, query_length): - # batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape - - # # On the first iteration we need to properly re-create the padding mask - # # by slicing it on the proper place - # if kv_seq_len != padding_mask.shape[-1]: - # padding_mask_num_tokens = padding_mask.shape[-1] - # padding_mask = padding_mask[:, padding_mask_num_tokens - kv_seq_len :] - - # indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(padding_mask) - - # key_layer = index_first_axis(key_layer.reshape([batch_size * kv_seq_len, num_heads, head_dim]), indices_k) - # value_layer = index_first_axis(value_layer.reshape([batch_size * kv_seq_len, num_heads, head_dim]), indices_k) - - # if query_length == kv_seq_len: - # query_layer = index_first_axis( - # query_layer.reshape([batch_size * kv_seq_len, num_heads, head_dim]), indices_k - # ) - # cu_seqlens_q = cu_seqlens_k - # max_seqlen_in_batch_q = max_seqlen_in_batch_k - # indices_q = indices_k - # elif query_length == 1: - # max_seqlen_in_batch_q = 1 - # cu_seqlens_q = paddle.arange( - # batch_size + 1, dtype=paddle.int32 - # ) # There is a memcpy here, that is very bad. - # indices_q = cu_seqlens_q[:-1] - # query_layer = query_layer.squeeze(1) - # else: - # # The -q_len: slice assumes left padding. 
- # padding_mask = padding_mask[:, -query_length:] - # query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, padding_mask) - - # return ( - # query_layer, - # key_layer, - # value_layer, - # indices_q, - # (cu_seqlens_q, cu_seqlens_k), - # (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - # ) - - class MistralDecoderLayer(nn.Layer): def __init__(self, config: MistralConfig): super().__init__() self.hidden_size = config.hidden_size self.self_attn = ( MistralAttention(config=config) - #if not getattr(config, "_flash_attn_2_enabled", False) - #else MistralFlashAttention2(config) + # if not getattr(config, "_flash_attn_2_enabled", False) + # else MistralFlashAttention2(config) ) self.mlp = MistralMLP(config) self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -764,8 +536,8 @@ def _get_name_mappings(cls, config: MistralConfig) -> list[StateDictNameMapping] init_name_mappings(mappings=model_mappings) # base-model prefix "LlamaModel" for mapping in model_mappings: - mapping[0] = "model." + mapping[0] - mapping[1] = "model." + mapping[1] + mapping[0] = "model." + mapping[0] + mapping[1] = "model." + mapping[1] if "MistralModel" not in config.architectures: model_mappings.append(["lm_head.weight", "lm_head.weight", "transpose"]) @@ -853,6 +625,7 @@ def _init_weights(self, layer): factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) layer.o_proj.weight.scale_(factor) + class MistralModel(MistralPreTrainedModel): """ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MistralDecoderLayer`] @@ -894,14 +667,23 @@ def _prepare_decoder_attention_mask( ): # create causal mask # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - combined_attention_mask = None - if input_shape[-1] > 1: + + q_length = input_shape[-1] + kv_length = q_length + past_key_values_length + if kv_length > sliding_window: combined_attention_mask = _make_sliding_window_causal_mask( input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length, sliding_window=sliding_window, ) + else: + combined_attention_mask = _make_causal_mask( + input_shape, + inputs_embeds.dtype, + past_key_values_length=past_key_values_length, + sliding_window=sliding_window, + ) if attention_mask is not None: # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] @@ -955,7 +737,7 @@ def forward( ) position_ids = position_ids.unsqueeze(0).reshape([-1, seq_length]) else: - position_ids = position_ids.reshape([-1, seq_length]).astype('int64') + position_ids = position_ids.reshape([-1, seq_length]).astype("int64") if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) @@ -964,9 +746,7 @@ def forward( # embed positions if attention_mask is None: - attention_mask = paddle.ones( - (batch_size, seq_length_with_past), dtype=paddle.bool - ) + attention_mask = paddle.ones((batch_size, seq_length_with_past), dtype=paddle.bool) elif paddle.any(attention_mask == 0): padding_mask = attention_mask @@ -1063,6 +843,7 @@ def custom_forward(*inputs): attentions=all_self_attns, ) + def parallel_matmul(x: paddle.Tensor, y: paddle.Tensor, tensor_parallel_output=True): is_fleet_init = True tensor_parallel_degree = 1 @@ -1118,6 +899,7 @@ def forward(self, hidden_states, tensor_parallel_output=None): logits = parallel_matmul(hidden_states, self.weight, tensor_parallel_output=tensor_parallel_output) return logits + class MistralForCausalLM(MistralPreTrainedModel): _tied_weights_keys = ["lm_head.weight"] @@ -1125,7 +907,7 @@ def __init__(self, 
config): super().__init__(config) self.model = MistralModel(config) self.vocab_size = config.vocab_size - #self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias_attr=False) + # self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias_attr=False) self.lm_head = MistralLMHead(config) def get_input_embeddings(self): @@ -1254,14 +1036,14 @@ def forward( hidden_states = outputs[0] logits = self.lm_head(hidden_states) - logits = logits.astype('float32') - #import numpy as np; np.save('l', logits.astype('float32').numpy()) + logits = logits.astype("float32") + # import numpy as np; np.save('l', logits.astype('float32').numpy()) loss = None if labels is not None: loss_fct = CrossEntropyLoss() - #logits = logits.reshape([-1, self.config.vocab_size]) - #labels = labels.reshape([-1]) + # logits = logits.reshape([-1, self.config.vocab_size]) + # labels = labels.reshape([-1]) loss = loss_fct(logits, labels) if not return_dict: @@ -1276,6 +1058,7 @@ def forward( attentions=outputs.attentions, ) + class MistralForSequenceClassification(MistralPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1335,7 +1118,7 @@ def forward( sequence_lengths = -1 else: if input_ids is not None: - sequence_lengths = (paddle.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1) + sequence_lengths = paddle.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1 else: sequence_lengths = -1 From c97d0183fb3531125ef405a958e22d1502f90dba Mon Sep 17 00:00:00 2001 From: Ting Liu Date: Mon, 13 Nov 2023 17:45:08 +0800 Subject: [PATCH 08/24] llm json fix --- llm/mistral/lora_argument.json | 10 ++++------ llm/mistral/pt_argument.json | 2 +- llm/mistral/sft_argument.json | 2 +- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/llm/mistral/lora_argument.json b/llm/mistral/lora_argument.json index f0c5d0175817..ccb3af015cba 100644 --- a/llm/mistral/lora_argument.json +++ b/llm/mistral/lora_argument.json @@ -1,6 +1,6 @@ { - "model_name_or_path": "/root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/5e9c98b96d071dce59368012254c55b0ec6f8658/", - "dataset_name_or_path": "./data1", + "model_name_or_path": "mistralai/Mistral-7B-v0.1", + "dataset_name_or_path": "./data", "output_dir": "./checkpoints/mistral_lora_ckpts", "per_device_train_batch_size": 4, "gradient_accumulation_steps": 4, @@ -10,10 +10,8 @@ "learning_rate": 3e-04, "warmup_steps": 30, "logging_steps": 1, - "evaluation_strategy": "steps", - "save_strategy": "steps", - "eval_steps": 20, - "save_steps": 60, + "evaluation_strategy": "epoch", + "save_strategy": "epoch", "src_length": 1024, "max_length": 2048, "fp16": true, diff --git a/llm/mistral/pt_argument.json b/llm/mistral/pt_argument.json index eceb0f5768d1..750293f2f645 100644 --- a/llm/mistral/pt_argument.json +++ b/llm/mistral/pt_argument.json @@ -1,5 +1,5 @@ { - "model_name_or_path": "/root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/5e9c98b96d071dce59368012254c55b0ec6f8658/", + "model_name_or_path": "mistralai/Mistral-7B-v0.1", "dataset_name_or_path": "./data", "output_dir": "./checkpoints/mistral_pt_ckpts", "per_device_train_batch_size": 4, diff --git a/llm/mistral/sft_argument.json b/llm/mistral/sft_argument.json index fca6739b4af3..2a9b8b42cc26 100644 --- a/llm/mistral/sft_argument.json +++ b/llm/mistral/sft_argument.json @@ -1,5 +1,5 @@ { - "model_name_or_path": "/root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/5e9c98b96d071dce59368012254c55b0ec6f8658/", + 
"model_name_or_path": "mistralai/Mistral-7B-v0.1", "dataset_name_or_path": "./data", "output_dir": "./checkpoints/mistral_sft_ckpts", "per_device_train_batch_size": 4, From 8160bbbbbf1ec5e8d5f694d75529355f57035ea1 Mon Sep 17 00:00:00 2001 From: Ting Liu Date: Mon, 13 Nov 2023 17:57:01 +0800 Subject: [PATCH 09/24] bug fix --- paddlenlp/transformers/mistral/modeling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlenlp/transformers/mistral/modeling.py b/paddlenlp/transformers/mistral/modeling.py index ec83e184ac12..c1ebd6103505 100644 --- a/paddlenlp/transformers/mistral/modeling.py +++ b/paddlenlp/transformers/mistral/modeling.py @@ -513,8 +513,8 @@ class MistralPreTrainedModel(PretrainedModel): _supports_flash_attn_2 = True @classmethod - def _get_name_mappings(cls, config: MistralConfig) -> list[StateDictNameMapping]: - mappings: list[StateDictNameMapping] = [] + def _get_name_mappings(cls, config: MistralConfig) -> List[StateDictNameMapping]: + mappings: List[StateDictNameMapping] = [] model_mappings = [ ["embed_tokens.weight"], ["norm.weight"], From 4baa81bc63fb83e7a4cb07544763d62cd8d4361d Mon Sep 17 00:00:00 2001 From: Ting Liu Date: Thu, 16 Nov 2023 17:33:45 +0800 Subject: [PATCH 10/24] ci code save --- paddlenlp/transformers/mistral/modeling.py | 8 +- tests/transformers/mistral/test_modeling.py | 484 ++++++++++++++++++++ 2 files changed, 488 insertions(+), 4 deletions(-) create mode 100644 tests/transformers/mistral/test_modeling.py diff --git a/paddlenlp/transformers/mistral/modeling.py b/paddlenlp/transformers/mistral/modeling.py index c1ebd6103505..b7505f34cda5 100644 --- a/paddlenlp/transformers/mistral/modeling.py +++ b/paddlenlp/transformers/mistral/modeling.py @@ -733,7 +733,7 @@ def forward( if position_ids is None: position_ids = paddle.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=paddle.long + past_key_values_length, seq_length + past_key_values_length, dtype=paddle.int64 ) position_ids = position_ids.unsqueeze(0).reshape([-1, seq_length]) else: @@ -905,7 +905,7 @@ class MistralForCausalLM(MistralPreTrainedModel): def __init__(self, config): super().__init__(config) - self.model = MistralModel(config) + self.mistral = MistralModel(config) self.vocab_size = config.vocab_size # self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias_attr=False) self.lm_head = MistralLMHead(config) @@ -1022,7 +1022,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( + outputs = self.mistral( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, @@ -1129,7 +1129,7 @@ def forward( if self.config.problem_type is None: if self.num_labels == 1: self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == paddle.long or labels.dtype == paddle.int): + elif self.num_labels > 1 and (labels.dtype == paddle.int64 or labels.dtype == paddle.int): self.config.problem_type = "single_label_classification" else: self.config.problem_type = "multi_label_classification" diff --git a/tests/transformers/mistral/test_modeling.py b/tests/transformers/mistral/test_modeling.py new file mode 100644 index 000000000000..5a4b0316a154 --- /dev/null +++ b/tests/transformers/mistral/test_modeling.py @@ -0,0 +1,484 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2020 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import tempfile +import unittest + +import numpy as np +import paddle +from parameterized import parameterized + +from paddlenlp.transformers import MistralConfig, MistralForCausalLM, MistralModel +from tests.testing_utils import require_package, slow +from tests.transformers.test_configuration_common import ConfigTester +from tests.transformers.test_generation_utils import GenerationTesterMixin +from tests.transformers.test_modeling_common import ( + GenerationD2STestMixin, + ModelTesterMixin, + ModelTesterPretrainedMixin, + ids_tensor, + random_attention_mask, +) + + +class MistralModelTester: + def __init__( + self, + parent, + vocab_size=32000, + hidden_size=64, + num_hidden_layers=2, + num_attention_heads=8, + masked_softmax_fusion=True, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + is_training=True, + use_cache=False, + bos_token_id=1, + eos_token_id=2, + apply_residual_connection_post_layernorm=False, + hidden_dropout=0.0, + attention_dropout=0.0, + attention_softmax_in_fp32=True, + pretraining_tp=1, # TP rank used when training with megatron + dtype="bfloat16", + slow_but_exact=False, + batch_size: int = 2, + seq_length: int = 10, + type_sequence_label_size=2, + activation_function="gelu", + num_labels=3, + num_choices=4, + scope=None, + dropout=0.56, + use_input_mask: bool = False, + use_labels: bool = False, + return_dict=False, + ): + self.parent: MistralModelTest = parent + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.masked_softmax_fusion = masked_softmax_fusion + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.is_training = is_training + self.use_cache = use_cache + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + self.attention_softmax_in_fp32 = attention_softmax_in_fp32 + self.pretraining_tp = pretraining_tp + self.dtype = dtype + self.slow_but_exact = slow_but_exact + + self.batch_size = batch_size + self.seq_length = seq_length + self.type_sequence_label_size = type_sequence_label_size + self.activation_function = activation_function + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + self.dropout = dropout + + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.return_dict = return_dict + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size, dtype=paddle.int64) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = 
ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self) -> MistralConfig: + return MistralConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + masked_softmax_fusion=self.masked_softmax_fusion, + layer_norm_epsilon=self.layer_norm_epsilon, + initializer_range=self.initializer_range, + use_cache=self.use_cache, + bos_token_id=self.bos_token_id, + eos_token_id=self.eos_token_id, + apply_residual_connection_post_layernorm=self.apply_residual_connection_post_layernorm, + hidden_dropout=self.hidden_dropout, + attention_dropout=self.attention_dropout, + attention_softmax_in_fp32=self.attention_softmax_in_fp32, + pretraining_tp=self.pretraining_tp, + dtype=self.dtype, + slow_but_exact=self.slow_but_exact, + activation_function=self.activation_function, + ) + + def create_and_check_model( + self, config: MistralConfig, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = MistralModel(config) + model.eval() + result = model(input_ids) + self.parent.assertEqual(result[0].shape, [self.batch_size, self.seq_length, self.hidden_size]) + + def create_and_check_model_attention_mask( + self, config: MistralConfig, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = MistralModel(config) + model.eval() + attn_mask_2d = random_attention_mask([self.batch_size, self.seq_length]) + result_2d = model(input_ids, attention_mask=attn_mask_2d)[0] + batch, seq_length = input_ids.shape + causal_mask = paddle.tril(paddle.ones((batch, seq_length, seq_length), dtype=attn_mask_2d.dtype)) + attn_mask_3d = causal_mask & attn_mask_2d.unsqueeze(-1) + result_3d = model(input_ids, attention_mask=attn_mask_3d)[0] + attn_mask_4d = attn_mask_3d.unsqueeze(1) + result_4d = model(input_ids, attention_mask=attn_mask_4d)[0] + result_no_attention_mask = model(input_ids, attention_mask=None)[0] + # Assert non-padding tokens have the same logits with different attention_mask shape + self.parent.assertTrue((result_2d[attn_mask_2d] == result_3d[attn_mask_2d]).all()) + self.parent.assertTrue((result_2d[attn_mask_2d] == result_4d[attn_mask_2d]).all()) + self.parent.assertTrue((result_2d[attn_mask_2d] == result_no_attention_mask[attn_mask_2d]).all()) + + def create_and_check_model_past_large_inputs( + self, + config: MistralConfig, + input_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ): + model = MistralModel(config) + model.eval() + + # first forward pass + outputs = model(input_ids, attention_mask=input_mask, use_cache=True, return_dict=self.return_dict) + past_key_values = outputs.past_key_values if self.return_dict else outputs[2] + + # create hypothetical multiple next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), self.vocab_size) + next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) + + # append to next input_ids and + next_input_ids = paddle.concat([input_ids, next_tokens], axis=-1) + next_attention_mask = paddle.concat([input_mask, next_mask], axis=-1) + + outputs = model( + next_input_ids, attention_mask=next_attention_mask, output_hidden_states=True, return_dict=self.return_dict + ) + + output_from_no_past = 
outputs[2][0] + + outputs = model( + next_tokens, + attention_mask=next_attention_mask, + past_key_values=past_key_values, + output_hidden_states=True, + return_dict=self.return_dict, + ) + + output_from_past = outputs[2][0] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(paddle.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + def create_and_check_lm_head_model(self, config, input_ids, input_mask, *args): + model = MistralForCausalLM(config) + model.eval() + + result = model( + input_ids, + use_cache=True, + labels=input_ids if self.parent.use_labels else None, + return_dict=self.parent.return_dict, + ) + if self.parent.use_labels: + self.parent.assertIsInstance(result[0].item(), float) + self.parent.assertEqual(result[1].shape, [self.batch_size, self.seq_length, self.vocab_size]) + else: + self.parent.assertEqual(result[0].shape, [self.batch_size, self.seq_length, self.vocab_size]) + + def check_model_position_ids(self, config, input_ids, input_mask, *args): + model = MistralForCausalLM(config) + model.eval() + + result_no_position_id = model( + input_ids, + labels=input_ids if self.parent.use_labels else None, + return_dict=self.parent.return_dict, + ) + batch_size, seq_len = input_ids.shape + position_ids = paddle.arange(seq_len).expand((batch_size, seq_len)) + result_position_id = model( + input_ids, + position_ids, + labels=input_ids if self.parent.use_labels else None, + return_dict=self.parent.return_dict, + ) + if self.parent.use_labels: + self.parent.assertTrue((result_position_id[1] == result_no_position_id[1]).all()) + else: + self.parent.assertTrue((result_position_id[0] == result_no_position_id[0]).all()) + + +class MistralModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + base_model_class = MistralModel + return_dict = False + use_labels = False + + all_model_classes = (MistralModel, MistralForCausalLM) + all_generative_model_classes = {MistralForCausalLM: (MistralModel, "Mistral")} + + def setUp(self): + super().setUp() + + self.model_tester = MistralModelTester(self) + self.config_tester = ConfigTester(self, config_class=MistralConfig, vocab_size=256, hidden_size=24) + + def _get_input_ids_and_config(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + input_ids = inputs_dict[self.input_name] + attention_mask = paddle.ones_like(input_ids, dtype=paddle.int64) + + max_batch_size = 2 + sequence_length = input_ids.shape[-1] // 2 + input_ids = input_ids[:max_batch_size, :sequence_length] + attention_mask = attention_mask[:max_batch_size, :sequence_length] + max_length = 3 + + return config, input_ids, attention_mask, max_length + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def 
test_model_attention_mask(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_attention_mask(*config_and_inputs) + + def test_model_position_ids(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_model_position_ids(*config_and_inputs) + + def test_generate_without_input_ids(self): + # this requires 4-D attention mask logic, which is not supported yet + pass + + def test_Mistral_lm_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_lm_head_model(*config_and_inputs) + + +class MistralModelIntegrationTest(ModelTesterPretrainedMixin, unittest.TestCase): + base_model_class = MistralModel + + @slow + def test_inference_no_attention(self): + model = MistralModel.from_pretrained("__internal_testing__/tiny-random-Mistral") + model.eval() + input_ids = paddle.to_tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) + attention_mask = paddle.to_tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) + with paddle.no_grad(): + output = model(input_ids, attention_mask=attention_mask)[0] + + expected_shape = [1, 11, 768] + self.assertEqual(output.shape, expected_shape) + + expected_slice = paddle.to_tensor( + [ + [ + [0.20443289, 0.18662477, -0.75216216], + [0.32803515, -0.36956733, -0.95613617], + [0.28622314, 0.07698685, -0.64143789], + ] + ] + ) + self.assertTrue(paddle.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4)) + + @slow + def test_inference_with_attention(self): + model = MistralModel.from_pretrained("__internal_testing__/tiny-random-Mistral") + model.eval() + input_ids = paddle.to_tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) + attention_mask = paddle.to_tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) + with paddle.no_grad(): + output = model(input_ids, attention_mask=attention_mask)[0] + + expected_shape = [1, 11, 768] + self.assertEqual(output.shape, expected_shape) + + expected_slice = paddle.to_tensor( + [ + [ + [0.20443289, 0.18662477, -0.75216216], + [0.32803515, -0.36956733, -0.95613617], + [0.28622314, 0.07698685, -0.64143789], + ] + ] + ) + self.assertTrue(paddle.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4)) + + +class MistralCompatibilityTest(unittest.TestCase): + test_model_id = "hf-internal-testing/tiny-random-MistralModel" + + @classmethod + @require_package("transformers", "torch") + def setUpClass(cls) -> None: + from transformers import MistralConfig, MistralForCausalLM + + # when python application is done, `TemporaryDirectory` will be free + cls.torch_model_path = tempfile.TemporaryDirectory().name + config = MistralConfig(hidden_size=16, num_hidden_layers=1, num_attention_heads=2) + model = MistralForCausalLM(config) + model.save_pretrained(cls.torch_model_path) + + @require_package("transformers", "torch") + def test_Mistral_converter(self): + # 1. create commmon input + input_ids = np.random.randint(100, 200, [1, 20]) + + # 2. forward the paddle model + from paddlenlp.transformers import MistralModel + + paddle_model = MistralModel.from_pretrained(self.torch_model_path, convert_from_torch=True) + paddle_model.eval() + paddle_logit = paddle_model(paddle.to_tensor(input_ids))[0] + + # 3. 
forward the torch model + import torch + from transformers import MistralModel + + torch_model = MistralModel.from_pretrained(self.torch_model_path) + torch_model.eval() + torch_logit = torch_model(torch.tensor(input_ids), return_dict=False)[0] + + self.assertTrue( + np.allclose( + paddle_logit.detach().cpu().reshape([-1])[:9].numpy(), + torch_logit.detach().cpu().reshape([-1])[:9].numpy(), + rtol=1e-2, + ) + ) + + @require_package("transformers", "torch") + def test_Mistral_converter_from_local_dir(self): + with tempfile.TemporaryDirectory() as tempdir: + + # 1. create commmon input + input_ids = np.random.randint(100, 200, [1, 20]) + + # 2. forward the torch model + import torch + from transformers import MistralModel + + torch_model = MistralModel.from_pretrained(self.torch_model_path) + torch_model.eval() + torch_model.save_pretrained(tempdir) + torch_logit = torch_model(torch.tensor(input_ids), return_dict=False)[0] + + # 2. forward the paddle model + from paddlenlp.transformers import MistralModel + + paddle_model = MistralModel.from_pretrained(tempdir, convert_from_torch=True) + paddle_model.eval() + paddle_logit = paddle_model(paddle.to_tensor(input_ids))[0] + + self.assertTrue( + np.allclose( + paddle_logit.detach().cpu().reshape([-1])[:9].numpy(), + torch_logit.detach().cpu().reshape([-1])[:9].numpy(), + rtol=1e-2, + ) + ) + + @parameterized.expand([("MistralModel",), ("MistralForCausalLM",)]) + @require_package("transformers", "torch") + def test_Mistral_classes_from_local_dir(self, class_name, pytorch_class_name: str | None = None): + pytorch_class_name = pytorch_class_name or class_name + with tempfile.TemporaryDirectory() as tempdir: + + # 1. create commmon input + input_ids = np.random.randint(100, 200, [1, 20]) + + # 2. forward the torch model + import torch + import transformers + + torch_model_class = getattr(transformers, pytorch_class_name) + torch_model = torch_model_class.from_pretrained(self.torch_model_path) + torch_model.eval() + + torch_model.save_pretrained(tempdir) + torch_logit = torch_model(torch.tensor(input_ids), return_dict=False)[0] + + # 3. forward the paddle model + from paddlenlp import transformers + + paddle_model_class = getattr(transformers, class_name) + paddle_model = paddle_model_class.from_pretrained(tempdir, convert_from_torch=True) + paddle_model.eval() + + paddle_logit = paddle_model(paddle.to_tensor(input_ids), return_dict=False)[0] + + self.assertTrue( + np.allclose( + paddle_logit.detach().cpu().reshape([-1])[:9].numpy(), + torch_logit.detach().cpu().reshape([-1])[:9].numpy(), + atol=1e-3, + ) + ) + + +if __name__ == "__main__": + unittest.main() From 436567c211b3c58d976f1d391935c05c0a672bfd Mon Sep 17 00:00:00 2001 From: Ting Liu Date: Fri, 17 Nov 2023 11:58:39 +0800 Subject: [PATCH 11/24] mistral ci, attention mask --- paddlenlp/transformers/mistral/modeling.py | 39 +++-- tests/transformers/mistral/test_modeling.py | 177 +------------------- 2 files changed, 29 insertions(+), 187 deletions(-) diff --git a/paddlenlp/transformers/mistral/modeling.py b/paddlenlp/transformers/mistral/modeling.py index b7505f34cda5..72146f2309f6 100644 --- a/paddlenlp/transformers/mistral/modeling.py +++ b/paddlenlp/transformers/mistral/modeling.py @@ -112,13 +112,20 @@ def _make_sliding_window_causal_mask( def _expand_mask(mask: paddle.Tensor, dtype: paddle.dtype, tgt_len: Optional[int] = None): - """ - Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
- """ - bsz, src_len = mask.shape - tgt_len = tgt_len if tgt_len is not None else src_len + expanded_mask = mask + if len(mask.shape) == 2: + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = mask.shape + tgt_len = tgt_len if tgt_len is not None else src_len - expanded_mask = mask[:, None, None, :].expand([bsz, 1, tgt_len, src_len]).astype(dtype) + expanded_mask = mask[:, None, None, :].expand([bsz, 1, tgt_len, src_len]).astype(dtype) + elif len(mask.shape) == 3: + """ + Expands attention_mask from `[bsz, tgt_seq_len, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + expanded_mask = mask.unsqueeze(1).astype(dtype) inverted_mask = 1.0 - expanded_mask @@ -735,7 +742,7 @@ def forward( position_ids = paddle.arange( past_key_values_length, seq_length + past_key_values_length, dtype=paddle.int64 ) - position_ids = position_ids.unsqueeze(0).reshape([-1, seq_length]) + position_ids = position_ids.unsqueeze(0).expand((batch_size, seq_length)) else: position_ids = position_ids.reshape([-1, seq_length]).astype("int64") @@ -911,10 +918,10 @@ def __init__(self, config): self.lm_head = MistralLMHead(config) def get_input_embeddings(self): - return self.model.embed_tokens + return self.mistral.embed_tokens def set_input_embeddings(self, value): - self.model.embed_tokens = value + self.mistral.embed_tokens = value def get_output_embeddings(self): return self.lm_head @@ -923,10 +930,10 @@ def set_output_embeddings(self, new_embeddings): self.lm_head = new_embeddings def set_decoder(self, decoder): - self.model = decoder + self.mistral = decoder def get_decoder(self): - return self.model + return self.mistral def prepare_inputs_for_generation( self, input_ids, use_cache=False, past_key_values=None, inputs_embeds=None, **kwargs @@ -969,10 +976,12 @@ def update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder model_kwargs["position_ids"] = paddle.concat([position_ids, position_ids[..., -1:] + 1], axis=-1) if not is_encoder_decoder and "attention_mask" in model_kwargs: - attention_mask = model_kwargs["attention_mask"] - model_kwargs["attention_mask"] = paddle.concat( - [attention_mask, paddle.ones([attention_mask.shape[0], 1], dtype=attention_mask.dtype)], axis=-1 - ) + attention_mask = model_kwargs.pop("attention_mask", None) + + if attention_mask is not None and len(attention_mask.shape) == 2: + model_kwargs["attention_mask"] = paddle.concat( + [attention_mask, paddle.ones([attention_mask.shape[0], 1], dtype=attention_mask.dtype)], axis=-1 + ) return model_kwargs diff --git a/tests/transformers/mistral/test_modeling.py b/tests/transformers/mistral/test_modeling.py index 5a4b0316a154..a177ba5c43fa 100644 --- a/tests/transformers/mistral/test_modeling.py +++ b/tests/transformers/mistral/test_modeling.py @@ -14,21 +14,15 @@ # limitations under the License. 
from __future__ import annotations -import tempfile import unittest -import numpy as np import paddle -from parameterized import parameterized from paddlenlp.transformers import MistralConfig, MistralForCausalLM, MistralModel -from tests.testing_utils import require_package, slow from tests.transformers.test_configuration_common import ConfigTester from tests.transformers.test_generation_utils import GenerationTesterMixin from tests.transformers.test_modeling_common import ( - GenerationD2STestMixin, ModelTesterMixin, - ModelTesterPretrainedMixin, ids_tensor, random_attention_mask, ) @@ -49,6 +43,7 @@ def __init__( use_cache=False, bos_token_id=1, eos_token_id=2, + pad_token_id=3, apply_residual_connection_post_layernorm=False, hidden_dropout=0.0, attention_dropout=0.0, @@ -80,6 +75,7 @@ def __init__( self.use_cache = use_cache self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm self.hidden_dropout = hidden_dropout self.attention_dropout = attention_dropout @@ -131,6 +127,7 @@ def get_config(self) -> MistralConfig: use_cache=self.use_cache, bos_token_id=self.bos_token_id, eos_token_id=self.eos_token_id, + pad_token_id=self.pad_token_id, apply_residual_connection_post_layernorm=self.apply_residual_connection_post_layernorm, hidden_dropout=self.hidden_dropout, attention_dropout=self.attention_dropout, @@ -260,7 +257,7 @@ def check_model_position_ids(self, config, input_ids, input_mask, *args): position_ids = paddle.arange(seq_len).expand((batch_size, seq_len)) result_position_id = model( input_ids, - position_ids, + position_ids=position_ids, labels=input_ids if self.parent.use_labels else None, return_dict=self.parent.return_dict, ) @@ -274,6 +271,7 @@ class MistralModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCas base_model_class = MistralModel return_dict = False use_labels = False + use_test_model_name_list = False all_model_classes = (MistralModel, MistralForCausalLM) all_generative_model_classes = {MistralForCausalLM: (MistralModel, "Mistral")} @@ -310,175 +308,10 @@ def test_model_position_ids(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.check_model_position_ids(*config_and_inputs) - def test_generate_without_input_ids(self): - # this requires 4-D attention mask logic, which is not supported yet - pass - def test_Mistral_lm_head_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_lm_head_model(*config_and_inputs) -class MistralModelIntegrationTest(ModelTesterPretrainedMixin, unittest.TestCase): - base_model_class = MistralModel - - @slow - def test_inference_no_attention(self): - model = MistralModel.from_pretrained("__internal_testing__/tiny-random-Mistral") - model.eval() - input_ids = paddle.to_tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) - attention_mask = paddle.to_tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) - with paddle.no_grad(): - output = model(input_ids, attention_mask=attention_mask)[0] - - expected_shape = [1, 11, 768] - self.assertEqual(output.shape, expected_shape) - - expected_slice = paddle.to_tensor( - [ - [ - [0.20443289, 0.18662477, -0.75216216], - [0.32803515, -0.36956733, -0.95613617], - [0.28622314, 0.07698685, -0.64143789], - ] - ] - ) - self.assertTrue(paddle.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4)) - - @slow - def test_inference_with_attention(self): - model = 
MistralModel.from_pretrained("__internal_testing__/tiny-random-Mistral") - model.eval() - input_ids = paddle.to_tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) - attention_mask = paddle.to_tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) - with paddle.no_grad(): - output = model(input_ids, attention_mask=attention_mask)[0] - - expected_shape = [1, 11, 768] - self.assertEqual(output.shape, expected_shape) - - expected_slice = paddle.to_tensor( - [ - [ - [0.20443289, 0.18662477, -0.75216216], - [0.32803515, -0.36956733, -0.95613617], - [0.28622314, 0.07698685, -0.64143789], - ] - ] - ) - self.assertTrue(paddle.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4)) - - -class MistralCompatibilityTest(unittest.TestCase): - test_model_id = "hf-internal-testing/tiny-random-MistralModel" - - @classmethod - @require_package("transformers", "torch") - def setUpClass(cls) -> None: - from transformers import MistralConfig, MistralForCausalLM - - # when python application is done, `TemporaryDirectory` will be free - cls.torch_model_path = tempfile.TemporaryDirectory().name - config = MistralConfig(hidden_size=16, num_hidden_layers=1, num_attention_heads=2) - model = MistralForCausalLM(config) - model.save_pretrained(cls.torch_model_path) - - @require_package("transformers", "torch") - def test_Mistral_converter(self): - # 1. create commmon input - input_ids = np.random.randint(100, 200, [1, 20]) - - # 2. forward the paddle model - from paddlenlp.transformers import MistralModel - - paddle_model = MistralModel.from_pretrained(self.torch_model_path, convert_from_torch=True) - paddle_model.eval() - paddle_logit = paddle_model(paddle.to_tensor(input_ids))[0] - - # 3. forward the torch model - import torch - from transformers import MistralModel - - torch_model = MistralModel.from_pretrained(self.torch_model_path) - torch_model.eval() - torch_logit = torch_model(torch.tensor(input_ids), return_dict=False)[0] - - self.assertTrue( - np.allclose( - paddle_logit.detach().cpu().reshape([-1])[:9].numpy(), - torch_logit.detach().cpu().reshape([-1])[:9].numpy(), - rtol=1e-2, - ) - ) - - @require_package("transformers", "torch") - def test_Mistral_converter_from_local_dir(self): - with tempfile.TemporaryDirectory() as tempdir: - - # 1. create commmon input - input_ids = np.random.randint(100, 200, [1, 20]) - - # 2. forward the torch model - import torch - from transformers import MistralModel - - torch_model = MistralModel.from_pretrained(self.torch_model_path) - torch_model.eval() - torch_model.save_pretrained(tempdir) - torch_logit = torch_model(torch.tensor(input_ids), return_dict=False)[0] - - # 2. forward the paddle model - from paddlenlp.transformers import MistralModel - - paddle_model = MistralModel.from_pretrained(tempdir, convert_from_torch=True) - paddle_model.eval() - paddle_logit = paddle_model(paddle.to_tensor(input_ids))[0] - - self.assertTrue( - np.allclose( - paddle_logit.detach().cpu().reshape([-1])[:9].numpy(), - torch_logit.detach().cpu().reshape([-1])[:9].numpy(), - rtol=1e-2, - ) - ) - - @parameterized.expand([("MistralModel",), ("MistralForCausalLM",)]) - @require_package("transformers", "torch") - def test_Mistral_classes_from_local_dir(self, class_name, pytorch_class_name: str | None = None): - pytorch_class_name = pytorch_class_name or class_name - with tempfile.TemporaryDirectory() as tempdir: - - # 1. create commmon input - input_ids = np.random.randint(100, 200, [1, 20]) - - # 2. 
forward the torch model - import torch - import transformers - - torch_model_class = getattr(transformers, pytorch_class_name) - torch_model = torch_model_class.from_pretrained(self.torch_model_path) - torch_model.eval() - - torch_model.save_pretrained(tempdir) - torch_logit = torch_model(torch.tensor(input_ids), return_dict=False)[0] - - # 3. forward the paddle model - from paddlenlp import transformers - - paddle_model_class = getattr(transformers, class_name) - paddle_model = paddle_model_class.from_pretrained(tempdir, convert_from_torch=True) - paddle_model.eval() - - paddle_logit = paddle_model(paddle.to_tensor(input_ids), return_dict=False)[0] - - self.assertTrue( - np.allclose( - paddle_logit.detach().cpu().reshape([-1])[:9].numpy(), - torch_logit.detach().cpu().reshape([-1])[:9].numpy(), - atol=1e-3, - ) - ) - - if __name__ == "__main__": unittest.main() From c4e66e13f2704146599b411965d839b3f453914c Mon Sep 17 00:00:00 2001 From: Ting Liu Date: Fri, 17 Nov 2023 12:38:23 +0800 Subject: [PATCH 12/24] remove redundant comment --- .../transformers/mistral/configuration.py | 75 ------------------- paddlenlp/transformers/mistral/modeling.py | 12 --- 2 files changed, 87 deletions(-) diff --git a/paddlenlp/transformers/mistral/configuration.py b/paddlenlp/transformers/mistral/configuration.py index 2340e62ca3a9..11237e5c840a 100644 --- a/paddlenlp/transformers/mistral/configuration.py +++ b/paddlenlp/transformers/mistral/configuration.py @@ -15,83 +15,8 @@ from ..configuration_utils import PretrainedConfig -MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "mistralai/Mistral-7B-v0.1": "https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json", - "mistralai/Mistral-7B-Instruct-v0.1": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/config.json", -} - class MistralConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate an - Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of the Mistral-7B-v0.1 or Mistral-7B-Instruct-v0.1. - - [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) - [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 32000): - Vocabulary size of the Mistral model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`MistralModel`] - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 14336): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer encoder. - num_key_value_heads (`int`, *optional*, defaults to 8): - This is the number of key_value heads that should be used to implement Grouped Query Attention. 
If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to `4096*32`): - The maximum sequence length that this model might ever be used with. Mistral's sliding window attention - allows sequence of up to 4096*32 tokens. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - pad_token_id (`int`, *optional*): - The id of the padding token. - bos_token_id (`int`, *optional*, defaults to 1): - The id of the "beginning-of-sequence" token. - eos_token_id (`int`, *optional*, defaults to 2): - The id of the "end-of-sequence" token. - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - sliding_window (`int`, *optional*, defaults to 4096): - Sliding window attention window size. If not specified, will default to `4096`. 
- - - ```python - >>> from transformers import MistralModel, MistralConfig - - >>> # Initializing a Mistral 7B style configuration - >>> configuration = MistralConfig() - - >>> # Initializing a model from the Mistral 7B style configuration - >>> model = MistralModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - model_type = "mistral" keys_to_ignore_at_inference = ["past_key_values"] diff --git a/paddlenlp/transformers/mistral/modeling.py b/paddlenlp/transformers/mistral/modeling.py index 72146f2309f6..736923107f25 100644 --- a/paddlenlp/transformers/mistral/modeling.py +++ b/paddlenlp/transformers/mistral/modeling.py @@ -153,7 +153,6 @@ def forward(self, hidden_states): return self.weight * hidden_states.astype(input_dtype) -# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral class MistralRotaryEmbedding(nn.Layer): def __init__(self, dim, max_position_embeddings=2048, base=10000): super().__init__() @@ -352,15 +351,10 @@ def forward( padding_mask: Optional[paddle.Tensor] = None, ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: bsz, q_len, _ = hidden_states.shape - # global i - # import numpy as np; np.save('hs_{}'.format(i), hidden_states.astype('float32').numpy()) query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) - # import numpy as np; np.save('q_{}'.format(i), query_states.astype('float32').numpy()) - # import numpy as np; np.save('k_{}'.format(i), key_states.astype('float32').numpy()) - # import numpy as np; np.save('v_{}'.format(i), value_states.astype('float32').numpy()) query_states = query_states.reshape([bsz, q_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) key_states = key_states.reshape([bsz, q_len, self.num_key_value_heads, self.head_dim]).transpose([0, 2, 1, 3]) @@ -387,7 +381,6 @@ def forward( if not self.use_flash_attention: attn_weights = paddle.matmul(query_states, key_states.transpose([0, 1, 3, 2])) / math.sqrt(self.head_dim) - # import numpy as np; np.save('aw_{}'.format(i), attn_weights.astype('float32').numpy()) if attn_weights.shape != [bsz, self.num_heads, q_len, kv_seq_len]: raise ValueError( @@ -432,8 +425,6 @@ def forward( attn_output = attn_output.reshape([bsz, q_len, self.num_heads * self.head_dim]) attn_output = self.o_proj(attn_output) - # import numpy as np; np.save('ao_{}'.format(i), attn_output.astype('float32').numpy()) - # i += 1 if not output_attentions: attn_weights = None @@ -1046,13 +1037,10 @@ def forward( hidden_states = outputs[0] logits = self.lm_head(hidden_states) logits = logits.astype("float32") - # import numpy as np; np.save('l', logits.astype('float32').numpy()) loss = None if labels is not None: loss_fct = CrossEntropyLoss() - # logits = logits.reshape([-1, self.config.vocab_size]) - # labels = labels.reshape([-1]) loss = loss_fct(logits, labels) if not return_dict: From e6768a6b629a4af0fcda9c0476b143ebbcdb74e3 Mon Sep 17 00:00:00 2001 From: Ting Liu Date: Fri, 17 Nov 2023 15:00:57 +0800 Subject: [PATCH 13/24] remove print --- paddlenlp/generation/utils.py | 1 - paddlenlp/transformers/mistral/modeling.py | 1 - 2 files changed, 2 deletions(-) diff --git a/paddlenlp/generation/utils.py b/paddlenlp/generation/utils.py index df8e0f0caae6..fe8262687778 100644 --- a/paddlenlp/generation/utils.py +++ b/paddlenlp/generation/utils.py @@ -915,7 +915,6 @@ def generate( stopping_criteria = stopping_criteria if stopping_criteria 
is not None else StoppingCriteriaList() - print(generation_config.decode_strategy) if generation_config.decode_strategy == "greedy_search": if generation_config.num_return_sequences > 1: raise ValueError( diff --git a/paddlenlp/transformers/mistral/modeling.py b/paddlenlp/transformers/mistral/modeling.py index 736923107f25..448b7f3d3898 100644 --- a/paddlenlp/transformers/mistral/modeling.py +++ b/paddlenlp/transformers/mistral/modeling.py @@ -405,7 +405,6 @@ def forward( query_states = query_states.transpose([0, 2, 1, 3]) key_states = key_states.transpose([0, 2, 1, 3]) value_states = value_states.transpose([0, 2, 1, 3]) - # print(attention_mask) attn_output = F.scaled_dot_product_attention( query_states, key_states, From b6024b9d86e759df195161db47ed68d06e2f370f Mon Sep 17 00:00:00 2001 From: Ting Liu Date: Fri, 17 Nov 2023 17:02:29 +0800 Subject: [PATCH 14/24] empty commit From 8d3a702a902cb20e240a310d603d96d8cad9cae6 Mon Sep 17 00:00:00 2001 From: Ting Liu Date: Fri, 17 Nov 2023 17:18:35 +0800 Subject: [PATCH 15/24] update --- paddlenlp/transformers/mistral/modeling.py | 36 ++-------------------- 1 file changed, 2 insertions(+), 34 deletions(-) diff --git a/paddlenlp/transformers/mistral/modeling.py b/paddlenlp/transformers/mistral/modeling.py index 448b7f3d3898..9283e5e37bea 100644 --- a/paddlenlp/transformers/mistral/modeling.py +++ b/paddlenlp/transformers/mistral/modeling.py @@ -68,7 +68,6 @@ def _make_causal_mask( fill_value=1, ) mask = paddle.tril(tensor, diagonal=0) - # make the mask banded to account for sliding window mask = paddle.triu(mask, diagonal=-sliding_window) mask = paddle.log(mask).astype(dtype) @@ -435,11 +434,7 @@ class MistralDecoderLayer(nn.Layer): def __init__(self, config: MistralConfig): super().__init__() self.hidden_size = config.hidden_size - self.self_attn = ( - MistralAttention(config=config) - # if not getattr(config, "_flash_attn_2_enabled", False) - # else MistralFlashAttention2(config) - ) + self.self_attn = MistralAttention(config=config) self.mlp = MistralMLP(config) self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -531,10 +526,9 @@ def _get_name_mappings(cls, config: MistralConfig) -> List[StateDictNameMapping] model_mappings.extend(layer_mappings) init_name_mappings(mappings=model_mappings) - # base-model prefix "LlamaModel" for mapping in model_mappings: mapping[0] = "model." + mapping[0] - mapping[1] = "model." + mapping[1] + mapping[1] = "mistral." + mapping[1] if "MistralModel" not in config.architectures: model_mappings.append(["lm_head.weight", "lm_head.weight", "transpose"]) @@ -904,7 +898,6 @@ def __init__(self, config): super().__init__(config) self.mistral = MistralModel(config) self.vocab_size = config.vocab_size - # self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias_attr=False) self.lm_head = MistralLMHead(config) def get_input_embeddings(self): @@ -988,31 +981,6 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. 
- - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, MistralForCausalLM - - >>> model = MistralForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( From fafbb5bf6544ae0c18b3be90e8a79f57c33b4487 Mon Sep 17 00:00:00 2001 From: Ting Liu Date: Fri, 17 Nov 2023 18:08:45 +0800 Subject: [PATCH 16/24] add init py --- tests/transformers/mistral/__init__.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 tests/transformers/mistral/__init__.py diff --git a/tests/transformers/mistral/__init__.py b/tests/transformers/mistral/__init__.py new file mode 100644 index 000000000000..595add0aed9e --- /dev/null +++ b/tests/transformers/mistral/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
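The attention-mask handling touched by the surrounding patches is easier to follow with a minimal, standalone sketch. The snippet below is illustrative only and is not code from the patch series: it shows, under the same conventions the diffs use in `modeling.py` (a 0/1 matrix turned into an additive mask via `log`, so kept positions become 0 and dropped positions become `-inf`), how a causal mask, an optional sliding-window band, and an expanded 2-D padding mask combine by broadcasting. The helper names `make_causal_mask_sketch` and `expand_padding_mask_sketch` are made up for this example and do not exist in PaddleNLP, and `-1e9` merely stands in for whatever dtype-minimum fill the real `_expand_mask` applies.

```python
import paddle


def make_causal_mask_sketch(tgt_len, past_len=0, sliding_window=None, dtype="float32"):
    # Lower-triangular 0/1 matrix: query i may attend to keys 0..i.
    mask = paddle.tril(paddle.full([tgt_len, tgt_len], 1.0, dtype="float32"))
    if sliding_window is not None:
        # Band the matrix so query i also drops keys older than `sliding_window`
        # positions, mirroring the banded tril/triu trick the original
        # _make_causal_mask used before the sliding-window handling was removed.
        mask = paddle.triu(mask, diagonal=-sliding_window)
    # log maps 1 -> 0 (keep) and 0 -> -inf (drop), i.e. an additive mask.
    mask = paddle.log(mask).astype(dtype)
    if past_len > 0:
        # Cached keys from previous decoding steps stay visible.
        mask = paddle.concat([paddle.zeros([tgt_len, past_len], dtype=dtype), mask], axis=-1)
    return mask[None, None, :, :]  # [1, 1, tgt_len, past_len + tgt_len]


def expand_padding_mask_sketch(padding_mask, dtype="float32"):
    # [bsz, src_len] 0/1 padding mask -> additive [bsz, 1, 1, src_len] mask.
    expanded = padding_mask[:, None, None, :].astype(dtype)
    return (1.0 - expanded) * -1e9  # large negative value in place of -inf


if __name__ == "__main__":
    causal = make_causal_mask_sketch(tgt_len=5, past_len=2, sliding_window=3)
    padding = expand_padding_mask_sketch(paddle.to_tensor([[0, 1, 1, 1, 1, 1, 1]]))
    # Broadcasting adds the two masks, the same way _prepare_decoder_attention_mask
    # merges the causal mask with the expanded attention_mask.
    combined = causal + padding
    print(combined.shape)  # [1, 1, 5, 7]
```

One practical detail the sketch makes visible: when `config.sliding_window` is `None`, a bare comparison such as `kv_length > sliding_window` fails, which is why the bugfix below guards the condition as `sliding_window and kv_length > sliding_window` before the banded sliding-window variant is dropped altogether in the following commit.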
From f02a644c4a99976b3054f778628db8442fe9736f Mon Sep 17 00:00:00 2001 From: Ting Liu Date: Fri, 28 Jun 2024 14:45:03 +0800 Subject: [PATCH 17/24] bugfix --- paddlenlp/transformers/mistral/modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlenlp/transformers/mistral/modeling.py b/paddlenlp/transformers/mistral/modeling.py index 9283e5e37bea..33f56771676d 100644 --- a/paddlenlp/transformers/mistral/modeling.py +++ b/paddlenlp/transformers/mistral/modeling.py @@ -661,7 +661,7 @@ def _prepare_decoder_attention_mask( q_length = input_shape[-1] kv_length = q_length + past_key_values_length - if kv_length > sliding_window: + if sliding_window and kv_length > sliding_window: combined_attention_mask = _make_sliding_window_causal_mask( input_shape, inputs_embeds.dtype, From 6d7415756dcc931e63d3f235feec9bcbf7d0f21c Mon Sep 17 00:00:00 2001 From: Ting Liu Date: Mon, 1 Jul 2024 16:16:38 +0800 Subject: [PATCH 18/24] remove swa strategy --- paddlenlp/transformers/mistral/modeling.py | 165 +-------------------- 1 file changed, 7 insertions(+), 158 deletions(-) diff --git a/paddlenlp/transformers/mistral/modeling.py b/paddlenlp/transformers/mistral/modeling.py index 33f56771676d..e71f9a6d39d1 100644 --- a/paddlenlp/transformers/mistral/modeling.py +++ b/paddlenlp/transformers/mistral/modeling.py @@ -21,7 +21,7 @@ from paddle import nn from paddle.distributed import fleet from paddle.distributed.fleet.utils import recompute -from paddle.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from paddle.nn import CrossEntropyLoss from paddlenlp.transformers.conversion_utils import ( StateDictNameMapping, @@ -34,7 +34,6 @@ BaseModelOutputWithPast, CausalLMOutputWithCrossAttentions, CausalLMOutputWithPast, - SequenceClassifierOutputWithPast, ) from ..model_utils import PretrainedModel from .configuration import MistralConfig @@ -56,7 +55,6 @@ def _make_causal_mask( input_ids_shape: paddle.shape, dtype: paddle.dtype, past_key_values_length: int = 0, - sliding_window: int = 4096, ): """ Make causal mask used for sliding window attention @@ -68,7 +66,6 @@ def _make_causal_mask( fill_value=1, ) mask = paddle.tril(tensor, diagonal=0) - mask = paddle.triu(mask, diagonal=-sliding_window) mask = paddle.log(mask).astype(dtype) if past_key_values_length > 0: @@ -76,40 +73,6 @@ def _make_causal_mask( return mask[None, None, :, :].expand([bsz, 1, tgt_len, tgt_len + past_key_values_length]) -def _make_sliding_window_causal_mask( - input_ids_shape: paddle.shape, - dtype: paddle.dtype, - past_key_values_length: int = 0, - sliding_window: int = 4096, -): - """ - Make causal mask used for sliding window attention - """ - bsz, tgt_len = input_ids_shape - mask = paddle.full( - (tgt_len, tgt_len), - fill_value=0.0, - dtype="float32", - ) - if past_key_values_length > 0: - mask = paddle.concat([paddle.zeros([tgt_len, past_key_values_length], dtype="float32"), mask], axis=-1) - - # make sliding window mask - # note that: this computation of SWA only modify the mask - # to imitate sliding window which has same time complexity - # with normal attention calculation, just for test. 
- for qidx in range(tgt_len): - q_causal_start = past_key_values_length + qidx - sliding_window - q_causal_end = q_causal_start + sliding_window - q_causal_start = max(0, q_causal_start) - # paddle do not support index operation on bfloat16 tensor temporary - mask[qidx, q_causal_start : q_causal_end + 1] = 1.0 - - mask = paddle.log(mask).astype(dtype) - - return mask[None, None, :, :].expand([bsz, 1, tgt_len, tgt_len + past_key_values_length]) - - def _expand_mask(mask: paddle.Tensor, dtype: paddle.dtype, tgt_len: Optional[int] = None): expanded_mask = mask if len(mask.shape) == 2: @@ -653,28 +616,15 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.embed_tokens = value - def _prepare_decoder_attention_mask( - self, attention_mask, input_shape, inputs_embeds, past_key_values_length, sliding_window - ): + def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): # create causal mask # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - q_length = input_shape[-1] - kv_length = q_length + past_key_values_length - if sliding_window and kv_length > sliding_window: - combined_attention_mask = _make_sliding_window_causal_mask( - input_shape, - inputs_embeds.dtype, - past_key_values_length=past_key_values_length, - sliding_window=sliding_window, - ) - else: - combined_attention_mask = _make_causal_mask( - input_shape, - inputs_embeds.dtype, - past_key_values_length=past_key_values_length, - sliding_window=sliding_window, - ) + combined_attention_mask = _make_causal_mask( + input_shape, + inputs_embeds.dtype, + past_key_values_length=past_key_values_length, + ) if attention_mask is not None: # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] @@ -760,7 +710,6 @@ def forward( (batch_size, seq_length), inputs_embeds, past_key_values_length, - sliding_window=self.config.sliding_window, ) hidden_states = inputs_embeds @@ -1021,103 +970,3 @@ def forward( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) - - -class MistralForSequenceClassification(MistralPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = MistralModel(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias_attr=False) - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def forward( - self, - input_ids: paddle.Tensor = None, - attention_mask: Optional[paddle.Tensor] = None, - position_ids: Optional[paddle.Tensor] = None, - past_key_values: Optional[List[paddle.Tensor]] = None, - inputs_embeds: Optional[paddle.Tensor] = None, - labels: Optional[paddle.Tensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`paddle.Tensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - sequence_lengths = paddle.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1 - else: - sequence_lengths = -1 - - pooled_logits = logits[paddle.arange(batch_size), sequence_lengths] - - loss = None - if labels is not None: - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == paddle.int64 or labels.dtype == paddle.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.reshape([-1, self.num_labels]), labels.reshape([-1])) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) From c52f2039753f7f3cb33510f8e63cc64122bfe11b Mon Sep 17 00:00:00 2001 From: Ting Liu Date: Mon, 1 Jul 2024 16:41:16 +0800 Subject: [PATCH 19/24] update --- llm/utils.py | 595 --------------------------------------------------- 1 file changed, 595 deletions(-) delete mode 100644 llm/utils.py diff --git a/llm/utils.py b/llm/utils.py deleted file mode 100644 index 89b805193343..000000000000 --- a/llm/utils.py +++ /dev/null @@ -1,595 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from __future__ import annotations - -import glob -import math -import os -import struct -from typing import Dict, Optional - -import numpy as np -import paddle -import paddle.distributed as dist -from paddle.distributed import fleet -from paddle.io import BatchSampler, DataLoader, DistributedBatchSampler -from sklearn.metrics import accuracy_score - -from paddlenlp.datasets import InTokensIterableDataset -from paddlenlp.trainer import Trainer, TrainerCallback -from paddlenlp.trainer.trainer_utils import IterableDatasetShard, has_length -from paddlenlp.transformers import LlamaForCausalLMPipe -from paddlenlp.utils.log import logger - - -def compute_metrics(eval_preds): - - flattened_preds = np.array(eval_preds.predictions).flatten() - flattened_labels = np.array(eval_preds.label_ids).flatten() - filtered_preds = flattened_preds[flattened_labels != -100] - filtered_labels = flattened_labels[flattened_labels != -100] - accuracy = accuracy_score(y_true=filtered_labels, y_pred=filtered_preds) - return { - "accuracy": accuracy, - } - - -def get_prefix_tuning_params(model): - if model.base_model_prefix == "chatglm": - from paddlenlp.peft.prefix import chatglm_postprocess_past_key_value - - num_attention_heads = model.config.num_attention_heads - num_hidden_layers = model.config.num_hidden_layers - hidden_size = model.config.hidden_size - postprocess_past_key_value = chatglm_postprocess_past_key_value - multi_query_group_num = None - elif model.base_model_prefix == "chatglm_v2": - from paddlenlp.peft.prefix import chatglm_postprocess_past_key_value - - num_attention_heads = model.config.num_attention_heads - num_hidden_layers = model.config.num_layers - hidden_size = model.config.hidden_size - postprocess_past_key_value = chatglm_postprocess_past_key_value - multi_query_group_num = model.config.multi_query_group_num - elif model.base_model_prefix == "bloom": - from paddlenlp.peft.prefix import bloom_postprocess_past_key_value - - num_attention_heads = model.config.num_attention_heads - num_hidden_layers = model.config.n_layer - hidden_size = model.config.n_embed - postprocess_past_key_value = bloom_postprocess_past_key_value - multi_query_group_num = None - elif model.base_model_prefix == "llama": - from paddlenlp.peft.prefix import llama_postprocess_past_key_value - - num_attention_heads = model.config.n_head - num_hidden_layers = model.config.n_layer - hidden_size = model.config.hidden_size - postprocess_past_key_value = llama_postprocess_past_key_value - multi_query_group_num = None - elif model.base_model_prefix == "qwen": - from paddlenlp.peft.prefix import qwen_postprocess_past_key_value - - num_attention_heads = model.config.num_attention_heads - num_hidden_layers = model.config.num_hidden_layers - hidden_size = model.config.hidden_size - postprocess_past_key_value = qwen_postprocess_past_key_value - multi_query_group_num = None - elif model.base_model_prefix == "mistral": - from paddlenlp.peft.prefix import llama_postprocess_past_key_value - - num_attention_heads = model.config.num_attention_heads - num_hidden_layers = model.config.num_hidden_layers - hidden_size = model.config.hidden_size - postprocess_past_key_value = llama_postprocess_past_key_value - multi_query_group_num = model.config.num_attention_heads // model.config.num_key_value_heads - else: - raise ValueError(f"Unknown base_model_prefix: {model.base_model_prefix}. 
") - return dict( - num_attention_heads=num_attention_heads, - num_hidden_layers=num_hidden_layers, - hidden_size=hidden_size, - postprocess_past_key_value=postprocess_past_key_value, - multi_query_group_num=multi_query_group_num, - ) - - -def get_lora_target_modules(model): - # Not yet support RowParallelLinear - if model.base_model_prefix == "chatglm": - target_modules = [".*query_key_value.*", ".*dense.*", ".*dense_h_to_4h.*", ".*dense_4h_to_h.*"] - elif model.base_model_prefix == "chatglm_v2": - target_modules = [ - ".*query.*", - ".*key.*", - ".*value.*", - ".*dense.*", - ".*dense_h_to_4h.*", - ".*dense_4h_to_h.*", - ] - elif model.base_model_prefix == "bloom": - target_modules = [".*query_key_value.*", ".*dense.*", ".*dense_h_to_4h.*", ".*dense_4h_to_h.*"] - elif model.base_model_prefix == "llama" or model.base_model_prefix == "mistral" or isinstance(model, LlamaForCausalLMPipe): - target_modules = [ - ".*q_proj.*", - ".*v_proj.*", - ".*k_proj.*", - ".*o_proj.*", - ".*gate_proj.*", - ".*down_proj.*", - ".*up_proj.*", - ] - elif model.base_model_prefix == "opt": - target_modules = [ - ".*project_in.*", - ".*project_out.*", - ".*q_proj.*", - ".*k_proj.*", - ".*v_proj.*", - ".*qkv_proj.*", - ".*out_proj.*", - ".*linear1.*", - ".*linear2.*", - ] - elif model.base_model_prefix == "qwen": - target_modules = [ - ".*attn.c_attn.*", - ".*attn.c_proj.*", - ".*mlp.w1.*", - ".*mlp.w2.*", - ".*mlp.c_proj.*", - ] - else: - raise ValueError(f"Unknown base_model_prefix: {model.base_model_prefix}.") - return target_modules - - -class InTokensIterDatasetCallback(TrainerCallback): - """ - A [`TrainerCallback`] that handles early stopping. - - """ - - def on_step_end(self, args, state, control, **kwargs): - train_dataloader = kwargs["train_dataloader"] - if isinstance(train_dataloader.dataset, InTokensIterableDataset): - dataset = train_dataloader.dataset - elif isinstance(train_dataloader.dataset, IterableDatasetShard) and isinstance( - train_dataloader.dataset.dataset, InTokensIterableDataset - ): - dataset = train_dataloader.dataset.dataset - else: - raise ValueError( - "Unexpected dataset format: InTokensIterDatasetCallback expectes `paddlenlp.datasets.InTokensIterableDataset`" - ) - if state.trial_params is None: - state.trial_params = {} - state.trial_params["intokens_global_step"] = dataset.intokens_global_step - - -class CausalLMTrainer(Trainer): - def __init__(self, do_generation: bool, gen_args, data_args, **kwargs): - super().__init__(**kwargs) - self.do_generation = do_generation - self.gen_args = gen_args - self.data_args = data_args - - def prediction_step( - self, - model, - inputs, - prediction_loss_only: bool, - ignore_keys=None, - ): - if prediction_loss_only or self.args.pipeline_parallel_degree > 1: - return super().prediction_step(model, inputs, prediction_loss_only, ignore_keys) - elif not self.do_generation: - loss, logits, labels = super().prediction_step(model, inputs, prediction_loss_only, ignore_keys) - # argmax here to avoid gather all logits, which is too memory-consuming. 
- # keepdim in order to maintain the same shape as logits - if isinstance(logits, (list, tuple)): - logits = logits[0] - return (loss, logits.argmax(axis=-1, keepdim=True), labels) - - loss = None - - model.eval() - with paddle.no_grad(): - generated_tokens = model.generate( - input_ids=inputs["input_ids"], - attention_mask=inputs["attention_mask"] if "attention_mask" in inputs else None, - position_ids=inputs["position_ids"] if "position_ids" in inputs else None, - max_length=max(self.data_args.max_length - inputs["input_ids"].shape[-1], 1), - decode_strategy="sampling", - top_k=self.gen_args.top_k, - top_p=self.gen_args.top_p, - bos_token_id=self.tokenizer.bos_token_id, - eos_token_id=self.tokenizer.eos_token_id, - pad_token_id=self.tokenizer.pad_token_id, - use_cache=True, - )[0] - all_preds = [] - for pred_tokens in generated_tokens: - pred_tokens = pred_tokens[pred_tokens != self.tokenizer.pad_token_id].tolist() - all_preds.append(pred_tokens) - max_pred_length = max([len(x) for x in all_preds]) - for index, preds in enumerate(all_preds): - all_preds[index] = preds + [-100] * (max_pred_length - len(preds)) - all_preds = paddle.to_tensor(all_preds) - - if "labels" in inputs: - all_labels = paddle.to_tensor(inputs["labels"]) - else: - all_labels = None - - return (loss, all_preds, all_labels) - - def log(self, logs: Dict[str, float], **kwargs) -> None: - if "loss" in logs: - logs["ppl"] = np.exp(logs["loss"]) - if "eval_loss" in logs: - logs["eval_ppl"] = np.exp(logs["eval_loss"]) - - super(CausalLMTrainer, self).log(logs, **kwargs) - - def get_ptq_dataloader(self, ptq_ds): - if self.args.world_size <= 1: - ptq_sampler = BatchSampler( - dataset=ptq_ds, - shuffle=True, - batch_size=self.args.per_device_train_batch_size, - drop_last=self.args.dataloader_drop_last, - ) - else: - ptq_sampler = DistributedBatchSampler( - self.train_dataset, - batch_size=self.args.per_device_train_batch_size, - shuffle=True, - num_replicas=self.args.dataset_world_size, - rank=self.args.dataset_rank, - drop_last=self.args.dataloader_drop_last, - ) - ptq_dataloader = DataLoader( - ptq_ds, - batch_sampler=ptq_sampler, - collate_fn=self.data_collator, - num_workers=self.args.dataloader_num_workers, - ) - return ptq_dataloader - - def ptq_loop( - self, - dataloader: DataLoader, - description: str, - max_eval_iters: Optional[int] = -1, - ): - if isinstance(dataloader, paddle.io.DataLoader): - batch_size = dataloader.batch_sampler.batch_size - else: - raise ValueError("Only support for paddle.io.DataLoader") - - if has_length(dataloader): - logger.info(f" Num examples = {self.num_examples(dataloader)}") - if max_eval_iters > 0: - logger.info(f" Total {description} steps = {max_eval_iters}") - else: - logger.info(f" Total {description} steps = {len(dataloader)}") - else: - logger.info(" Num examples: Unknown") - if max_eval_iters > 0: - logger.info(f" Total {description} steps = {max_eval_iters}") - - logger.info(f" Pre device batch size = {batch_size}") - logger.info(f" Total Batch size = {batch_size * self.args.dataset_world_size}") - self.model.eval() - with paddle.no_grad(): - for step, inputs in enumerate(dataloader): - self.prediction_step(model=self.model, inputs=inputs, prediction_loss_only=True, ignore_keys=None) - if max_eval_iters > 0 and step >= max_eval_iters - 1: - break - - -def get_infer_model_path(input_dir, model_prefix): - if dist.get_world_size() > 1: - local_rank = dist.get_rank() - return os.path.join(input_dir, "rank_{}".format(local_rank), model_prefix) - else: - return os.path.join(input_dir, 
model_prefix) - - -def generate_rank_mapping(output_filename): - ring_id = -1 - try: - hcg = fleet.get_hybrid_communicate_group() - model_parallel_group = hcg.get_model_parallel_group() - ring_id = model_parallel_group.id - except Exception: - pass - - if ring_id == -1: - return - - world_size = dist.get_world_size() - with open(output_filename, "w") as f: - f.write("[ring_id -> ranks]\n") - f.write(",".join(map(str, [0] + list(range(world_size)))) + "\n") - f.write(",".join(map(str, [ring_id] + list(range(world_size)))) + "\n") - - f.write("[rank -> ring_ids]\n") - for i in range(world_size): - f.write("{},0,{}\n".format(i, ring_id)) - - -def deserialize_from_file(fp): - x_type = fp.read(1) - x_type_out = struct.unpack("c", x_type)[0] - # data - data_list = [] - if x_type_out == b"0": - data = fp.read(4) - data_out = struct.unpack("f", data)[0] - while data: - data_out = struct.unpack("f", data)[0] - data_list.append(data_out) - data = fp.read(4) - elif x_type_out == b"1": - data = fp.read(8) - while data: - data_out = struct.unpack("l", data)[0] - data_list.append(data_out) - data = fp.read(8) - elif x_type_out == b"2": - data = fp.read(4) - while data: - data_out = struct.unpack("i", data)[0] - data_list.append(data_out) - data = fp.read(4) - else: - print("type error") - data_arr = np.array(data_list) - return data_arr - - -def get_alibi_slopes(num_heads): - closest_power_of_2 = 2 ** math.floor(math.log2(num_heads)) - base = 2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))) - powers = np.arange(1, 1 + closest_power_of_2) - slopes = np.power(base, powers) - - if closest_power_of_2 != num_heads: - extra_base = 2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))) - num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2) - extra_powers = np.arange(1, 1 + 2 * num_remaining_heads, 2) - slopes = np.concatante([slopes, np.power(extra_base, extra_powers)], axis=0) - - return slopes.astype("float32") - - -def pad_batch_data(insts, pad_id=0, return_seq_len=False, pad_style="right"): - """Pad sequences to the max sequence length in batch.""" - max_len = max(map(len, insts)) - if pad_style == "left": - inst_data = np.array([[pad_id] * (max_len - len(inst)) + list(inst) for inst in insts]) - else: - inst_data = np.array([list(inst) + [pad_id] * (max_len - len(inst)) for inst in insts]) - - if return_seq_len: - seq_len = np.array([len(inst) for inst in insts]) - return inst_data.astype("int64").reshape([-1, max_len]), seq_len - else: - return inst_data.astype("int64").reshape([-1, max_len]) - - -def dybatch_preprocess( - tokenizer, - texts: list[str], - src_length: int, - max_length: int, - architectures: str, - top_p: float, - temperature: float, - pre_caches_length: int = 0, - benchmark: bool = False, -): - """Pre-process generation inputs.""" - inputs = {} - if "chatglmforcausallm" == architectures.lower(): - input_ids = [] - position_ids = [] - - for text in texts: - tokens = tokenizer(text, return_tensors="np", padding=True, max_length=src_length) - input_ids.append(tokens["input_ids"][0]) - position_ids.append(tokens["position_ids"][0]) - - pad_token_id = tokenizer([tokenizer.pad_token], return_tensors="np")["input_ids"][0][0] - inputs["input_ids"], seq_len = pad_batch_data(input_ids, pad_id=pad_token_id, return_seq_len=True) - bs = inputs["input_ids"].shape[0] - max_len = max(map(len, input_ids)) - - inst_data_pos = [] - for i in range(len(position_ids)): - inst_data_pos.append(np.array([list(inst) + [0] * (max_len - len(inst)) for inst in position_ids[i]])) - 
inputs["position_ids"] = paddle.to_tensor(np.array(inst_data_pos)) - elif "gpt" in architectures: - input_ids = [] - if isinstance(texts, str): - texts = [texts] - - for text in texts: - tokens = tokenizer( - text, - return_tensors="np", - padding=False, - max_length=src_length, - return_attention_mask=False, - return_token_type_ids=False, - ) - input_ids.append(tokens["input_ids"][0]) - - pad_token_id = tokenizer([tokenizer.pad_token], return_tensors="np")["input_ids"][0][-1] - inputs["input_ids"], seq_len = pad_batch_data(input_ids, pad_id=pad_token_id, return_seq_len=True) - bs = inputs["input_ids"].shape[0] - max_len = max(map(len, input_ids)) - - position_ids = paddle.arange(sum(seq_len), dtype="int64") - pre_len = seq_len[0] - for length in seq_len[1:]: - position_ids[pre_len : length + pre_len] = position_ids[pre_len : length + pre_len] - pre_len - pre_len += length - inputs["position_ids"] = position_ids - else: - input_ids = [] - if isinstance(texts, str): - texts = [texts] - - for text in texts: - tokens = tokenizer( - text, - return_tensors="np", - padding=False, - max_length=src_length, - return_attention_mask=False, - return_token_type_ids=False, - ) - input_ids.append(tokens["input_ids"][0]) - - pad_token_id = tokenizer([tokenizer.pad_token], return_tensors="np")["input_ids"][0][-1] - inputs["input_ids"], seq_len = pad_batch_data(input_ids, pad_id=pad_token_id, return_seq_len=True) - bs = inputs["input_ids"].shape[0] - max_len = max(map(len, input_ids)) - - position_ids = paddle.zeros(shape=[bs, max_length + src_length], dtype="int64") - - for i in range(bs): - position_ids[i, pre_caches_length : pre_caches_length + seq_len[i]] = paddle.arange(seq_len[i]) - inputs["position_ids"] = position_ids - - tgt_ids = [input[-1:] for input in input_ids] - tgt_pos = [] - for i, valid_len in enumerate(map(len, input_ids)): - tgt_pos.append(valid_len - 1) - - step_idx = [ - 0, - ] * bs - tgt_pos = np.array(tgt_pos).astype("int64") - inputs["eos_token_id"] = ( - np.array( - [ - tokenizer.eos_token_id, - ] - * bs - ) - .reshape(-1, 1) - .astype("int64") - ) - inputs["top_p"] = ( - np.array( - [ - top_p, - ] - * bs - ) - .reshape(-1, 1) - .astype("float32") - ) - inputs["temperature"] = ( - np.array( - [ - temperature, - ] - * bs - ) - .reshape(-1, 1) - .astype("float32") - ) - inputs["seq_len_encoder"] = seq_len.astype("int32").reshape(-1, 1) - inputs["seq_len_decoder"] = (seq_len + pre_caches_length).astype("int32").reshape(-1, 1) - inputs["step_idx"] = np.array(step_idx).astype("int64").reshape(-1, 1) - inputs["tgt_ids"] = np.array(tgt_ids).astype("int64").reshape(-1, 1) - inputs["tgt_pos"] = tgt_pos.reshape(-1, 1) - inputs["max_length"] = np.array(max_length - pre_caches_length).astype("int64").reshape((-1, 1)) - inputs["min_length"] = ( - np.array( - [ - 1 - if not benchmark - else max_length - - pre_caches_length, # Note(Zhengzekang): When in benchmark mode, we need to set a fixed decode length. 
- ] - * bs - ) - .astype("int64") - .reshape((-1, 1)) - ) - inputs["penalty_score"] = ( - np.array( - [ - 1.0, - ] - * bs - ) - .astype("float32") - .reshape((-1, 1)) - ) - inputs["frequency_score"] = ( - np.array( - [ - 0.0, - ] - * bs - ) - .astype("float32") - .reshape((-1, 1)) - ) - inputs["presence_score"] = ( - np.array( - [ - 0.0, - ] - * bs - ) - .astype("float32") - .reshape((-1, 1)) - ) - inputs["stop_flags"] = ( - np.array( - [ - 0, - ] - * bs - ) - .astype("bool") - .reshape((-1, 1)) - ) - inputs["stop_nums"] = np.array([bs]).astype("int64") - return inputs - - -def load_real_time_tokens(): - tokens = [] - files = glob.glob(os.path.join("./real_time_save.*")) - for j in range(1, len(files) + 1): - filename = "./real_time_save.temp_ids_rank_0_step_{}".format(j) - if not os.path.exists(filename): - break - fp = open(filename, "rb+") - fp.read(1) - data_list = deserialize_from_file(fp) - fp.close() - tokens.append(np.array(data_list).reshape(-1, 1)) - os.system("rm -f ./real_time_save.temp_ids_rank_*") - tokens = np.concatenate(tokens, axis=1) - return tokens From 5e312870f3deec9c995ea8d6009eb2af0c1f0c3c Mon Sep 17 00:00:00 2001 From: Ting Liu Date: Mon, 1 Jul 2024 19:03:12 +0800 Subject: [PATCH 20/24] lora support --- llm/utils/data.py | 1 + llm/utils/utils.py | 12 ++++++++++++ paddlenlp/transformers/mistral/modeling.py | 16 ---------------- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/llm/utils/data.py b/llm/utils/data.py index a97e08d926c7..f9c48f8a58a7 100644 --- a/llm/utils/data.py +++ b/llm/utils/data.py @@ -50,6 +50,7 @@ def get_convert_example(model): "opt", "qwen", "mixtral", + "mistral", "gemma", "qwen2", "qwen2_moe", diff --git a/llm/utils/utils.py b/llm/utils/utils.py index 2f51711b496b..4389ab15d0e1 100644 --- a/llm/utils/utils.py +++ b/llm/utils/utils.py @@ -190,6 +190,17 @@ def get_lora_target_modules(model): ".*w2.*", ".*w3.*", ] + elif model.base_model_prefix == "mistral": + target_modules = [ + ".*q_proj.*", + ".*k_proj.*", + ".*v_proj.*", + ".*o_proj.*", + ".*gate.*", + ".*w1.*", + ".*w2.*", + ".*w3.*", + ] elif model.base_model_prefix == "qwen2_moe": target_modules = [ ".*q_proj.*", @@ -279,6 +290,7 @@ def prediction_step( )[0] all_preds = [] for pred_tokens in generated_tokens: + pred_tokens = pred_tokens.numpy() pred_tokens = pred_tokens[pred_tokens != self.tokenizer.pad_token_id].tolist() all_preds.append(pred_tokens) max_pred_length = max([len(x) for x in all_preds]) diff --git a/paddlenlp/transformers/mistral/modeling.py b/paddlenlp/transformers/mistral/modeling.py index e71f9a6d39d1..1646d64f0229 100644 --- a/paddlenlp/transformers/mistral/modeling.py +++ b/paddlenlp/transformers/mistral/modeling.py @@ -39,18 +39,6 @@ from .configuration import MistralConfig -def _get_unpad_data(padding_mask): - seqlens_in_batch = padding_mask.sum(axis=-1, dtype=paddle.int32) - indices = paddle.nonzero(padding_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(paddle.cumsum(seqlens_in_batch, axis=0, dtype=paddle.paddle.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - def _make_causal_mask( input_ids_shape: paddle.shape, dtype: paddle.dtype, @@ -462,10 +450,6 @@ def forward( class MistralPreTrainedModel(PretrainedModel): config_class = MistralConfig base_model_prefix = "mistral" - supports_gradient_checkpointing = True - _no_split_modules = ["MistralDecoderLayer"] - _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True 
@classmethod def _get_name_mappings(cls, config: MistralConfig) -> List[StateDictNameMapping]: From 4705a60e9736d22e7dc2dff146447a740bb0fab6 Mon Sep 17 00:00:00 2001 From: Ting Liu Date: Mon, 1 Jul 2024 21:21:30 +0800 Subject: [PATCH 21/24] update --- llm/utils/utils.py | 8 ++++++ paddlenlp/peft/prefix/__init__.py | 1 + paddlenlp/peft/prefix/utils.py | 6 +++++ paddlenlp/transformers/mistral/modeling.py | 30 ++++++++++++++++++++++ 4 files changed, 45 insertions(+) diff --git a/llm/utils/utils.py b/llm/utils/utils.py index 4389ab15d0e1..65254f002167 100644 --- a/llm/utils/utils.py +++ b/llm/utils/utils.py @@ -85,6 +85,14 @@ def get_prefix_tuning_params(model): hidden_size = model.config.hidden_size postprocess_past_key_value = llama_postprocess_past_key_value multi_query_group_num = None + elif model.base_model_prefix == "mistral": + from paddlenlp.peft.prefix import mistral_postprocess_past_key_value + + num_attention_heads = model.config.num_attention_heads + num_hidden_layers = model.config.num_hidden_layers + hidden_size = model.config.hidden_size + postprocess_past_key_value = mistral_postprocess_past_key_value + multi_query_group_num = model.config.num_key_value_heads elif model.base_model_prefix == "qwen": from paddlenlp.peft.prefix import qwen_postprocess_past_key_value diff --git a/paddlenlp/peft/prefix/__init__.py b/paddlenlp/peft/prefix/__init__.py index 90edebf53b66..0b7703a97df4 100644 --- a/paddlenlp/peft/prefix/__init__.py +++ b/paddlenlp/peft/prefix/__init__.py @@ -19,4 +19,5 @@ chatglm_postprocess_past_key_value, llama_postprocess_past_key_value, qwen_postprocess_past_key_value, + mistral_postprocess_past_key_value, ) diff --git a/paddlenlp/peft/prefix/utils.py b/paddlenlp/peft/prefix/utils.py index 684245c2f380..93e38cd8a729 100644 --- a/paddlenlp/peft/prefix/utils.py +++ b/paddlenlp/peft/prefix/utils.py @@ -37,6 +37,12 @@ def llama_postprocess_past_key_value(past_key_values): return tuple(zip(keys, values)) +def mistral_postprocess_past_key_value(past_key_values): + # (layer_num, bs, head_num/tensor_parallel_degree, prefixlen, head_dim)*2 + keys, values = paddle.transpose(past_key_values, perm=[2, 0, 3, 1, 4]).split(2) + + return tuple(zip(keys, values)) + def qwen_postprocess_past_key_value(past_key_values): # (layer_num, bs, prefixlen, head_num/tensor_parallel_degree, head_dim)*2 diff --git a/paddlenlp/transformers/mistral/modeling.py b/paddlenlp/transformers/mistral/modeling.py index 1646d64f0229..bb0b4f9e9a11 100644 --- a/paddlenlp/transformers/mistral/modeling.py +++ b/paddlenlp/transformers/mistral/modeling.py @@ -216,7 +216,10 @@ def __init__(self, config: MistralConfig): self.num_key_value_groups = self.num_heads // self.num_key_value_heads self.max_position_embeddings = config.max_position_embeddings self.rope_theta = config.rope_theta +<<<<<<< Updated upstream self.use_flash_attention = getattr(config, "_flash_attn_2_enabled", False) +======= +>>>>>>> Stashed changes if (self.head_dim * self.num_heads) != self.hidden_size: raise ValueError( @@ -329,7 +332,11 @@ def forward( key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) +<<<<<<< Updated upstream if not self.use_flash_attention: +======= + if not self.config.use_flash_attention: +>>>>>>> Stashed changes attn_weights = paddle.matmul(query_states, key_states.transpose([0, 1, 3, 2])) / math.sqrt(self.head_dim) if attn_weights.shape != [bsz, self.num_heads, q_len, kv_seq_len]: @@ -787,14 +794,26 @@ def parallel_matmul(x: paddle.Tensor, y: 
paddle.Tensor, tensor_parallel_output=T # if not running under distributed.launch, it will raise AttributeError: 'Fleet' object has no attribute '_hcg' input_parallel = paddle.distributed.collective._c_identity(x, group=model_parallel_group) logits = paddle.matmul(input_parallel, y, transpose_y=False) +<<<<<<< Updated upstream +======= + print(y) +>>>>>>> Stashed changes if tensor_parallel_output: return logits +<<<<<<< Updated upstream +======= + print(logits) +>>>>>>> Stashed changes return paddle.distributed.collective._c_concat(logits, group=model_parallel_group) else: logits = paddle.matmul(x, y, transpose_y=False) +<<<<<<< Updated upstream +======= + print(y) +>>>>>>> Stashed changes return logits @@ -820,6 +839,10 @@ def forward(self, hidden_states, tensor_parallel_output=None): if tensor_parallel_output is None: tensor_parallel_output = self.config.tensor_parallel_output +<<<<<<< Updated upstream +======= + print(tensor_parallel_output) +>>>>>>> Stashed changes logits = parallel_matmul(hidden_states, self.weight, tensor_parallel_output=tensor_parallel_output) return logits @@ -935,7 +958,14 @@ def forward( ) hidden_states = outputs[0] +<<<<<<< Updated upstream logits = self.lm_head(hidden_states) +======= + print(hidden_states) + logits = self.lm_head(hidden_states) + print(logits) + import pdb;pdb.set_trace() +>>>>>>> Stashed changes logits = logits.astype("float32") loss = None From 81a388c72f1494defe50aa9df302031267f10b3a Mon Sep 17 00:00:00 2001 From: Ting Liu Date: Tue, 2 Jul 2024 15:32:42 +0800 Subject: [PATCH 22/24] add tp cross entropy --- paddlenlp/peft/prefix/__init__.py | 2 +- paddlenlp/peft/prefix/utils.py | 1 + paddlenlp/transformers/mistral/modeling.py | 101 ++++++++------------- 3 files changed, 41 insertions(+), 63 deletions(-) diff --git a/paddlenlp/peft/prefix/__init__.py b/paddlenlp/peft/prefix/__init__.py index 0b7703a97df4..c8bd6e6f07a5 100644 --- a/paddlenlp/peft/prefix/__init__.py +++ b/paddlenlp/peft/prefix/__init__.py @@ -18,6 +18,6 @@ bloom_postprocess_past_key_value, chatglm_postprocess_past_key_value, llama_postprocess_past_key_value, - qwen_postprocess_past_key_value, mistral_postprocess_past_key_value, + qwen_postprocess_past_key_value, ) diff --git a/paddlenlp/peft/prefix/utils.py b/paddlenlp/peft/prefix/utils.py index 93e38cd8a729..50584684568c 100644 --- a/paddlenlp/peft/prefix/utils.py +++ b/paddlenlp/peft/prefix/utils.py @@ -37,6 +37,7 @@ def llama_postprocess_past_key_value(past_key_values): return tuple(zip(keys, values)) + def mistral_postprocess_past_key_value(past_key_values): # (layer_num, bs, head_num/tensor_parallel_degree, prefixlen, head_dim)*2 keys, values = paddle.transpose(past_key_values, perm=[2, 0, 3, 1, 4]).split(2) diff --git a/paddlenlp/transformers/mistral/modeling.py b/paddlenlp/transformers/mistral/modeling.py index bb0b4f9e9a11..d0d675d31f73 100644 --- a/paddlenlp/transformers/mistral/modeling.py +++ b/paddlenlp/transformers/mistral/modeling.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import math +import warnings from functools import partial from typing import List, Optional, Tuple, Union @@ -216,10 +217,6 @@ def __init__(self, config: MistralConfig): self.num_key_value_groups = self.num_heads // self.num_key_value_heads self.max_position_embeddings = config.max_position_embeddings self.rope_theta = config.rope_theta -<<<<<<< Updated upstream - self.use_flash_attention = getattr(config, "_flash_attn_2_enabled", False) -======= ->>>>>>> Stashed changes if (self.head_dim * self.num_heads) != self.hidden_size: raise ValueError( @@ -290,9 +287,6 @@ def __init__(self, config: MistralConfig): base=self.rope_theta, ) - def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int): - return tensor.reshape([bsz, seq_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) - def forward( self, hidden_states: paddle.Tensor, @@ -301,7 +295,6 @@ def forward( past_key_value: Optional[Tuple[paddle.Tensor]] = None, output_attentions: bool = False, use_cache: bool = False, - padding_mask: Optional[paddle.Tensor] = None, ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: bsz, q_len, _ = hidden_states.shape @@ -332,11 +325,7 @@ def forward( key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) -<<<<<<< Updated upstream - if not self.use_flash_attention: -======= if not self.config.use_flash_attention: ->>>>>>> Stashed changes attn_weights = paddle.matmul(query_states, key_states.transpose([0, 1, 3, 2])) / math.sqrt(self.head_dim) if attn_weights.shape != [bsz, self.num_heads, q_len, kv_seq_len]: @@ -405,7 +394,6 @@ def forward( past_key_value: Optional[Tuple[paddle.Tensor]] = None, output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, - padding_mask: Optional[paddle.Tensor] = None, ) -> Tuple[paddle.Tensor, Optional[Tuple[paddle.Tensor, paddle.Tensor]]]: """ Args: @@ -433,7 +421,6 @@ def forward( past_key_value=past_key_value, output_attentions=output_attentions, use_cache=use_cache, - padding_mask=padding_mask, ) hidden_states = residual + hidden_states @@ -674,28 +661,6 @@ def forward( if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) - padding_mask = None - - # embed positions - if attention_mask is None: - attention_mask = paddle.ones((batch_size, seq_length_with_past), dtype=paddle.bool) - elif paddle.any(attention_mask == 0): - padding_mask = attention_mask - - if ( - padding_mask is not None - and hasattr(self.config, "_flash_attn_2_enabled") - and self.config._flash_attn_2_enabled - and past_key_values is not None - ): - is_padding_right = padding_mask[:, -1].sum().item() != batch_size - if is_padding_right: - raise ValueError( - "You are attempting to perform batched generation with padding_side='right'" - " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to " - " call `tokenizer.padding_side = 'left'` before tokenizing the input. 
" - ) - attention_mask = self._prepare_decoder_attention_mask( attention_mask, (batch_size, seq_length), @@ -729,7 +694,7 @@ def forward( def create_custom_forward(module): def custom_forward(*inputs): # None for past_key_value - return module(*inputs, past_key_value, output_attentions, padding_mask=padding_mask) + return module(*inputs, past_key_value, output_attentions) return custom_forward @@ -747,7 +712,6 @@ def custom_forward(*inputs): past_key_value=past_key_value, output_attentions=output_attentions, use_cache=use_cache, - padding_mask=padding_mask, ) hidden_states = layer_outputs[0] @@ -794,26 +758,14 @@ def parallel_matmul(x: paddle.Tensor, y: paddle.Tensor, tensor_parallel_output=T # if not running under distributed.launch, it will raise AttributeError: 'Fleet' object has no attribute '_hcg' input_parallel = paddle.distributed.collective._c_identity(x, group=model_parallel_group) logits = paddle.matmul(input_parallel, y, transpose_y=False) -<<<<<<< Updated upstream -======= - print(y) ->>>>>>> Stashed changes if tensor_parallel_output: return logits -<<<<<<< Updated upstream -======= - print(logits) ->>>>>>> Stashed changes return paddle.distributed.collective._c_concat(logits, group=model_parallel_group) else: logits = paddle.matmul(x, y, transpose_y=False) -<<<<<<< Updated upstream -======= - print(y) ->>>>>>> Stashed changes return logits @@ -839,14 +791,46 @@ def forward(self, hidden_states, tensor_parallel_output=None): if tensor_parallel_output is None: tensor_parallel_output = self.config.tensor_parallel_output -<<<<<<< Updated upstream -======= print(tensor_parallel_output) ->>>>>>> Stashed changes logits = parallel_matmul(hidden_states, self.weight, tensor_parallel_output=tensor_parallel_output) return logits +class MistralPretrainingCriterion(paddle.nn.Layer): + """ + Criterion for Llama. + It calculates the final loss. 
+ """ + + def __init__(self, config): + + super(MistralPretrainingCriterion, self).__init__() + self.ignore_index = getattr(config, "ignore_index", -100) + self.config = config + self.enable_parallel_cross_entropy = config.tensor_parallel_degree > 1 and config.tensor_parallel_output + + if self.enable_parallel_cross_entropy: # and False: # and lm_head is distributed + self.loss_func = mpu.ParallelCrossEntropy(ignore_index=self.ignore_index) + else: + self.loss_func = CrossEntropyLoss(reduction="none", ignore_index=self.ignore_index) + + def forward(self, prediction_scores, masked_lm_labels): + if self.enable_parallel_cross_entropy: + if prediction_scores.shape[-1] == self.config.vocab_size: + warnings.warn( + f"enable_parallel_cross_entropy, the vocab_size should be splited: {prediction_scores.shape[-1]}, {self.config.vocab_size}" + ) + self.loss_func = CrossEntropyLoss(reduction="none", ignore_index=self.ignore_index) + + with paddle.amp.auto_cast(False): + masked_lm_loss = self.loss_func(prediction_scores.astype("float32"), masked_lm_labels.unsqueeze(2)) + # skip ignore_index which loss == 0 + masked_lm_loss = masked_lm_loss[masked_lm_loss > 0].astype("float32") + loss = paddle.mean(masked_lm_loss) + + return loss + + class MistralForCausalLM(MistralPreTrainedModel): _tied_weights_keys = ["lm_head.weight"] @@ -855,6 +839,7 @@ def __init__(self, config): self.mistral = MistralModel(config) self.vocab_size = config.vocab_size self.lm_head = MistralLMHead(config) + self.criterion = MistralPretrainingCriterion(config) def get_input_embeddings(self): return self.mistral.embed_tokens @@ -958,20 +943,12 @@ def forward( ) hidden_states = outputs[0] -<<<<<<< Updated upstream - logits = self.lm_head(hidden_states) -======= - print(hidden_states) logits = self.lm_head(hidden_states) - print(logits) - import pdb;pdb.set_trace() ->>>>>>> Stashed changes logits = logits.astype("float32") loss = None if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits, labels) + loss = self.criterion(logits, labels) if not return_dict: output = (logits,) + outputs[1:] From 7a6d7804432a8ba9ade63bc9cf8fcec71aa1613b Mon Sep 17 00:00:00 2001 From: Ting Liu Date: Tue, 2 Jul 2024 19:28:09 +0800 Subject: [PATCH 23/24] support zero padding --- llm/config/mistral/README.md | 20 +++ llm/config/mistral/dpo_argument.json | 38 ++++++ llm/{ => config}/mistral/lora_argument.json | 6 +- llm/{ => config}/mistral/pt_argument.json | 6 +- llm/{ => config}/mistral/sft_argument.json | 5 +- llm/data.py | 131 -------------------- llm/run_finetune.py | 4 +- paddlenlp/transformers/mistral/modeling.py | 1 - 8 files changed, 70 insertions(+), 141 deletions(-) create mode 100644 llm/config/mistral/README.md create mode 100644 llm/config/mistral/dpo_argument.json rename llm/{ => config}/mistral/lora_argument.json (85%) rename llm/{ => config}/mistral/pt_argument.json (88%) rename llm/{ => config}/mistral/sft_argument.json (86%) delete mode 100644 llm/data.py diff --git a/llm/config/mistral/README.md b/llm/config/mistral/README.md new file mode 100644 index 000000000000..a20090e4688c --- /dev/null +++ b/llm/config/mistral/README.md @@ -0,0 +1,20 @@ +# Mistral + +## 1. 
模型介绍 + +**支持模型权重:** + +| Model | +|--------------------------------------| +| mistralai/Mistral-7B-Instruct-v0.3 | +| mistralai/Mistral-7B-v0.1 | + + + +使用方法: + +```python +from paddlenlp.transformers import AutoModelForCausalLM, AutoTokenizer +model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3") +tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3") +``` diff --git a/llm/config/mistral/dpo_argument.json b/llm/config/mistral/dpo_argument.json new file mode 100644 index 000000000000..11480b5fd659 --- /dev/null +++ b/llm/config/mistral/dpo_argument.json @@ -0,0 +1,38 @@ +{ + "model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3", + "train_dataset_path": "./dpo_data/train.jsonl", + "dev_dataset_path": "./dpo_data/train.jsonl", + "output_dir": "./checkpoints/dpo_ckpts", + "per_device_train_batch_size": 1, + "gradient_accumulation_steps": 8, + "per_device_eval_batch_size": 1, + "num_train_epochs": 1, + "max_steps": 100, + "learning_rate": 1e-06, + "warmup_steps": 10, + "logging_steps": 1, + "evaluation_strategy": "steps", + "save_strategy": "steps", + "eval_steps": 100, + "save_steps": 500, + "max_seq_len": 4096, + "max_prompt_len": 2048, + "bf16": true, + "fp16_opt_level": "O2", + "do_train": true, + "do_eval": true, + "disable_tqdm": true, + "load_best_model_at_end": true, + "tensor_parallel_degree": 8, + "sharding_parallel_degree": 1, + "sharding": "stage1", + "use_flash_attention": true, + "recompute": false, + "recompute_granularity": "full", + "dpo_beta": 0.1, + "benchmark": false, + "dpo_loss_type": "sigmoid", + "dpo_label_smoothing": 0.0, + "unified_checkpoint": true, + "autotuner_benchmark":false + } diff --git a/llm/mistral/lora_argument.json b/llm/config/mistral/lora_argument.json similarity index 85% rename from llm/mistral/lora_argument.json rename to llm/config/mistral/lora_argument.json index ccb3af015cba..a04db794f356 100644 --- a/llm/mistral/lora_argument.json +++ b/llm/config/mistral/lora_argument.json @@ -1,5 +1,5 @@ { - "model_name_or_path": "mistralai/Mistral-7B-v0.1", + "model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3", "dataset_name_or_path": "./data", "output_dir": "./checkpoints/mistral_lora_ckpts", "per_device_train_batch_size": 4, @@ -14,7 +14,7 @@ "save_strategy": "epoch", "src_length": 1024, "max_length": 2048, - "fp16": true, + "bf16": true, "fp16_opt_level": "O2", "do_train": true, "do_eval": true, @@ -26,5 +26,7 @@ "save_total_limit": 1, "tensor_parallel_degree": 1, "pipeline_parallel_degree": 1, + "use_flash_attention": true, + "zero_padding": true, "lora": true } diff --git a/llm/mistral/pt_argument.json b/llm/config/mistral/pt_argument.json similarity index 88% rename from llm/mistral/pt_argument.json rename to llm/config/mistral/pt_argument.json index 750293f2f645..b3728227e5ca 100644 --- a/llm/mistral/pt_argument.json +++ b/llm/config/mistral/pt_argument.json @@ -1,5 +1,5 @@ { - "model_name_or_path": "mistralai/Mistral-7B-v0.1", + "model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3", "dataset_name_or_path": "./data", "output_dir": "./checkpoints/mistral_pt_ckpts", "per_device_train_batch_size": 4, @@ -14,7 +14,7 @@ "save_strategy": "epoch", "src_length": 1024, "max_length": 2048, - "fp16": true, + "bf16": true, "fp16_opt_level": "O2", "do_train": true, "do_eval": true, @@ -22,7 +22,7 @@ "load_best_model_at_end": true, "eval_with_do_generation": false, "metric_for_best_model": "accuracy", - "recompute": true, + "recompute": false, "save_total_limit": 1, "tensor_parallel_degree": 1, 
"pipeline_parallel_degree": 1, diff --git a/llm/mistral/sft_argument.json b/llm/config/mistral/sft_argument.json similarity index 86% rename from llm/mistral/sft_argument.json rename to llm/config/mistral/sft_argument.json index 2a9b8b42cc26..3532e86404c5 100644 --- a/llm/mistral/sft_argument.json +++ b/llm/config/mistral/sft_argument.json @@ -1,5 +1,5 @@ { - "model_name_or_path": "mistralai/Mistral-7B-v0.1", + "model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3", "dataset_name_or_path": "./data", "output_dir": "./checkpoints/mistral_sft_ckpts", "per_device_train_batch_size": 4, @@ -24,6 +24,7 @@ "metric_for_best_model": "accuracy", "recompute": true, "save_total_limit": 1, - "tensor_parallel_degree": 4, + "zero_padding": true, + "tensor_parallel_degree": 8, "pipeline_parallel_degree": 1 } diff --git a/llm/data.py b/llm/data.py deleted file mode 100644 index 5fcf4ecb7770..000000000000 --- a/llm/data.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import numpy as np - -from paddlenlp.peft import LoRAModel, PrefixModelForCausalLM - - -def get_convert_example(model): - if isinstance(model, LoRAModel) or isinstance(model, PrefixModelForCausalLM): - base_model_prefix = model.model.base_model_prefix - else: - base_model_prefix = model.base_model_prefix - - if base_model_prefix == "chatglm": - return convert_example_chatglm - elif base_model_prefix in ["chatglm_v2", "llama", "bloom", "opt", "qwen", "mistral"]: - return convert_example_common - else: - raise ValueError( - f"Unknown base_model_prefix: {model.base_model_prefix}. Supported base_model_prefix list: chatglm, bloom, llama." - ) - - -class DataFormatError(ValueError): - pass - - -def tokenize_example(tokenizer, example, data_args): - if "src" in example and "tgt" in example: - source = example["src"][0] if isinstance(example["src"], list) else example["src"] - target = example["tgt"][0] if isinstance(example["tgt"], list) else example["tgt"] - else: - raise DataFormatError( - f"Example format is wrong, please check: {example} or rewrite tokenize_example in data.py " - ) - tokenized_source = tokenizer( - source, - max_length=data_args.src_length, - truncation=True, - truncation_side="left", - add_special_tokens=True, - ) - tgt_max_length = data_args.max_length - len(tokenized_source["input_ids"]) - tokenized_target = tokenizer( - target, - max_length=tgt_max_length, - truncation=True, - truncation_side="right", - add_special_tokens=False, - ) - - tokenized_target_input_ids = tokenized_target["input_ids"] - # Add eos_token_id at the end of sequence if the sentence is not truncated. - # Attention! In some cases(ex. ChatGLMv2), tokenized eos_token is not equal to eos_token_id. 
- if len(tokenized_target_input_ids) < tgt_max_length: - tokenized_target_input_ids += [tokenizer.eos_token_id] - - return tokenized_source, tokenized_target_input_ids - - -def convert_example_common(example, tokenizer, data_args, is_test=True, intokens=False): - tokenized_source, tokenized_target_input_ids = tokenize_example(tokenizer, example, data_args) - - if is_test: - return { - **tokenized_source, - "labels": tokenized_target_input_ids, - } - else: - input_ids = tokenized_source["input_ids"] + tokenized_target_input_ids - source_length = len(tokenized_source["input_ids"]) - labels = [-100] * source_length + input_ids[source_length:] - # shift input_ids and labels - input_ids, labels = input_ids[:-1], labels[1:] - seq_length = len(input_ids) - features = {"input_ids": input_ids, "labels": labels} - if "position_ids" in tokenized_source: - features["position_ids"] = list(range(seq_length)) - if intokens: - features["attention_mask"] = np.tri(seq_length, seq_length, dtype=bool) - - return features - - -def convert_example_chatglm(example, tokenizer, data_args, is_test=True, intokens=False): - - tokenized_source, tokenized_target_input_ids = tokenize_example(tokenizer, example, data_args) - if is_test: - return { - **tokenized_source, - "labels": tokenized_target_input_ids, - } - else: - input_ids = tokenized_source["input_ids"] + tokenized_target_input_ids - bos_position = len(tokenized_source["input_ids"]) - 1 - labels = [-100] * bos_position + input_ids[bos_position:] - # shift input_ids and labels - input_ids, labels = input_ids[:-1], labels[1:] - features = { - "input_ids": input_ids, - "labels": labels, - } - - if intokens: - seq_length = len(input_ids) - # attention_mask - attention_mask = np.tri(seq_length, seq_length, dtype=bool) - attention_mask[:, :bos_position] = 1 - features["attention_mask"] = attention_mask - # 2d position_ids - position_ids = np.arange(seq_length, dtype=np.int64) - block_position_ids = np.concatenate( - [ - np.zeros(bos_position, dtype=np.int64), - np.arange(1, seq_length - bos_position + 1, dtype=np.int64), - ] - ) - features["position_ids"] = np.stack([position_ids, block_position_ids], axis=0) - - return features diff --git a/llm/run_finetune.py b/llm/run_finetune.py index 8df3705fe335..de31240d2ae3 100644 --- a/llm/run_finetune.py +++ b/llm/run_finetune.py @@ -338,11 +338,11 @@ def neft_post_hook(module, input, output): if data_args.zero_padding: if ( - model.base_model_prefix not in ["llama", "bloom", "chatglm", "chatglm_v2", "qwen"] + model.base_model_prefix not in ["llama", "bloom", "chatglm", "chatglm_v2", "qwen", "mistral"] and training_args.pipeline_parallel_degree < 1 ): raise NotImplementedError( - "Zero Padding data stream is only implemented for LLaMA, Bloom, ChatGLM and QWen so far." + "Zero Padding data stream is only implemented for LLaMA, Bloom, ChatGLM, QWen and Mistral so far." 
) train_ds = ( train_ds.map(partial(trans_func, is_test=False, zero_padding=data_args.zero_padding, flash_mask=model_args.flash_mask)) diff --git a/paddlenlp/transformers/mistral/modeling.py b/paddlenlp/transformers/mistral/modeling.py index d0d675d31f73..f973390f0c5c 100644 --- a/paddlenlp/transformers/mistral/modeling.py +++ b/paddlenlp/transformers/mistral/modeling.py @@ -791,7 +791,6 @@ def forward(self, hidden_states, tensor_parallel_output=None): if tensor_parallel_output is None: tensor_parallel_output = self.config.tensor_parallel_output - print(tensor_parallel_output) logits = parallel_matmul(hidden_states, self.weight, tensor_parallel_output=tensor_parallel_output) return logits From cdbd0ebebaaf07d59bd872b0604470589eb0eb2c Mon Sep 17 00:00:00 2001 From: Ting Liu Date: Tue, 2 Jul 2024 19:32:11 +0800 Subject: [PATCH 24/24] update README.md --- llm/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/llm/README.md b/llm/README.md index adeae7b03edd..745fe0ea6ca4 100644 --- a/llm/README.md +++ b/llm/README.md @@ -23,6 +23,7 @@ | [LLaMA](./config/llama) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [Qwen](./config/qwen) | ✅ | ✅ | ✅ | ✅ | ✅ | 🚧 | 🚧 | ✅ | | [Mixtral](./config/mixtral) | ✅ | ✅ | ✅ | ❌ | 🚧 | 🚧 | 🚧 | 🚧 | +| [Mistral](./config/mistral) | ❌ | ✅ | ✅ | ✅ | ✅ | 🚧 | 🚧 | ✅ | | [Baichuan/Baichuan2](./config/llama) | ✅ | ✅ | ✅ | ✅ | ✅ | 🚧 | ✅ | ✅ | | [ChatGLM-6B](./config/chatglm) | ❌ | ✅ | ✅ | ✅ | 🚧 | 🚧 | ✅ | ❌ | | [ChatGLM2/ChatGLM3](./config/chatglm2) | ❌ | ✅ | ✅ | ✅ | 🚧 | 🚧 | ✅ | ✅ |
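
For a quick end-to-end check of the LoRA path enabled for Mistral in this series, a minimal sketch along the following lines should suffice. It assumes the standard `paddlenlp.peft` API (`LoRAConfig`/`LoRAModel`); the checkpoint name and the `r`/`lora_alpha` values are illustrative assumptions rather than values fixed by these patches, while the `target_modules` patterns mirror the ones registered for `base_model_prefix == "mistral"` in `get_lora_target_modules` above.

```python
# Minimal LoRA wiring sketch for Mistral. The target-module regexes mirror the
# ones added to get_lora_target_modules in this series; r, lora_alpha and the
# checkpoint name are illustrative assumptions.
from paddlenlp.peft import LoRAConfig, LoRAModel
from paddlenlp.transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "mistralai/Mistral-7B-Instruct-v0.3"  # assumed checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, dtype="bfloat16")

lora_config = LoRAConfig(
    target_modules=[
        ".*q_proj.*", ".*k_proj.*", ".*v_proj.*", ".*o_proj.*",
        ".*gate.*", ".*w1.*", ".*w2.*", ".*w3.*",
    ],
    r=8,            # assumed LoRA rank
    lora_alpha=16,  # assumed LoRA scaling
    dtype="bfloat16",
)

# Wrap the base model so only the LoRA parameters are trainable.
model = LoRAModel(model, lora_config)
model.mark_only_lora_as_trainable()
model.print_trainable_parameters()
```

In practice the `llm/run_finetune.py` entry point is expected to perform this wrapping itself when `"lora": true` is set in `lora_argument.json`; the sketch only spells out the wiring for verification.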