
Add new mistral #7425


Merged · 27 commits · Jul 3, 2024

Changes from 17 commits
2 changes: 1 addition & 1 deletion llm/data.py
@@ -24,7 +24,7 @@ def get_convert_example(model):

     if base_model_prefix == "chatglm":
         return convert_example_chatglm
-    elif base_model_prefix in ["chatglm_v2", "llama", "bloom", "opt", "qwen"]:
+    elif base_model_prefix in ["chatglm_v2", "llama", "bloom", "opt", "qwen", "mistral"]:
         return convert_example_common
     else:
         raise ValueError(
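
Mistral is routed to the shared convert_example_common path rather than a model-specific converter. As a rough illustration of what a common causal-LM conversion does (the field names and function below are illustrative assumptions, not the actual convert_example_common implementation):

    # Hypothetical sketch of a common SFT example conversion for a causal LM:
    # tokenize source and target, concatenate them, and mask the source out of the loss.
    def convert_example_sketch(example, tokenizer, max_length=2048):
        src_ids = tokenizer(example["src"])["input_ids"]
        tgt_ids = tokenizer(example["tgt"])["input_ids"]
        input_ids = (src_ids + tgt_ids)[:max_length]
        labels = ([-100] * len(src_ids) + tgt_ids)[:max_length]  # -100 is ignored by the loss
        return {"input_ids": input_ids, "labels": labels}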
30 changes: 30 additions & 0 deletions llm/mistral/lora_argument.json
@@ -0,0 +1,30 @@
Contributor: Put these in config files: add a mistral folder under the config directory containing the JSON files and a README. Also update the README under the llm directory.

Collaborator Author: done

{
"model_name_or_path": "mistralai/Mistral-7B-v0.1",
"dataset_name_or_path": "./data",
"output_dir": "./checkpoints/mistral_lora_ckpts",
"per_device_train_batch_size": 4,
"gradient_accumulation_steps": 4,
"per_device_eval_batch_size": 8,
"eval_accumulation_steps":16,
"num_train_epochs": 3,
"learning_rate": 3e-04,
"warmup_steps": 30,
"logging_steps": 1,
"evaluation_strategy": "epoch",
"save_strategy": "epoch",
"src_length": 1024,
"max_length": 2048,
"fp16": true,
"fp16_opt_level": "O2",
"do_train": true,
"do_eval": true,
"disable_tqdm": true,
"recompute": true,
"load_best_model_at_end": true,
"eval_with_do_generation": false,
"metric_for_best_model": "accuracy",
"save_total_limit": 1,
"tensor_parallel_degree": 1,
"pipeline_parallel_degree": 1,
"lora": true
}
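
For reference, here is a minimal sketch of how a LoRA setup like this one could be assembled with paddlenlp.peft; the constructor arguments shown are illustrative assumptions, not values taken from this PR:

    # Hypothetical sketch, assuming the paddlenlp.peft LoRA API
    from paddlenlp.peft import LoRAConfig, LoRAModel
    from paddlenlp.transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")
    lora_config = LoRAConfig(
        target_modules=[".*q_proj.*", ".*v_proj.*", ".*k_proj.*", ".*o_proj.*"],
        r=8,
        lora_alpha=16,
    )
    model = LoRAModel(model=model, lora_config=lora_config)
    model.mark_only_lora_as_trainable()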
30 changes: 30 additions & 0 deletions llm/mistral/pt_argument.json
@@ -0,0 +1,30 @@
{
"model_name_or_path": "mistralai/Mistral-7B-v0.1",
"dataset_name_or_path": "./data",
"output_dir": "./checkpoints/mistral_pt_ckpts",
"per_device_train_batch_size": 4,
"gradient_accumulation_steps": 4,
"per_device_eval_batch_size": 8,
"eval_accumulation_steps":16,
"num_train_epochs": 3,
"learning_rate": 3e-02,
"warmup_steps": 30,
"logging_steps": 1,
"evaluation_strategy": "epoch",
"save_strategy": "epoch",
"src_length": 1024,
"max_length": 2048,
"fp16": true,
"fp16_opt_level": "O2",
"do_train": true,
"do_eval": true,
"disable_tqdm": true,
"load_best_model_at_end": true,
"eval_with_do_generation": false,
"metric_for_best_model": "accuracy",
"recompute": true,
"save_total_limit": 1,
"tensor_parallel_degree": 1,
"pipeline_parallel_degree": 1,
"prefix_tuning": true
}
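
Note the learning rate: prefix tuning trains only a small set of prompt parameters, so the 3e-02 here is three orders of magnitude larger than the 3e-05 used for full SFT below. A sketch of how the prefix model might be assembled; the class names come from paddlenlp.peft, but the constructor arguments are assumptions for illustration:

    # Hypothetical sketch, assuming the paddlenlp.peft prefix-tuning API;
    # num_prefix_tokens is an illustrative assumption, not a value from this PR.
    from paddlenlp.peft import PrefixConfig, PrefixModelForCausalLM
    from paddlenlp.transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")
    prefix_config = PrefixConfig(num_prefix_tokens=64)
    model = PrefixModelForCausalLM(model=model, prefix_config=prefix_config)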
29 changes: 29 additions & 0 deletions llm/mistral/sft_argument.json
@@ -0,0 +1,29 @@
{
"model_name_or_path": "mistralai/Mistral-7B-v0.1",
"dataset_name_or_path": "./data",
"output_dir": "./checkpoints/mistral_sft_ckpts",
"per_device_train_batch_size": 4,
"gradient_accumulation_steps": 4,
"per_device_eval_batch_size": 8,
"eval_accumulation_steps":16,
"num_train_epochs": 3,
"learning_rate": 3e-05,
"warmup_steps": 30,
"logging_steps": 1,
"evaluation_strategy": "epoch",
"save_strategy": "epoch",
"src_length": 1024,
"max_length": 2048,
"bf16": true,
"fp16_opt_level": "O2",
"do_train": true,
"do_eval": true,
"disable_tqdm": true,
"load_best_model_at_end": true,
"eval_with_do_generation": false,
"metric_for_best_model": "accuracy",
"recompute": true,
"save_total_limit": 1,
"tensor_parallel_degree": 4,
"pipeline_parallel_degree": 1
}
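
This SFT config sets tensor_parallel_degree to 4, so it needs a 4-GPU distributed launch; a hypothetical command (the entry-script name is an assumption, not confirmed by this PR):

    # Hypothetical launch; paddle.distributed.launch spawns one worker per listed GPU
    python -m paddle.distributed.launch --gpus "0,1,2,3" finetune_generation.py mistral/sft_argument.json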
10 changes: 9 additions & 1 deletion llm/utils.py
@@ -86,6 +86,14 @@ def get_prefix_tuning_params(model):
         hidden_size = model.config.hidden_size
         postprocess_past_key_value = qwen_postprocess_past_key_value
         multi_query_group_num = None
+    elif model.base_model_prefix == "mistral":
+        from paddlenlp.peft.prefix import llama_postprocess_past_key_value
+
+        num_attention_heads = model.config.num_attention_heads
+        num_hidden_layers = model.config.num_hidden_layers
+        hidden_size = model.config.hidden_size
+        postprocess_past_key_value = llama_postprocess_past_key_value
+        multi_query_group_num = model.config.num_attention_heads // model.config.num_key_value_heads
     else:
         raise ValueError(f"Unknown base_model_prefix: {model.base_model_prefix}. ")
     return dict(
@@ -112,7 +120,7 @@ def get_lora_target_modules(model):
         ]
     elif model.base_model_prefix == "bloom":
         target_modules = [".*query_key_value.*", ".*dense.*", ".*dense_h_to_4h.*", ".*dense_4h_to_h.*"]
-    elif model.base_model_prefix == "llama" or isinstance(model, LlamaForCausalLMPipe):
+    elif model.base_model_prefix == "llama" or model.base_model_prefix == "mistral" or isinstance(model, LlamaForCausalLMPipe):
         target_modules = [
             ".*q_proj.*",
             ".*v_proj.*",
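
With Mistral-7B's defaults (32 attention heads, 8 key-value heads, see the configuration file below), the grouped-query bookkeeping added above works out as:

    # Values from the MistralConfig defaults in this PR
    num_attention_heads = 32
    num_key_value_heads = 8
    multi_query_group_num = num_attention_heads // num_key_value_heads  # 4 query heads share each KV head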
2 changes: 2 additions & 0 deletions paddlenlp/transformers/__init__.py
@@ -279,6 +279,8 @@
 from .rw.modeling import *
 from .rw.configuration import *
 from .rw.tokenizer import *
+from .mistral.modeling import *
+from .mistral.configuration import *
 from .qwen import *

# For faster tokenizer
1 change: 1 addition & 0 deletions paddlenlp/transformers/auto/modeling.py
@@ -127,6 +127,7 @@
("Blip", "blip"),
("Bloom", "bloom"),
("QWen", "qwen"),
("Mistral", "mistral"),
]
)

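
With the ("Mistral", "mistral") entry registered, the Auto classes can resolve the new architecture by name, for example:

    # Assumes the standard PaddleNLP Auto API
    from paddlenlp.transformers import AutoConfig, AutoModelForCausalLM

    config = AutoConfig.from_pretrained("mistralai/Mistral-7B-v0.1")
    model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", dtype="float16")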
15 changes: 15 additions & 0 deletions paddlenlp/transformers/mistral/__init__.py
@@ -0,0 +1,15 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .configuration import MistralConfig
from .modeling import MistralForCausalLM

Contributor: Why doesn't mistral have a tokenizer file?
69 changes: 69 additions & 0 deletions paddlenlp/transformers/mistral/configuration.py
@@ -0,0 +1,69 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Mistral model configuration"""

from ..configuration_utils import PretrainedConfig


class MistralConfig(PretrainedConfig):
    model_type = "mistral"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=14336,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=8,
        hidden_act="silu",
        max_position_embeddings=4096 * 32,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        sliding_window=4096,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.sliding_window = sliding_window

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads
[Codecov / codecov/patch annotation: added line paddlenlp/transformers/mistral/configuration.py#L54 was not covered by tests]

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
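
The defaults mirror Mistral-7B-v0.1: grouped-query attention with 32 query heads over 8 key-value heads, a 4096-token sliding attention window, and 4096 * 32 = 131072 maximum positions. A quick sanity check:

    # Exercising the configuration class added in this PR
    from paddlenlp.transformers import MistralConfig

    config = MistralConfig()
    assert config.max_position_embeddings == 4096 * 32  # 131072
    assert config.num_attention_heads // config.num_key_value_heads == 4  # GQA group size

    # A scaled-down variant for tests or quick experiments
    tiny = MistralConfig(hidden_size=512, num_hidden_layers=4, num_attention_heads=8, num_key_value_heads=2)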