PaddlePaddle · wawltor · Jun 14, 2024 · May 23, 2024 · May 23, 2024 · May 24, 2024
diff --git a/llm/argument.py b/llm/argument.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from dataclasses import dataclass, field
-from typing import List, Optional
+from typing import Optional
 
 from paddlenlp.trainer import TrainingArguments
 from paddlenlp.trainer.trainer_utils import IntervalStrategy
@@ -105,30 +105,17 @@ class ModelArgument:
     model_name_or_path: str = field(
         default=None, metadata={"help": "Build-in pretrained model name or the path to local model."}
     )
-    use_flash_attention: bool = field(default=False, metadata={"help": "Whether to use flash attention"})
     tokenizer_name_or_path: Optional[str] = field(
         default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
     )
-    use_fused_rms_norm: bool = field(
-        default=False,
-        metadata={"help": "llama or other model, use_fused_rms_norm"},
-    )
     fuse_attention_qkv: bool = field(
-        default=False,
+        default=None,
         metadata={"help": "whether to fuse attention qkv"},
     )
     fuse_attention_ffn: bool = field(
-        default=False,
+        default=None,
         metadata={"help": "whether to fuse first up and gate proj in mlp block"},
     )
-    recompute_granularity: str = field(
-        default="full",
-        metadata={"help": "Choose among ['full', 'core_attn', 'full_attn']"},
-    )
-    virtual_pp_degree: int = field(
-        default=1,
-        metadata={"help": "virtual_pp_degree"},
-    )
     hidden_dropout_prob: float = field(default=0.1, metadata={"help": "The hidden dropout prob."})
     attention_probs_dropout_prob: float = field(default=0.1, metadata={"help": "The attention hidden dropout prob."})
 
@@ -138,32 +125,6 @@ class ModelArgument:
             "help": "Pre-training from existing paddlenlp model weights. Default False and model will train from scratch. If set True, the model_name_or_path argument must exist in the paddlenlp models."
         },
     )
-    sequence_parallel: bool = field(
-        default=False,
-        metadata={"help": "whether to use sequence parallel"},
-    )
-    fuse_sequence_parallel_allreduce: bool = field(
-        default=False,
-        metadata={"help": "whether to use fuse sequence parallel allreduce"},
-    )
-    use_fused_rope: Optional[bool] = field(
-        default=False,
-        metadata={"help": "Enable rope fusion or not."},
-    )
-    no_recompute_layers: Optional[List[int]] = field(
-        default=None,
-        metadata={"help": "Specify the full transformer layers that should not be recomputed."},
-    )
-    pp_recompute_interval: int = field(
-        default=1,
-        metadata={
-            "help": "The interval for the number of layers at which recomputation occurs. A value of 0 indicates no recomputation. Default is 0."
-        },
-    )
-    recompute_use_reentrant: bool = field(
-        default=False,
-        metadata={"help": "recompute_use_reentrant"},
-    )
     weight_quantize_algo: str = field(
         default=None,
         metadata={

diff --git a/llm/finetune_generation.py b/llm/finetune_generation.py
@@ -49,10 +49,12 @@
 from paddlenlp.transformers import (
     AutoConfig,
     AutoModelForCausalLM,
+    AutoModelForCausalLMPipe,
     AutoTokenizer,
     Llama3Tokenizer,
     LlamaTokenizer,
 )
+from paddlenlp.transformers.configuration_utils import LlmMetaConfig, llmmetaclass
 from paddlenlp.utils.log import logger
 
 # Fine-tune Environment Variables to support sharding stage1 overlap optimization.
@@ -68,6 +70,7 @@ def docstring_decorator(fn):
 
 
 @dataclass
+@llmmetaclass
 @add_start_docstrings(TrainingArguments.__doc__)
 class FinetuneArguments(TrainingArguments):
     decay_steps: int = field(
@@ -146,65 +149,45 @@ def main():
 
     model_config = AutoConfig.from_pretrained(
         model_args.model_name_or_path,
-        tensor_parallel_output=training_args.tensor_parallel_output,
-        tensor_parallel_degree=training_args.tensor_parallel_degree,
-        tensor_parallel_rank=training_args.tensor_parallel_rank,
         dtype=dtype,
         from_aistudio=model_args.from_aistudio,
         quantization_config=quantization_config,
     )
-    if hasattr(model_config, "use_flash_attention"):
-        model_config.use_flash_attention = model_args.use_flash_attention
-
-    model_config.use_fused_rms_norm = model_args.use_fused_rms_norm
-    model_config.fuse_attention_qkv = model_args.fuse_attention_qkv
-    model_config.fuse_attention_ffn = model_args.fuse_attention_ffn
-    model_config.recompute_granularity = model_args.recompute_granularity
-    model_config.virtual_pp_degree = model_args.virtual_pp_degree
-    model_config.sequence_parallel = model_args.sequence_parallel
-    model_config.fuse_sequence_parallel_allreduce = model_args.fuse_sequence_parallel_allreduce
-    model_config.use_fused_rope = model_args.use_fused_rope
-
-    model_config.no_recompute_layers = model_args.no_recompute_layers
-    model_config.pp_recompute_interval = model_args.pp_recompute_interval
-    model_config.recompute_use_reentrant = model_args.recompute_use_reentrant
-    model_config.use_recompute = training_args.recompute
-
-    model_config.tensor_parallel_degree = training_args.tensor_parallel_degree
-    model_config.tensor_parallel_rank = training_args.tensor_parallel_rank
+
+    LlmMetaConfig.set_llm_config(model_config, training_args)
 
     # Config for model using dropout, such as GPT.
-    model_config.hidden_dropout_prob = model_args.hidden_dropout_prob
-    model_config.attention_probs_dropout_prob = model_args.attention_probs_dropout_prob
+    if hasattr(model_config, "hidden_dropout_prob"):
+        model_config.hidden_dropout_prob = model_args.hidden_dropout_prob
+    if hasattr(model_config, "attention_probs_dropout_prob"):
+        model_config.attention_probs_dropout_prob = model_args.attention_probs_dropout_prob
+
+    if model_args.fuse_attention_qkv is not None:
+        model_config.fuse_attention_qkv = model_args.fuse_attention_qkv
+    if model_args.fuse_attention_ffn is not None:
+        model_config.fuse_attention_ffn = model_args.fuse_attention_ffn
 
-    model_config.sep_parallel_degree = training_args.sep_parallel_degree
-    model_config.tensor_parallel_output = training_args.tensor_parallel_output
     model_config.seq_length = data_args.max_length
 
+    print("Final model config:", model_config)
+
+    model_class = AutoModelForCausalLM
     if training_args.pipeline_parallel_degree > 1:
         if data_args.eval_with_do_generation and training_args.do_eval:
             raise ValueError("Plese set eval_with_do_generation to false in pipeline parallel mode.")
-        from paddlenlp.transformers import AutoModelForCausalLMPipe
 
-        if not training_args.autotuner_benchmark:
-            model = AutoModelForCausalLMPipe.from_pretrained(
-                model_args.model_name_or_path,
-                config=model_config,
-                from_aistudio=model_args.from_aistudio,
-            )
-        else:
-            # NOTE(gongenlei): new add autotuner_benchmark
-            model = AutoModelForCausalLMPipe.from_config(model_config, dtype=dtype)
+        model_class = AutoModelForCausalLMPipe
+
+    if not training_args.autotuner_benchmark:
+        model = model_class.from_pretrained(
+            model_args.model_name_or_path,
+            config=model_config,
+            from_aistudio=model_args.from_aistudio,
+        )
     else:
-        if not training_args.autotuner_benchmark:
-            model = AutoModelForCausalLM.from_pretrained(
-                model_args.model_name_or_path,
-                config=model_config,
-                from_aistudio=model_args.from_aistudio,
-            )
-        else:
-            # NOTE(gongenlei): new add autotuner_benchmark
-            model = AutoModelForCausalLM.from_config(model_config, dtype=dtype)
+        # NOTE(gongenlei): new add autotuner_benchmark
+        model = model_class.from_config(model_config, dtype=dtype)
+
     if training_args.do_train and model_args.neftune:
         # Inspired by https://github.com/neelsjain/NEFTune
         if hasattr(model, "get_input_embeddings"):

diff --git a/llm/llama/tests/unified-ckpt-llama-500m/config.json b/llm/llama/tests/unified-ckpt-llama-500m/config.json
@@ -12,7 +12,6 @@
   "num_attention_heads": 8,
   "num_hidden_layers": 8,
   "pad_token_id": 0,
-  "paddlenlp_version": null,
   "rms_norm_eps": 1e-06,
   "vocab_size": 32000
 }