@@ -268,6 +268,24 @@ def _dynabert_init(self, model, eval_dataloader):
     return ofa_model, teacher_model
 
 
+def check_dynabert_config(net_config, width_mult):
+    '''
+    Corrects net_config for the OFA model if necessary.
+    '''
+    if 'electra.embeddings_project' in net_config:
+        net_config["electra.embeddings_project"]['expand_ratio'] = 1.0
+    for key in net_config:
+        # Makes sure the last dim of these Linear weights is expanded by
+        # `width_mult`.
+        if 'q_proj' in key or 'k_proj' in key or 'v_proj' in key or 'linear1' in key:
+            net_config[key]['expand_ratio'] = width_mult
+        # Keeps the last dim of these Linear weights the same as before.
+        elif 'out_proj' in key or 'linear2' in key:
+            net_config[key]['expand_ratio'] = 1.0
+    return net_config
+
+
 def _dynabert_training(self, ofa_model, model, teacher_model, train_dataloader,
                        eval_dataloader, num_train_epochs):
@@ -388,6 +406,7 @@ def evaluate_token_cls(model, data_loader):
                 # Step8: Broadcast supernet config from width_mult,
                 # and use this config in supernet training.
                 net_config = utils.dynabert_config(ofa_model, width_mult)
+                net_config = check_dynabert_config(net_config, width_mult)
                 ofa_model.set_net_config(net_config)
                 if "token_type_ids" in batch:
                     logits, teacher_logits = ofa_model(
@@ -424,6 +443,7 @@ def evaluate_token_cls(model, data_loader):
             if global_step % self.args.save_steps == 0:
                 for idx, width_mult in enumerate(self.args.width_mult_list):
                     net_config = utils.dynabert_config(ofa_model, width_mult)
+                    net_config = check_dynabert_config(net_config, width_mult)
                     ofa_model.set_net_config(net_config)
                     tic_eval = time.time()
                     logger.info("width_mult %s:" % round(width_mult, 2))
@@ -479,6 +499,7 @@ def _dynabert_export(self, ofa_model):
         origin_model = self.model.__class__.from_pretrained(model_dir)
         ofa_model.model.set_state_dict(state_dict)
         best_config = utils.dynabert_config(ofa_model, width_mult)
+        best_config = check_dynabert_config(best_config, width_mult)
         origin_model_new = ofa_model.export(best_config,
                                             input_shapes=[[1, 1], [1, 1]],
                                             input_dtypes=['int64', 'int64'],
@@ -561,7 +582,9 @@ def _batch_generator_func():
             optimize_model=False)
         post_training_quantization.quantize()
         post_training_quantization.save_quantized_model(
-            save_model_path=os.path.join(model_dir, algo + str(batch_size)),
+            save_model_path=os.path.join(
+                model_dir, algo +
+                "_".join([str(batch_size), str(batch_nums)])),
             model_filename=args.output_filename_prefix + ".pdmodel",
             params_filename=args.output_filename_prefix + ".pdiparams")
@@ -632,6 +655,8 @@ def auto_model_forward(self,
         embedding_kwargs["input_ids"] = input_ids
 
     embedding_output = self.embeddings(**embedding_kwargs)
+    if hasattr(self, "embeddings_project"):
+        embedding_output = self.embeddings_project(embedding_output)
 
     self.encoder._use_cache = use_cache  # To be consistent with HF
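For reference, a minimal standalone sketch of what the new `check_dynabert_config` helper does to a supernet config: QKV projections and the first FFN linear follow `width_mult`, while output projections, the second FFN linear, and Electra's `embeddings_project` keep `expand_ratio` 1.0 so the last dim of their weights stays at full size. The layer names in the toy dict below are hypothetical stand-ins for the keys that `utils.dynabert_config` would actually produce, chosen only to exercise each branch.

```python
# Standalone sketch of the config fix-up added in this commit. The toy config
# keys are hypothetical; real keys come from paddleslim's utils.dynabert_config.

def check_dynabert_config(net_config, width_mult):
    """Corrects net_config for the OFA model if necessary (same logic as the patch)."""
    if 'electra.embeddings_project' in net_config:
        net_config["electra.embeddings_project"]['expand_ratio'] = 1.0
    for key in net_config:
        # QKV projections and the first FFN linear shrink with width_mult.
        if 'q_proj' in key or 'k_proj' in key or 'v_proj' in key or 'linear1' in key:
            net_config[key]['expand_ratio'] = width_mult
        # Output projections and the second FFN linear keep their last dim.
        elif 'out_proj' in key or 'linear2' in key:
            net_config[key]['expand_ratio'] = 1.0
    return net_config


if __name__ == "__main__":
    width_mult = 0.75
    # Hypothetical supernet config covering each branch of the helper.
    toy_config = {
        'electra.embeddings_project': {'expand_ratio': width_mult},
        'electra.encoder.layers.0.self_attn.q_proj': {'expand_ratio': 1.0},
        'electra.encoder.layers.0.self_attn.out_proj': {'expand_ratio': width_mult},
        'electra.encoder.layers.0.linear1': {'expand_ratio': 1.0},
        'electra.encoder.layers.0.linear2': {'expand_ratio': width_mult},
    }
    for name, cfg in check_dynabert_config(toy_config, width_mult).items():
        print(name, cfg['expand_ratio'])
    # Expected: embeddings_project, out_proj and linear2 -> 1.0;
    # q_proj and linear1 -> 0.75
```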