From 8a99eb45e08f8b952baa73fb387e9d4717be7c92 Mon Sep 17 00:00:00 2001 From: Yam0214 Date: Tue, 15 Nov 2022 02:48:18 +0000 Subject: [PATCH 01/10] fix bug for t5 which will occured when encoder_output is not None --- model_zoo/uie/evaluate.py | 2 +- paddlenlp/transformers/t5/modeling.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/model_zoo/uie/evaluate.py b/model_zoo/uie/evaluate.py index 3527a4b7e377..c88d748e5735 100644 --- a/model_zoo/uie/evaluate.py +++ b/model_zoo/uie/evaluate.py @@ -45,7 +45,7 @@ def evaluate(model, metric, data_loader, multilingual=False): else: start_prob, end_prob = model(batch["input_ids"], batch["token_type_ids"], - batch["att_mask"], batch["pos_ids"]) + batch["pos_ids"], batch["att_mask"]) start_ids = paddle.cast(batch["start_positions"], 'float32') end_ids = paddle.cast(batch["end_positions"], 'float32') diff --git a/paddlenlp/transformers/t5/modeling.py b/paddlenlp/transformers/t5/modeling.py index 8dda537f1135..5a46c4b07754 100644 --- a/paddlenlp/transformers/t5/modeling.py +++ b/paddlenlp/transformers/t5/modeling.py @@ -1673,7 +1673,9 @@ def forward(self, logits = output[1] """ - + input_type = type( + decoder_input_ids) if decoder_input_ids is not None else type( + decoder_inputs_embeds) # Encode if needed (training, first prediction pass) if encoder_output is None: # Convert encoder inputs in embeddings if needed @@ -1685,7 +1687,7 @@ def forward(self, output_hidden_states=output_hidden_states, return_dict=return_dict) else: - if isinstance(encoder_output, type(decoder_input_ids)): + if isinstance(encoder_output, input_type): encoder_output = (encoder_output, ) if return_dict and not isinstance(encoder_output, BaseModelOutput): encoder_output = convert_encoder_output(encoder_output) From 617432657a7ec544bf989950eefeefc3f34f5474 Mon Sep 17 00:00:00 2001 From: Yam0214 Date: Tue, 15 Nov 2022 10:26:56 +0000 Subject: [PATCH 02/10] add inputs_embeds to bart and force use_cache=False when labels is provided to save memory during training --- paddlenlp/transformers/bart/modeling.py | 217 ++++++++++++++++++----- tests/transformers/bart/test_modeling.py | 1 + 2 files changed, 169 insertions(+), 49 deletions(-) diff --git a/paddlenlp/transformers/bart/modeling.py b/paddlenlp/transformers/bart/modeling.py index 6db19abd1e18..3fcd97fad859 100644 --- a/paddlenlp/transformers/bart/modeling.py +++ b/paddlenlp/transformers/bart/modeling.py @@ -20,6 +20,7 @@ import paddle.nn.functional as F import paddle.tensor as tensor from paddle.nn import Layer, Embedding +from ...utils.log import logger from .. import PretrainedModel, register_base_model from ..model_outputs import ( @@ -194,6 +195,7 @@ def __init__(self, def forward(self, input_ids=None, attention_mask=None, + inputs_embeds=None, output_attentions=False, output_hidden_states=False, return_dict=False, @@ -206,6 +208,8 @@ def forward(self, See :class:`BartModel`. attention_mask (Tensor, optional): See :class:`BartModel`. + inputs_embeds (Tensor, optional): + See :class:`BartModel`. output_attentions (bool, optional): See :class:`BartModel`. output_hidden_states (bool, optional): @@ -223,15 +227,28 @@ def forward(self, Its data type should be float32 and has a shape of [batch_size, sequence_length, hidden_size]. 
""" - if input_ids is None: - raise ValueError("Input_ids cannot be None.") - inputs_embeds = self.embed_tokens(input_ids) - inputs_embed_pos = self.encoder_embed_positions(paddle.shape(input_ids)) + if input_ids is None and inputs_embeds is None: + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time" + ) + elif input_ids is not None: + inputs_shape = input_ids.shape + input_ids = input_ids.reshape((-1, inputs_shape[-1])) + elif inputs_embeds is not None: + inputs_shape = inputs_embeds.shape[:-1] + else: + raise ValueError( + "You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + inputs_embed_pos = self.encoder_embed_positions(inputs_shape) hidden_states = inputs_embeds + inputs_embed_pos hidden_states = self.encoder_layernorm_embedding(hidden_states) encoder_input = self.encoder_dropout(hidden_states) - if attention_mask is None: + if attention_mask is None and input_ids is not None: attention_mask = paddle.cast( input_ids == self.pad_token_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e4 @@ -297,6 +314,7 @@ def forward(self, decoder_attention_mask=None, encoder_output=None, memory_mask=None, + decoder_inputs_embeds=None, cache=None, output_attentions=False, output_hidden_states=False, @@ -313,6 +331,8 @@ def forward(self, See :class:`BartModel`. memory_mask (Tensor, optional): See :class:`BartModel`. + decoder_inputs_embeds (Tensor, optional): + See :class:`BartModel`. cache (Tensor, optional): See :class:`BartModel`. output_attentions (bool, optional): @@ -332,17 +352,36 @@ def forward(self, Its data type should be float32 and has a shape of [batch_size, sequence_length, hidden_size]. """ + # retrieve input_ids and inputs_embeds + if decoder_input_ids is not None and decoder_inputs_embeds is not None: + raise ValueError( + "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time" + ) + elif decoder_input_ids is not None: + inputs_shape = decoder_input_ids.shape + decoder_input_ids = decoder_input_ids.reshape( + (-1, inputs_shape[-1])) + elif decoder_inputs_embeds is not None: + inputs_shape = decoder_inputs_embeds.shape[:-1] + else: + raise ValueError( + "You have to specify either decoder_input_ids or decoder_inputs_embeds" + ) + if decoder_attention_mask is None: - decoder_length = paddle.shape(decoder_input_ids)[-1] + decoder_length = inputs_shape[-1] decoder_attention_mask = paddle.tensor.triu((paddle.full( (decoder_length, decoder_length), -np.inf, dtype=paddle.get_default_dtype())), 1) - decoder_inputs_embeds = self.embed_tokens(decoder_input_ids) + + if decoder_inputs_embeds is None: + decoder_inputs_embeds = self.embed_tokens(decoder_input_ids) + past_key_values_length = paddle.shape( cache[0][0].k)[2] if cache is not None else 0 decoder_inputs_embed_pos = self.decoder_embed_positions( - paddle.shape(decoder_input_ids), past_key_values_length) + inputs_shape, past_key_values_length) hidden_states = decoder_inputs_embeds + decoder_inputs_embed_pos hidden_states = self.decoder_layernorm_embedding(hidden_states) decoder_input = self.decoder_dropout(hidden_states) @@ -483,11 +522,13 @@ def set_input_embeddings(self, value): self.shared = value def forward(self, - input_ids, + input_ids=None, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, encoder_output=None, + inputs_embeds=None, + decoder_inputs_embeds=None, use_cache=False, cache=None, output_attentions=False, @@ -527,6 +568,19 @@ def 
forward(self, For all element in the tuple, its data type should be float32 and its shape is [`batch_size, sequence_length, hidden_size`]. `attentions` is attentions of all layers of in the Transformer encoder. The length of `attentions` is `num_hidden_layers`. For all element in the tuple, its data type should be float32 and its shape is [`batch_size, num_attention_heads, sequence_length, sequence_length`]. + inputs_embeds (Tensor, optional): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation + of shape `(batch_size, sequence_length, hidden_size)`. This is useful if you want more control over + how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + Default to None. + decoder_inputs_embeds (Tensor, optional): + Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded + representation of shape `(batch_size, target_sequence_length, hidden_size)`. If `cache` is used, + optionally only the last `decoder_inputs_embeds` have to be input (see `past_key_values`). + This is useful if you want more control over how to convert `decoder_input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. Default to None. + If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value + of `inputs_embeds`. use_cache (bool, optional): Whether or not to use cache. Defaults to `False`. If set to `True`, key value states will be returned and can be used to speed up decoding. @@ -568,40 +622,51 @@ def forward(self, ''' # different to other models, Bart automatically creates decoder_input_ids from # inputBartForSequenceClassification_ids if no decoder_input_ids are provided - if input_ids is None and encoder_output is None: + if input_ids is None and inputs_embeds is None and encoder_output is None: raise ValueError( "You have to specify either input_ids or encoder_output") - if decoder_input_ids is None: - assert input_ids is not None, "input_ids should be " \ - "specified when generating decoder_input_ids" + + if decoder_input_ids is None and decoder_inputs_embeds is None: + if input_ids is None: + raise ValueError( + "If no `decoder_input_ids` or `decoder_inputs_embeds` are " + "passed, `input_ids` cannot be `None`. Please pass either " + "`input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`." 
+ ) decoder_input_ids = shift_tokens_right(input_ids, self.decoder_start_token_id) - if attention_mask is None: - assert input_ids is not None, "input_ids should be " \ - "specified when generating attention_mask" + if attention_mask is None and input_ids is not None: + # only generate attention_mask when input_ids is specified attention_mask = paddle.cast( input_ids == self.pad_token_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e4 + if inputs_embeds is not None and input_ids is None and attention_mask is None: + logger.warning("provided inputs_embeds without attention_mask") # For 2D attention_mask from tokenizer elif attention_mask.ndim == 2: attention_mask = paddle.unsqueeze( attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype()) attention_mask = (1.0 - attention_mask) * -1e4 attention_mask.stop_gradient = True + + input_type = type( + decoder_input_ids) if decoder_input_ids is not None else type( + decoder_inputs_embeds) if encoder_output is None: encoder_output = self.encoder( input_ids, attention_mask, + inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True elif return_dict and not isinstance(encoder_output, ModelOutput): - if isinstance(encoder_output, type(decoder_input_ids)): + if isinstance(encoder_output, input_type): encoder_output = (encoder_output, ) encoder_output = convert_encoder_output(encoder_output) - if isinstance(encoder_output, type(decoder_input_ids)): + if isinstance(encoder_output, input_type): encoder_last_hidden_state = encoder_output else: encoder_last_hidden_state = encoder_output[0] @@ -611,18 +676,20 @@ def forward(self, encoder_last_hidden_state) else: cache = None - decoder_output = self.decoder(decoder_input_ids, - decoder_attention_mask, - encoder_last_hidden_state, - attention_mask, - cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict) + decoder_output = self.decoder( + decoder_input_ids, + decoder_attention_mask, + encoder_last_hidden_state, + attention_mask, + cache=cache, + decoder_inputs_embeds=decoder_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict) if not return_dict: - if isinstance(decoder_output, type(decoder_input_ids)): + if isinstance(decoder_output, input_type): decoder_output = (decoder_output, ) - if isinstance(encoder_output, type(decoder_input_ids)): + if isinstance(encoder_output, input_type): encoder_output = (encoder_output, ) return decoder_output + encoder_output @@ -690,11 +757,13 @@ def __init__(self, bart, num_labels=2, dropout=None): self.apply(self.init_weights) def forward(self, - input_ids, + input_ids=None, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, encoder_output=None, + inputs_embeds=None, + decoder_inputs_embeds=None, use_cache=False, cache=None, labels=None, @@ -715,8 +784,12 @@ def forward(self, See :class:`BartModel`. encoder_output (Tensor, optonal): See :class:`BartModel`. - use_cache (bool, optional): + inputs_embeds (Tensor, optional): + See :class:`BartModel`. + decoder_inputs_embeds (Tensor, optional): See :class:`BartModel`. + use_cache (bool, optional): + See :class:`BartModel`. Forcely set to `False` when `labels` is provided that can save memory during training. cache (Tensor, optional): See :class:`BartModel`. 
labels (Tensor, optional): @@ -753,30 +826,47 @@ def forward(self, inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} logits = model(**inputs) """ + if labels is not None: + logger.warning( + "The `use_cache` argument is changed to `False` since `labels` is provided." + ) + use_cache = False + + if input_ids is None and inputs_embeds is not None: + logger.warning( + f"{self.__class__.__name__} will not detect eos tokens in `inputs_embeds`. Results may be " + "unexpected if using eos tokens in conjunction with `inputs_embeds.`" + ) + outputs = self.bart( input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, encoder_output, - use_cache, - cache, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + cache=cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) output = outputs[0] - eos_mask = paddle.cast(input_ids == self.bart.config['eos_token_id'], - dtype='int64') - if len(paddle.unique(paddle.sum(eos_mask, axis=1))) > 1: - raise ValueError( - 'All examples must have the same number of tokens.') - output_shape = paddle.shape(output) - # TODO(gongenlei): support bool tensor index - output = output.masked_select( - eos_mask.unsqueeze(-1).astype('bool').tile([1, 1, - output_shape[-1]])) + + if input_ids is not None: + eos_mask = paddle.cast( + input_ids == self.bart.config['eos_token_id'], dtype='int64') + if len(paddle.unique(paddle.sum(eos_mask, axis=1))) > 1: + raise ValueError( + 'All examples must have the same number of tokens.') + + # TODO(gongenlei): support bool tensor index + output = output.masked_select( + eos_mask.unsqueeze(-1).astype('bool').tile( + [1, 1, output_shape[-1]])) + sentence_representation = output.reshape( [output_shape[0], -1, output_shape[-1]])[:, -1, :] logits = self.classifier(sentence_representation) @@ -830,11 +920,13 @@ def __init__(self, bart): self.apply(self.init_weights) def forward(self, - input_ids, + input_ids=None, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, encoder_output=None, + inputs_embeds=None, + decoder_inputs_embeds=None, use_cache=False, cache=None, start_positions=None, @@ -856,8 +948,12 @@ def forward(self, See :class:`BartModel`. encoder_output (Tensor, optonal): See :class:`BartModel`. - use_cache (bool, optional): + inputs_embeds (Tensor, optional): See :class:`BartModel`. + decoder_inputs_embeds (Tensor, optional): + See :class:`BartModel`. + use_cache (bool, optional): + See :class:`BartModel`. Forcely set to `False` when `start_positions` and `end_positions` are provided that can save memory during training. cache (Tensor, optional): See :class:`BartModel`. start_positions (Tensor, optional): @@ -910,13 +1006,21 @@ def forward(self, start_logits = outputs[0] end_logits =outputs[1] """ + if start_positions is not None and end_positions is not None: + logger.warning( + "The `use_cache` argument is changed to `False` since `start_positions` and `end_positions` are provided." 
+ ) + use_cache = False + outputs = self.bart(input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, encoder_output, - use_cache, - cache, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + cache=cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict) @@ -1019,11 +1123,13 @@ def prepare_faster_entry(self, kwargs): return self._faster_entry def forward(self, - input_ids, + input_ids=None, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, encoder_output=None, + inputs_embeds=None, + decoder_inputs_embeds=None, use_cache=False, cache=None, labels=None, @@ -1044,6 +1150,10 @@ def forward(self, See :class:`BartModel`. encoder_output (Tensor, optonal): See :class:`BartModel`. + inputs_embeds (Tensor, optional): + See :class:`BartModel`. + decoder_inputs_embeds (Tensor, optional): + See :class:`BartModel`. use_cache (bool, optional): See :class:`BartModel`. cache (Tensor, optional): @@ -1088,13 +1198,22 @@ def forward(self, outputs = model(**inputs) """ + if labels is not None: + if use_cache: + logger.warning( + "The `use_cache` argument is changed to `False` since `labels` is provided." + ) + use_cache = False + outputs = self.bart(input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, encoder_output, - use_cache, - cache, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + cache=cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict) diff --git a/tests/transformers/bart/test_modeling.py b/tests/transformers/bart/test_modeling.py index 763d51096a61..665b01bfc0e1 100644 --- a/tests/transformers/bart/test_modeling.py +++ b/tests/transformers/bart/test_modeling.py @@ -436,6 +436,7 @@ class BartModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): test_missing_keys = False use_labels = False return_dict = False + use_test_inputs_embeds = True def setUp(self): self.model_tester = BartModelTester(self) From 64b23f2f2ef36150a383a83eb97f0cbc3393a669 Mon Sep 17 00:00:00 2001 From: Yam0214 Date: Tue, 15 Nov 2022 11:16:18 +0000 Subject: [PATCH 03/10] add inputs_embeds to mbart and force use_cache=False when labels is provided to save memory during training --- paddlenlp/transformers/mbart/modeling.py | 225 +++++++++++++++++----- tests/transformers/mbart/test_modeling.py | 44 ++++- 2 files changed, 223 insertions(+), 46 deletions(-) diff --git a/paddlenlp/transformers/mbart/modeling.py b/paddlenlp/transformers/mbart/modeling.py index 11031d194fd2..20481bd28e69 100644 --- a/paddlenlp/transformers/mbart/modeling.py +++ b/paddlenlp/transformers/mbart/modeling.py @@ -20,7 +20,9 @@ import paddle.nn.functional as F import paddle.tensor as tensor from paddle.nn import Layer, Embedding +from yarl import cache_configure +from ...utils.log import logger from .. import PretrainedModel, register_base_model from ..model_outputs import ( ModelOutput, @@ -264,6 +266,7 @@ def __init__(self, def forward(self, input_ids=None, attention_mask=None, + inputs_embeds=None, output_attentions=False, output_hidden_states=False, return_dict=False, @@ -276,21 +279,45 @@ def forward(self, See :class:`MBartModel`. attention_mask (Tensor, optional): See :class:`MBartModel`. + input_embeds (Tensor, optional): + See :class:`MBartModel`. + output_attentions (bool, optional): + See :class:`MBartModel`. 
+ output_hidden_states (bool, optional): + See :class:`MBartModel`. + return_dict (bool, optional): + See :class:`MBartModel`. Returns: - Tensor: Returns tensor `encoder_output`, which is the output at the last layer of the model. + An instance of :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions`. + Especially, When `return_dict=output_hidden_states=output_attentions=False`, + returns tensor `encoder_outputs` which is the output at the last layer of the model. Its data type should be float32 and has a shape of [batch_size, sequence_length, hidden_size]. - """ - if input_ids is None: - raise ValueError("Input_ids cannot be None.") - inputs_embeds = self.d_model**0.5 * self.embed_tokens(input_ids) - inputs_embed_pos = self.encoder_embed_positions(paddle.shape(input_ids)) + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time" + ) + elif input_ids is not None: + input_shape = input_ids.shape + elif inputs_embeds is not None: + input_shape = inputs_embeds.shape[:-1] + else: + raise ValueError( + "You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.d_model**0.5 * self.embed_tokens(input_ids) + + inputs_embed_pos = self.encoder_embed_positions(input_shape) hidden_states = inputs_embeds + inputs_embed_pos hidden_states = self.encoder_layernorm_embedding(hidden_states) encoder_input = self.encoder_dropout(hidden_states) - if attention_mask is None: + if attention_mask is None and input_ids is not None: attention_mask = paddle.cast( input_ids == self.pad_token_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e4 @@ -361,6 +388,7 @@ def forward( encoder_output=None, memory_mask=None, cache=None, + decoder_inputs_embeds=None, output_attentions=False, output_hidden_states=False, return_dict=False, @@ -379,24 +407,55 @@ def forward( See :class:`MBartModel`. cache (Tensor, optional): See :class:`MBartModel`. + decoder_inputs_embeds (Tensor, optional): + See :class:`MBartModel`. + output_attentions (bool, optional): + See :class:`MBartModel`. + output_hidden_states (bool, optional): + See :class:`MBartModel`. + return_dict (bool, optional): + See :class:`MBartModel`. Returns: - Tensor: Returns tensor `decoder_output`, which is the output at the last layer of the model. + An instance of :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions`. + Especially, When `return_dict=output_hidden_states=output_attentions=False`, + returns tensor `decoder_outputs` which is the output at the last layer of the model. Its data type should be float32 and has a shape of [batch_size, sequence_length, hidden_size]. 
""" + # retrieve input_ids and inputs_embeds + if decoder_input_ids is not None and decoder_inputs_embeds is not None: + raise ValueError( + "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time" + ) + elif decoder_input_ids is not None: + decoder_input_shape = decoder_input_ids.shape + decoder_input_ids = decoder_input_ids.reshape( + (-1, decoder_input_shape[-1])) + elif decoder_inputs_embeds is not None: + decoder_input_shape = decoder_inputs_embeds.shape[:-1] + else: + raise ValueError( + "You have to specify either decoder_input_ids or decoder_inputs_embeds" + ) + if decoder_attention_mask is None: - decoder_length = paddle.shape(decoder_input_ids)[-1] + decoder_length = decoder_input_shape[-1] decoder_attention_mask = paddle.tensor.triu((paddle.full( (decoder_length, decoder_length), -np.inf, dtype=paddle.get_default_dtype())), 1) - decoder_inputs_embeds = self.d_model**0.5 * self.embed_tokens( - decoder_input_ids) + if decoder_inputs_embeds is None: + decoder_inputs_embeds = self.d_model**0.5 * self.embed_tokens( + decoder_input_ids) + past_key_values_length = paddle.shape( cache[0][0].k)[2] if cache is not None else 0 decoder_inputs_embed_pos = self.decoder_embed_positions( - decoder_input_ids.shape, past_key_values_length) + decoder_input_shape, past_key_values_length) hidden_states = decoder_inputs_embeds + decoder_inputs_embed_pos hidden_states = self.decoder_layernorm_embedding(hidden_states) decoder_input = self.decoder_dropout(hidden_states) @@ -535,13 +594,15 @@ def set_input_embeddings(self, value): self.shared = value def forward(self, - input_ids, + input_ids=None, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, encoder_output=None, use_cache=False, cache=None, + inputs_embeds=None, + decoder_inputs_embeds=None, output_attentions=False, output_hidden_states=False, return_dict=False): @@ -579,6 +640,19 @@ def forward(self, For all element in the tuple, its data type should be float32 and its shape is [`batch_size, sequence_length, hidden_size`]. `attentions` is attentions of all layers of in the Transformer encoder. The length of `attentions` is `num_hidden_layers`. For all element in the tuple, its data type should be float32 and its shape is [`batch_size, num_attention_heads, sequence_length, sequence_length`]. + inputs_embeds (Tensor, optional): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation + of shape `(batch_size, sequence_length, hidden_size)`. This is useful if you want more control over + how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + Default to None. + decoder_inputs_embeds (Tensor, optional): + Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded + representation of shape `(batch_size, target_sequence_length, hidden_size)`. If `cache` is used, + optionally only the last `decoder_inputs_embeds` have to be input (see `past_key_values`). + This is useful if you want more control over how to convert `decoder_input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. Default to None. + If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value + of `inputs_embeds`. use_cache (bool, optional): Whether or not to use cache. Defaults to `False`. If set to `True`, key value states will be returned and can be used to speed up decoding. 
@@ -621,16 +695,21 @@ def forward(self, ''' # different to other models, MBart automatically creates decoder_input_ids from # input MBartForSequenceClassification_ids if no decoder_input_ids are provided - if input_ids is None and encoder_output is None: + if input_ids is None and inputs_embeds is None and encoder_output is None: raise ValueError( - "You have to specify either input_ids or encoder_output") - if decoder_input_ids is None: - assert input_ids is not None, "input_ids should be " \ - "specified when generating decoder_input_ids" + "You have to specify one of input_ids, inputs_embeds and encoder_output" + ) + if decoder_input_ids is None and decoder_inputs_embeds is None: + if input_ids is None: + raise ValueError( + "If no `decoder_input_ids` or `decoder_inputs_embeds` are " + "passed, `input_ids` cannot be `None`. Please pass either " + "`input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`." + ) decoder_input_ids = shift_tokens_right(input_ids, self.pad_token_id) - if attention_mask is None: - assert input_ids is not None, "input_ids should be " \ - "specified when generating attention_mask" + if attention_mask is None and input_ids is not None: + # assert input_ids is not None, "input_ids should be " \ + # "specified when generating attention_mask" attention_mask = paddle.cast( input_ids == self.pad_token_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e4 @@ -640,20 +719,26 @@ def forward(self, attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype()) attention_mask = (1.0 - attention_mask) * -1e4 attention_mask.stop_gradient = True + + input_type = type( + decoder_input_ids) if decoder_input_ids is not None else type( + decoder_inputs_embeds) + if encoder_output is None: encoder_output = self.encoder( input_ids, attention_mask, + inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True elif return_dict and not isinstance(encoder_output, ModelOutput): - if isinstance(encoder_output, type(decoder_input_ids)): + if isinstance(encoder_output, input_type): encoder_output = (encoder_output, ) encoder_output = convert_encoder_output(encoder_output) - if isinstance(encoder_output, type(decoder_input_ids)): + if isinstance(encoder_output, input_type): encoder_last_hidden_state = encoder_output else: encoder_last_hidden_state = encoder_output[0] @@ -669,15 +754,16 @@ def forward(self, encoder_last_hidden_state, attention_mask, cache, + decoder_inputs_embeds=decoder_inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) if not return_dict: - if isinstance(decoder_output, type(decoder_input_ids)): + if isinstance(decoder_output, input_type): decoder_output = (decoder_output, ) - if isinstance(encoder_output, type(decoder_input_ids)): + if isinstance(encoder_output, input_type): encoder_output = (encoder_output, ) return decoder_output + encoder_output @@ -744,13 +830,15 @@ def __init__(self, mbart, num_labels=2, dropout=None): self.apply(self.init_weights) def forward(self, - input_ids, + input_ids=None, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, encoder_output=None, use_cache=False, cache=None, + inputs_embeds=None, + decoder_inputs_embeds=None, labels=None, output_attentions=False, output_hidden_states=False, @@ -773,6 +861,10 @@ def forward(self, See :class:`MBartModel`. 
cache (Tensor, optional): See :class:`MBartModel`. + inputs_embeds (Tensor, optional): + See :class:`MBartModel`. + decoder_inputs_embeds (Tensor, optional): + See :class:`MBartModel`. labels (Tensor, optional): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., num_labels - 1]`. If `num_labels > 1` a classification loss is computed (Cross-Entropy). @@ -806,30 +898,45 @@ def forward(self, inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} logits = model(**inputs) """ + if labels is not None: + logger.warning( + "The `use_cache` argument is changed to `False` since `labels` is provided." + ) + use_cache = False + + if input_ids is None and inputs_embeds is not None: + logger.warning( + f"{self.__class__.__name__} will not detect eos tokens in `inputs_embeds`. Results may be " + "unexpected if using eos tokens in conjunction with `inputs_embeds.`" + ) + outputs = self.mbart( input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, encoder_output, - use_cache, - cache, + use_cache=use_cache, + cache=cache, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) output = outputs[0] - eos_mask = paddle.cast(input_ids == self.mbart.config['eos_token_id'], - dtype='int64') - if len(paddle.unique(paddle.sum(eos_mask, axis=1))) > 1: - raise ValueError( - 'All examples must have the same number of tokens.') - output_shape = paddle.shape(output) - # TODO(gongenlei): support bool tensor index - output = output.masked_select( - eos_mask.unsqueeze(-1).astype('bool').tile([1, 1, - output_shape[-1]])) + if input_ids is not None: + eos_mask = paddle.cast( + input_ids == self.mbart.config['eos_token_id'], dtype='int64') + if len(paddle.unique(paddle.sum(eos_mask, axis=1))) > 1: + raise ValueError( + 'All examples must have the same number of tokens.') + + # TODO(gongenlei): support bool tensor index + output = output.masked_select( + eos_mask.unsqueeze(-1).astype('bool').tile( + [1, 1, output_shape[-1]])) sentence_representation = output.reshape( [output_shape[0], -1, output_shape[-1]])[:, -1, :] logits = self.classifier(sentence_representation) @@ -883,13 +990,15 @@ def __init__(self, mbart): self.apply(self.init_weights) def forward(self, - input_ids, + input_ids=None, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, encoder_output=None, use_cache=False, cache=None, + inputs_embeds=None, + decoder_inputs_embeds=None, start_positions=None, end_positions=None, output_attentions=False, @@ -909,6 +1018,10 @@ def forward(self, See :class:`MBartModel`. encoder_output (Tensor, optonal): See :class:`MBartModel`. + inputs_embeds (Tensor, optional): + See :class:`MBartModel`. + decoder_inputs_embeds (Tensor, optional): + See :class:`MBartModel`. use_cache (bool, optional): See :class:`MBartModel`. cache (Tensor, optional): @@ -963,14 +1076,21 @@ def forward(self, start_logits = outputs[0] end_logits =outputs[1] """ + if start_positions is not None and end_positions is not None: + logger.warning( + "The `use_cache` argument is changed to `False` since `start_positions` and `end_positions` are provided." 
+ ) + use_cache = False outputs = self.mbart( input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, encoder_output, - use_cache, - cache, + use_cache=use_cache, + cache=cache, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, @@ -1068,13 +1188,15 @@ def prepare_faster_entry(self, kwargs): return self._faster_entry def forward(self, - input_ids, + input_ids=None, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, encoder_output=None, use_cache=False, cache=None, + inputs_embeds=None, + decoder_inputs_embeds=None, labels=None, output_attentions=False, output_hidden_states=False, @@ -1093,11 +1215,15 @@ def forward(self, See :class:`MBartModel`. encoder_output (Tensor, optonal): See :class:`MBartModel`. + See :class:`MBartModel`. use_cache (bool, optional): See :class:`MBartModel`. cache (Tensor, optional): See :class:`MBartModel`. - abels (Tensor, optional): + inputs_embeds (Tensor, optional): + See :class:`MBartModel`. + decoder_inputs_embeds (Tensor, optional): + labels (Tensor, optional): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., vocab_size]`. @@ -1140,14 +1266,23 @@ def forward(self, outputs = model(**inputs) """ + if labels is not None: + if use_cache: + logger.warning( + "The `use_cache` argument is changed to `False` since `labels` is provided." + ) + use_cache = False + outputs = self.mbart( input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, encoder_output, - use_cache, - cache, + use_cache=use_cache, + cache=cache, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, diff --git a/tests/transformers/mbart/test_modeling.py b/tests/transformers/mbart/test_modeling.py index e24c8fecd158..a954ca5ddef2 100644 --- a/tests/transformers/mbart/test_modeling.py +++ b/tests/transformers/mbart/test_modeling.py @@ -14,7 +14,7 @@ # limitations under the License. 
import tempfile - +import copy from tests.testing_utils import slow, PaddleNLPModelTest from ..test_generation_utils import GenerationTesterMixin @@ -252,6 +252,48 @@ def test_decoder_model_past_with_large_inputs(self): self.model_tester.create_and_check_decoder_model_past_large_inputs( *config_and_inputs) + def test_inputs_embeds_for_mbart(self): + # rewrite test inputs embeds for mbart model since scaler not equal to 1.0 + # get config for model and inputs_dict for model forward + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( + ) + scaler = config["d_model"]**0.5 + # test all model classes + for model_class in self.all_model_classes: + model = self._make_model_instance(config, model_class) + model.eval() + + inputs = copy.deepcopy( + self._prepare_for_class(inputs_dict, model_class)) + + with paddle.no_grad(): + ids_output = model(**inputs) + + if not self.is_encoder_decoder: + input_ids = inputs["input_ids"] + del inputs["input_ids"] + else: + encoder_input_ids = inputs["input_ids"] + decoder_input_ids = inputs.get("decoder_input_ids", + encoder_input_ids) + del inputs["input_ids"] + inputs.pop("decoder_input_ids", None) + + wte = model.get_input_embeddings() + if not self.is_encoder_decoder: + inputs["inputs_embeds"] = wte(input_ids) * scaler + else: + inputs["inputs_embeds"] = wte(encoder_input_ids) * scaler + inputs["decoder_inputs_embeds"] = wte( + decoder_input_ids) * scaler + + with paddle.no_grad(): + embeds_output = model(**inputs) + + self.assertTrue( + paddle.allclose(ids_output, embeds_output, rtol=1e-4, + atol=1e-4)) + def assert_tensors_close(a, b, atol=1e-12, prefix=""): """If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error.""" From 7ff588e3cdd0274f0404c63a213aec58e5744c15 Mon Sep 17 00:00:00 2001 From: Yam0214 Date: Wed, 16 Nov 2022 08:51:23 +0000 Subject: [PATCH 04/10] add inputs_embeds to codegen --- paddlenlp/transformers/codegen/modeling.py | 64 ++++++++++++++------- tests/transformers/codegen/test_modeling.py | 1 + 2 files changed, 45 insertions(+), 20 deletions(-) diff --git a/paddlenlp/transformers/codegen/modeling.py b/paddlenlp/transformers/codegen/modeling.py index 1934eb261742..04526a75672b 100644 --- a/paddlenlp/transformers/codegen/modeling.py +++ b/paddlenlp/transformers/codegen/modeling.py @@ -17,7 +17,7 @@ import paddle.nn as nn import paddle.nn.functional as F from paddle.nn import Layer, Embedding - +from ...utils.log import logger from ..nezha.modeling import ACT2FN from .. import PretrainedModel, register_base_model from ..model_outputs import (BaseModelOutputWithPastAndCrossAttentions, @@ -434,6 +434,7 @@ def forward( token_type_ids=None, use_cache=False, cache=None, + inputs_embeds=None, output_attentions=False, output_hidden_states=False, return_dict=False, @@ -464,6 +465,11 @@ def forward( See `TransformerDecoder.gen_cache `__ for more details. It is only used for inference and should be None for training. Default to `None`. + inputs_embeds (Tensor, optional): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation + of shape `(batch_size, sequence_length, hidden_size)`. This is useful if you want more control over + how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + Default to None. output_attentions (bool, optional): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. 
Defaults to `False`. @@ -492,12 +498,20 @@ def forward( output = model(**inputs) ''' - if input_ids is not None: + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time" + ) + elif input_ids is not None: input_shape = input_ids.shape - input_ids = input_ids.reshape(shape=(-1, input_shape[-1])) + input_ids = input_ids.reshape((-1, input_shape[-1])) batch_size = input_ids.shape[0] + elif inputs_embeds is not None: + input_shape = inputs_embeds.shape[:-1] + batch_size = inputs_embeds.shape[0] else: - raise ValueError("You have to specify input_ids") + raise ValueError( + "You have to specify either input_ids or inputs_embeds") if cache is None: past_length = 0 @@ -505,25 +519,30 @@ def forward( else: past_length = cache[0][0].shape[-2] - # Attention mask. + +#TODO SLOW TEST +# Attention mask. if attention_mask is None: - assert input_ids is not None, "input_ids should be " \ - "specified when generating attention_mask" - if batch_size == 1 and past_length != 0: - batch_size, seq_len = input_shape - attention_mask = paddle.zeros( - [batch_size, 1, 1, seq_len + past_length], - dtype=paddle.get_default_dtype()) + if input_ids is not None: + if batch_size == 1 and past_length != 0: + batch_size, seq_len = input_shape + attention_mask = paddle.zeros( + [batch_size, 1, 1, seq_len + past_length], + dtype=paddle.get_default_dtype()) + else: + attention_mask = paddle.cast( + input_ids == self.pad_token_id, + dtype=paddle.get_default_dtype()).unsqueeze([1, 2 + ]) * -1e4 else: - attention_mask = paddle.cast( - input_ids == self.pad_token_id, - dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e4 + logger.warning("provided inputs_embeds without attention_mask") # For 2D attention_mask from tokenizer elif attention_mask.ndim == 2: attention_mask = paddle.unsqueeze( attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype()) attention_mask = (1.0 - attention_mask) * -1e4 - attention_mask.stop_gradient = True + if attention_mask is not None: + attention_mask.stop_gradient = True # TODO: CodeGen Attention Mask is TOO confusion. # When it's 2D, it must be int and it's denoted by 1/0. # When using model.generate() without providing attention mask @@ -531,7 +550,8 @@ def forward( # the attention mask's dtype must be float and it's denoted by 0/-inf. # Moreover, cannot support 3D attention mask. - inputs_embeds = self.wte(input_ids) + if inputs_embeds is None: + inputs_embeds = self.wte(input_ids) if token_type_ids is not None: token_type_embeds = self.wte(token_type_ids) inputs_embeds = inputs_embeds + token_type_embeds @@ -666,6 +686,7 @@ def forward(self, use_cache=False, cache=None, labels=None, + inputs_embeds=None, output_attentions=False, output_hidden_states=False, return_dict=False): @@ -684,12 +705,14 @@ def forward(self, Labels for language modeling. Note that the labels are shifted inside the model, i.e. you can set `labels = input_ids` Indices are selected in `[-100, 0, ..., vocab_size]` All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., vocab_size]` + inputs_embeds (Tensor, optional): + See :class:`CodeGenModel`. output_attentions (bool, optional): - See :class: `CodeGenModel` + See :class: `CodeGenModel`. output_hidden_states (bool, optional): - See :class: `CodeGenModel` + See :class: `CodeGenModel`. return_dict (bool, optional): - See :class: `CodeGenModel` + See :class: `CodeGenModel`. 
Returns: An instance of :class:`~paddlenlp.transformers.model_outputs.CausalLMOutputWithPastAndCrossAttentions` if `return_dict=True`. Otherwise it returns a tuple of tensors corresponding @@ -715,6 +738,7 @@ def forward(self, token_type_ids=token_type_ids, use_cache=use_cache, cache=cache, + inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict) diff --git a/tests/transformers/codegen/test_modeling.py b/tests/transformers/codegen/test_modeling.py index 1ed627e9e54c..8e9913ee3325 100644 --- a/tests/transformers/codegen/test_modeling.py +++ b/tests/transformers/codegen/test_modeling.py @@ -417,6 +417,7 @@ class CodeGenModelTest(ModelTesterMixin, GenerationTesterMixin, use_test_model_name_list = False return_dict = False use_labels = False + use_test_inputs_embeds = True # attention mask issue def _get_input_ids_and_config(self): From ed51fba0e17b5c26e77343bd0bdf2d306c4e681b Mon Sep 17 00:00:00 2001 From: Yam0214 Date: Wed, 16 Nov 2022 08:52:23 +0000 Subject: [PATCH 05/10] add inputs_embeds to unimo --- paddlenlp/transformers/unimo/modeling.py | 67 +++++++++++++++++------ tests/transformers/unimo/test_modeling.py | 1 + 2 files changed, 52 insertions(+), 16 deletions(-) diff --git a/paddlenlp/transformers/unimo/modeling.py b/paddlenlp/transformers/unimo/modeling.py index 958d042a208b..64d28c3f9296 100644 --- a/paddlenlp/transformers/unimo/modeling.py +++ b/paddlenlp/transformers/unimo/modeling.py @@ -17,7 +17,7 @@ import paddle.nn as nn import paddle.nn.functional as F from paddle.nn import TransformerEncoder - +from ...utils.log import logger from .. import PretrainedModel, register_base_model from ..model_outputs import CausalLMOutputWithCrossAttentions @@ -243,23 +243,41 @@ def __init__(self, self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size) self.pad_token_id = pad_token_id - def forward(self, input_ids, token_type_ids=None, position_ids=None): - input_embedings = self.word_embeddings(input_ids) + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + input_embeddings=None): + if input_ids is None and input_embeddings is None: + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time" + ) + elif input_ids is not None: + inputs_sample = input_ids + elif input_embeddings is not None: + inputs_sample = input_embeddings[:, :, -1] + else: + raise ValueError( + "You have to specify either input_ids or inputs_embeds") + if input_embeddings is None: + input_embeddings = self.word_embeddings(input_ids) if position_ids is None: if self.pad_token_id is None: position_ids = paddle.expand_as( - paddle.arange(end=paddle.shape(input_ids)[1], - dtype="int64"), input_ids) + paddle.arange(end=paddle.shape(inputs_sample)[1], + dtype="int64"), inputs_sample) else: + assert input_ids is not None, "position_ids or pad_token_ids" \ + " should be provided when input_embedds is specified" num_pad = paddle.sum( (input_ids == self.pad_token_id).astype("float32"), axis=-1, keepdim=True) position_ids = F.relu( paddle.expand_as( - paddle.arange(end=paddle.shape(input_ids)[1], - dtype="float32"), input_ids) - + paddle.arange(end=paddle.shape(inputs_sample)[1], + dtype="float32"), inputs_sample) - num_pad).astype("int64") position_ids.stop_gradient = True position_embeddings = self.position_embeddings(position_ids) @@ -269,7 +287,7 @@ def forward(self, input_ids, token_type_ids=None, position_ids=None): token_type_ids.stop_gradient = True token_type_embeddings = 
self.token_type_embeddings(token_type_ids) - embeddings = input_embedings + position_embeddings + token_type_embeddings + embeddings = input_embeddings + position_embeddings + token_type_embeddings return embeddings @@ -407,12 +425,13 @@ def set_input_embeddings(self, value): self.embeddings.word_embeddings = value def forward(self, - input_ids, + input_ids=None, token_type_ids=None, position_ids=None, attention_mask=None, use_cache=False, cache=None, + inputs_embeds=None, output_attentions=False, output_hidden_states=False, return_dict=False): @@ -458,6 +477,11 @@ def forward(self, method. See :meth:`paddle.nn.TransformerEncoder.gen_cache` method for more details. It is only used for inference and should be None for training. Defaults to `None`. + inputs_embeds (Tensor, optional): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation + of shape `(batch_size, sequence_length, hidden_size)`. This is useful if you want more control over + how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + Default to None. output_attentions (bool, optional): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. Defaults to `False`. @@ -489,13 +513,19 @@ def forward(self, inputs = tokenizer.gen_encode("Welcome to use PaddlePaddle and PaddleNLP!", return_tensors=True) outputs = model(**inputs) """ + if attention_mask is None: - attention_mask = ((input_ids == self.pad_token_id).astype( - paddle.get_default_dtype()) * -1e4).unsqueeze([1, 2]) + if input_ids is not None: + attention_mask = ((input_ids == self.pad_token_id).astype( + paddle.get_default_dtype()) * -1e4).unsqueeze([1, 2]) + else: + logger.warning("provided inputs_embeds without attention_mask") + + if attention_mask is not None: attention_mask.stop_gradient = True embedding_output = self.embeddings(input_ids, token_type_ids, - position_ids) + position_ids, inputs_embeds) embedding_output = self.encoder_norm(embedding_output) embedding_output = self.dropout(embedding_output) @@ -566,13 +596,14 @@ def __init__(self, unimo): self.apply(self.init_weights) def forward(self, - input_ids, + input_ids=None, token_type_ids=None, position_ids=None, attention_mask=None, masked_positions=None, use_cache=False, cache=None, + inputs_embeds=None, labels=None, output_attentions=False, output_hidden_states=False, @@ -594,6 +625,8 @@ def forward(self, See :class:`UNIMOModel`. cache (list, optional): See :class:`UNIMOModel`. + inputs_embeds (Tensor, optional): + See :class:`UNIMOModel`. labels (Tensor, optional): Labels for computing the left-to-right language modeling loss. 
Indices should be in `[-100, 0, ..., vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are @@ -638,13 +671,15 @@ def forward(self, attention_mask, use_cache, cache, + inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) - + input_type = type(input_ids) if input_ids is not None else type( + inputs_embeds) sequence_output = outputs if isinstance(outputs, - type(input_ids)) else outputs[0] + input_type) else outputs[0] logits = self.lm_head(sequence_output, masked_positions) @@ -656,7 +691,7 @@ def forward(self, labels.reshape((-1, ))) if not return_dict: - if isinstance(outputs, type(input_ids)): + if isinstance(outputs, input_type): return (lm_loss, logits) if lm_loss is not None else logits else: outputs = (logits, ) + outputs[1:] diff --git a/tests/transformers/unimo/test_modeling.py b/tests/transformers/unimo/test_modeling.py index 5c9a0a5c2af8..4378ab272c1f 100644 --- a/tests/transformers/unimo/test_modeling.py +++ b/tests/transformers/unimo/test_modeling.py @@ -444,6 +444,7 @@ class UNIMOModelTest(ModelTesterMixin, GenerationTesterMixin, use_labels = False return_dict = False + use_test_inputs_embeds = True # special case for DoubleHeads model def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): From 89e1922c20ebb1efd5c3541ed5fe4e02e741c887 Mon Sep 17 00:00:00 2001 From: Yam0214 Date: Wed, 16 Nov 2022 09:22:53 +0000 Subject: [PATCH 06/10] add inputs_embeds to unified --- paddlenlp/transformers/codegen/modeling.py | 4 +- .../unified_transformer/modeling.py | 64 ++++++++++++++----- tests/transformers/mbart/test_modeling.py | 2 +- .../unified_transformer/test_modeling.py | 2 +- 4 files changed, 52 insertions(+), 20 deletions(-) diff --git a/paddlenlp/transformers/codegen/modeling.py b/paddlenlp/transformers/codegen/modeling.py index 04526a75672b..fd95d781c10b 100644 --- a/paddlenlp/transformers/codegen/modeling.py +++ b/paddlenlp/transformers/codegen/modeling.py @@ -519,9 +519,7 @@ def forward( else: past_length = cache[0][0].shape[-2] - -#TODO SLOW TEST -# Attention mask. + # Attention mask. if attention_mask is None: if input_ids is not None: if batch_size == 1 and past_length != 0: diff --git a/paddlenlp/transformers/unified_transformer/modeling.py b/paddlenlp/transformers/unified_transformer/modeling.py index 696f2bf8ac89..de81c2e795e8 100644 --- a/paddlenlp/transformers/unified_transformer/modeling.py +++ b/paddlenlp/transformers/unified_transformer/modeling.py @@ -17,7 +17,7 @@ import paddle.nn as nn import paddle.nn.functional as F from paddle.nn import TransformerEncoder - +from ...utils.log import logger from .. 
import PretrainedModel, register_base_model from ..model_outputs import CausalLMOutputWithCrossAttentions @@ -177,28 +177,44 @@ def forward(self, input_ids, token_type_ids=None, position_ids=None, - role_ids=None): + role_ids=None, + input_embeddings=None): + if input_ids is None and input_embeddings is None: + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time" + ) + elif input_ids is not None: + inputs_sample = input_ids + elif input_embeddings is not None: + inputs_sample = input_embeddings[:, :, -1] + else: + raise ValueError( + "You have to specify either input_ids or inputs_embeds") + if input_embeddings is None: + input_embeddings = self.word_embeddings(input_ids) + if position_ids is None: if self.pad_token_id is None: position_ids = paddle.expand_as( - paddle.arange(end=paddle.shape(input_ids)[1], - dtype="int64"), input_ids) + paddle.arange(end=paddle.shape(inputs_sample)[1], + dtype="int64"), inputs_sample) else: # NOTE: If there is a unk_token_id in input_ids, the following logic is wrong. # In that case, the position_ids must be provided. # And this is for left padding input_ids. + assert input_ids is not None, "position_ids or pad_token_ids" \ + " should be provided when input_embedds is specified" num_pad = paddle.sum( (input_ids == self.pad_token_id).astype("float32"), axis=-1, keepdim=True) position_ids = F.relu( paddle.expand_as( - paddle.arange(end=paddle.shape(input_ids)[1], - dtype="float32"), input_ids) - + paddle.arange(end=paddle.shape(inputs_sample)[1], + dtype="float32"), inputs_sample) - num_pad).astype("int64") position_ids.stop_gradient = True - input_embedings = self.word_embeddings(input_ids) position_embeddings = self.position_embeddings(position_ids) if token_type_ids is None: @@ -206,7 +222,7 @@ def forward(self, token_type_ids.stop_gradient = True token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = input_embedings + position_embeddings + token_type_embeddings + embeddings = input_embeddings + position_embeddings + token_type_embeddings # A model with role_embeddings can generate without role_ids. if role_ids is not None: embeddings += self.role_embeddings(role_ids) @@ -338,13 +354,14 @@ def set_input_embeddings(self, value): self.embeddings.word_embeddings = value def forward(self, - input_ids, + input_ids=None, token_type_ids=None, position_ids=None, attention_mask=None, use_cache=False, cache=None, role_ids=None, + inputs_embeds=None, output_attentions=False, output_hidden_states=False, return_dict=False): @@ -396,6 +413,11 @@ def forward(self, Indices of role ids indicated different roles. It's data type should be `int64` and has a shape of [batch_size, sequence_length]. Defaults to None. + inputs_embeds (Tensor, optional): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation + of shape `(batch_size, sequence_length, hidden_size)`. This is useful if you want more control over + how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + Default to None. output_attentions (bool, optional): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. Defaults to `False`. 
@@ -432,15 +454,21 @@ def forward(self, is_split_into_words=False) outputs = model(**inputs) """ + if attention_mask is None: - attention_mask = ((input_ids == self.pad_token_id).astype( - paddle.get_default_dtype()) * -1e4).unsqueeze([1, 2]) + if input_ids is not None: + attention_mask = ((input_ids == self.pad_token_id).astype( + paddle.get_default_dtype()) * -1e4).unsqueeze([1, 2]) + else: + logger.warning("provided inputs_embeds without attention_mask") + if attention_mask is not None: attention_mask.stop_gradient = True embedding_output = self.embeddings(input_ids, token_type_ids, position_ids, - role_ids=role_ids) + role_ids=role_ids, + input_embeddings=inputs_embeds) if use_cache and cache is None: cache = self.encoder.gen_cache(embedding_output) @@ -509,7 +537,7 @@ def __init__(self, unified_transformer): self.apply(self.init_weights) def forward(self, - input_ids, + input_ids=None, token_type_ids=None, position_ids=None, attention_mask=None, @@ -518,6 +546,7 @@ def forward(self, cache=None, role_ids=None, labels=None, + inputs_embeds=None, output_attentions=False, output_hidden_states=False, return_dict=False): @@ -544,6 +573,8 @@ def forward(self, Labels for computing the left-to-right language modeling loss. Indices should be in `[-100, 0, ..., vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., vocab_size]` + inputs_embeds (Tensor, optional): + See :class:`UnifiedTransformerModel`. output_attentions (bool, optional): See :class: `UnifiedTransformerModel` output_hidden_states (bool, optional): @@ -586,12 +617,15 @@ def forward(self, use_cache, cache, role_ids=role_ids, + inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) + input_type = type(input_ids) if input_ids is not None else type( + inputs_embeds) sequence_output = outputs if isinstance(outputs, - type(input_ids)) else outputs[0] + input_type) else outputs[0] logits = self.lm_head(sequence_output, masked_positions) lm_loss = None @@ -600,7 +634,7 @@ def forward(self, lm_loss = loss_fct(logits.reshape((-1, logits.shape[-1])), labels.reshape([-1])) if not return_dict: - if isinstance(outputs, type(input_ids)): + if isinstance(outputs, input_type): return (lm_loss, logits) if lm_loss is not None else logits else: outputs = (logits, ) + outputs[1:] diff --git a/tests/transformers/mbart/test_modeling.py b/tests/transformers/mbart/test_modeling.py index a954ca5ddef2..982da6b5d9f4 100644 --- a/tests/transformers/mbart/test_modeling.py +++ b/tests/transformers/mbart/test_modeling.py @@ -253,7 +253,7 @@ def test_decoder_model_past_with_large_inputs(self): *config_and_inputs) def test_inputs_embeds_for_mbart(self): - # rewrite test inputs embeds for mbart model since scaler not equal to 1.0 + # NOTE: rewrite test inputs embeds for mbart model since scaler not equal to 1.0 # get config for model and inputs_dict for model forward config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( ) diff --git a/tests/transformers/unified_transformer/test_modeling.py b/tests/transformers/unified_transformer/test_modeling.py index dae6ef41fa92..3759a08633ed 100644 --- a/tests/transformers/unified_transformer/test_modeling.py +++ b/tests/transformers/unified_transformer/test_modeling.py @@ -443,7 +443,7 @@ class UnifiedTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, (UnifiedTransformerModel, "unified_transformer") } 
test_missing_keys = False - + use_test_inputs_embeds = True use_labels = False return_dict = False From 3ffba1babd37fc7fde6146e479ffee4bffb997b2 Mon Sep 17 00:00:00 2001 From: Yam0214 Date: Wed, 16 Nov 2022 11:46:01 +0000 Subject: [PATCH 07/10] change assertion to warning with default position_ids --- .../unified_transformer/modeling.py | 32 +++++++++++-------- paddlenlp/transformers/unimo/modeling.py | 26 +++++++++------ 2 files changed, 35 insertions(+), 23 deletions(-) diff --git a/paddlenlp/transformers/unified_transformer/modeling.py b/paddlenlp/transformers/unified_transformer/modeling.py index de81c2e795e8..148e7fd719dc 100644 --- a/paddlenlp/transformers/unified_transformer/modeling.py +++ b/paddlenlp/transformers/unified_transformer/modeling.py @@ -199,20 +199,26 @@ def forward(self, paddle.arange(end=paddle.shape(inputs_sample)[1], dtype="int64"), inputs_sample) else: - # NOTE: If there is a unk_token_id in input_ids, the following logic is wrong. - # In that case, the position_ids must be provided. - # And this is for left padding input_ids. - assert input_ids is not None, "position_ids or pad_token_ids" \ - " should be provided when input_embedds is specified" - num_pad = paddle.sum( - (input_ids == self.pad_token_id).astype("float32"), - axis=-1, - keepdim=True) - position_ids = F.relu( - paddle.expand_as( + if input_ids is not None: + # NOTE: If there is a unk_token_id in input_ids, the following logic is wrong. + # In that case, the position_ids must be provided. + # And this is for left padding input_ids. + num_pad = paddle.sum( + (input_ids == self.pad_token_id).astype("float32"), + axis=-1, + keepdim=True) + position_ids = F.relu( + paddle.expand_as( + paddle.arange(end=paddle.shape(inputs_sample)[1], + dtype="float32"), inputs_sample) - + num_pad).astype("int64") + else: + logger.warning( + "position_ids or pad_token_ids should be provided when input_embeds is specified, otherwise an unexpected result may be returned" + ) + position_ids = paddle.expand_as( paddle.arange(end=paddle.shape(inputs_sample)[1], - dtype="float32"), inputs_sample) - - num_pad).astype("int64") + dtype="int64"), inputs_sample) position_ids.stop_gradient = True position_embeddings = self.position_embeddings(position_ids) diff --git a/paddlenlp/transformers/unimo/modeling.py b/paddlenlp/transformers/unimo/modeling.py index 64d28c3f9296..a7bee7ebe3e2 100644 --- a/paddlenlp/transformers/unimo/modeling.py +++ b/paddlenlp/transformers/unimo/modeling.py @@ -268,17 +268,23 @@ def forward(self, paddle.arange(end=paddle.shape(inputs_sample)[1], dtype="int64"), inputs_sample) else: - assert input_ids is not None, "position_ids or pad_token_ids" \ - " should be provided when input_embedds is specified" - num_pad = paddle.sum( - (input_ids == self.pad_token_id).astype("float32"), - axis=-1, - keepdim=True) - position_ids = F.relu( - paddle.expand_as( + if input_ids is not None: + num_pad = paddle.sum( + (input_ids == self.pad_token_id).astype("float32"), + axis=-1, + keepdim=True) + position_ids = F.relu( + paddle.expand_as( + paddle.arange(end=paddle.shape(inputs_sample)[1], + dtype="float32"), inputs_sample) - + num_pad).astype("int64") + else: + logger.warning( + "position_ids or pad_token_ids should be provided when input_embeds is specified, otherwise an unexpected result may be returned" + ) + position_ids = paddle.expand_as( paddle.arange(end=paddle.shape(inputs_sample)[1], - dtype="float32"), inputs_sample) - - num_pad).astype("int64") + dtype="int64"), inputs_sample) position_ids.stop_gradient = True 
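Patch 07 keeps the relu(arange - num_pad) scheme above whenever input_ids is available, and only falls back to a plain arange (with a warning) when the caller provides nothing but inputs_embeds. On a left-padded toy batch the scheme behaves as follows; pad_token_id = 0 is an assumption for illustration:

    import paddle
    import paddle.nn.functional as F

    pad_token_id = 0
    input_ids = paddle.to_tensor([[0, 0, 11, 12, 13],
                                  [21, 22, 23, 24, 25]])

    # Count left pads per row, then shift the positions down by that count.
    num_pad = paddle.sum((input_ids == pad_token_id).astype("float32"),
                         axis=-1, keepdim=True)
    position_ids = F.relu(
        paddle.expand_as(
            paddle.arange(end=paddle.shape(input_ids)[1], dtype="float32"),
            input_ids) - num_pad).astype("int64")
    # position_ids -> [[0, 0, 0, 1, 2],
    #                  [0, 1, 2, 3, 4]]
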
position_embeddings = self.position_embeddings(position_ids) From 15112d891c5bf3609181ff43dd8aba3b60c7b416 Mon Sep 17 00:00:00 2001 From: Yam0214 Date: Mon, 5 Dec 2022 04:56:06 +0000 Subject: [PATCH 08/10] check code style --- tests/transformers/bart/test_modeling.py | 315 +++++------ tests/transformers/codegen/test_modeling.py | 369 +++++-------- tests/transformers/mbart/test_modeling.py | 364 +++++-------- .../unified_transformer/test_modeling.py | 512 ++++++++---------- tests/transformers/unimo/test_modeling.py | 469 +++++++--------- 5 files changed, 834 insertions(+), 1195 deletions(-) diff --git a/tests/transformers/bart/test_modeling.py b/tests/transformers/bart/test_modeling.py index 665b01bfc0e1..dfec8a7fca31 100644 --- a/tests/transformers/bart/test_modeling.py +++ b/tests/transformers/bart/test_modeling.py @@ -13,30 +13,29 @@ # See the License for the specific language governing permissions and # limitations under the License. -import copy -import tempfile -import unittest -import numpy as np import random -from parameterized import parameterized_class - -from tests.testing_utils import slow - -from ..test_generation_utils import GenerationTesterMixin -from ..test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -from paddlenlp.transformers.tokenizer_utils_base import PaddingStrategy, TruncationStrategy +import unittest +import numpy as np import paddle +from parameterized import parameterized_class from paddlenlp.transformers import ( - AutoModelForSequenceClassification, BartForConditionalGeneration, BartForQuestionAnswering, BartForSequenceClassification, BartModel, BartTokenizer, ) -from paddlenlp.transformers.bart.modeling import BartDecoder, BartEncoder, shift_tokens_right +from paddlenlp.transformers.bart.modeling import shift_tokens_right +from paddlenlp.transformers.tokenizer_utils_base import ( + PaddingStrategy, + TruncationStrategy, +) +from tests.testing_utils import slow + +from ..test_generation_utils import GenerationTesterMixin +from ..test_modeling_common import ModelTesterMixin, ids_tensor def prepare_bart_inputs_dict( @@ -50,13 +49,16 @@ def prepare_bart_inputs_dict( cross_attn_head_mask=None, ): if attention_mask is None: - attention_mask = paddle.cast( - input_ids == config["pad_token_id"], - dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e4 + attention_mask = ( + paddle.cast(input_ids == config["pad_token_id"], dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e4 + ) if decoder_attention_mask is None: - decoder_attention_mask = paddle.cast( - decoder_input_ids == config["pad_token_id"], - dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e4 + decoder_attention_mask = ( + paddle.cast(decoder_input_ids == config["pad_token_id"], dtype=paddle.get_default_dtype()).unsqueeze( + [1, 2] + ) + * -1e4 + ) return { "input_ids": input_ids, "decoder_input_ids": decoder_input_ids, @@ -66,7 +68,6 @@ def prepare_bart_inputs_dict( class BartModelTester: - def __init__( self, parent, @@ -110,22 +111,14 @@ def __init__( self.forced_eos_token_id = None def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], - self.vocab_size, - dtype="int64") - input_ids = paddle.clip( - ids_tensor([self.batch_size, self.seq_length], - self.vocab_size, - dtype="int64"), 3) + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size, dtype="int64") + input_ids = paddle.clip(ids_tensor([self.batch_size, self.seq_length], self.vocab_size, dtype="int64"), 3) input_ids[:, -1] = self.eos_token_id # Eos 
Token - decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], - self.vocab_size, - dtype="int64") + decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size, dtype="int64") config = self.get_config() - inputs_dict = prepare_bart_inputs_dict(config, input_ids, - decoder_input_ids) + inputs_dict = prepare_bart_inputs_dict(config, input_ids, decoder_input_ids) return config, inputs_dict def get_config(self): @@ -151,8 +144,7 @@ def prepare_config_and_inputs_for_common(self): config, inputs_dict = self.prepare_config_and_inputs() return config, inputs_dict - def create_and_check_decoder_model_past_large_inputs( - self, config, inputs_dict): + def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): encoder = BartModel(**config).get_encoder() decoder = BartModel(**config).get_decoder() @@ -160,80 +152,68 @@ def create_and_check_decoder_model_past_large_inputs( decoder.eval() input_ids = inputs_dict["input_ids"] - decoder_input_ids = paddle.zeros_like( - input_ids[:, :1], - dtype="int64") + BartModel(**config).decoder_start_token_id + decoder_input_ids = ( + paddle.zeros_like(input_ids[:, :1], dtype="int64") + BartModel(**config).decoder_start_token_id + ) attention_mask = inputs_dict["attention_mask"] - decoder_attention_mask = paddle.zeros([input_ids.shape[0], 1, 1, 1], - dtype=paddle.get_default_dtype()) + decoder_attention_mask = paddle.zeros([input_ids.shape[0], 1, 1, 1], dtype=paddle.get_default_dtype()) - encoder_output = encoder(input_ids, - attention_mask, - return_dict=self.parent.return_dict) + encoder_output = encoder(input_ids, attention_mask, return_dict=self.parent.return_dict) origin_cache = decoder.decoder.gen_cache(encoder_output) - outputs = decoder(decoder_input_ids, - decoder_attention_mask, - encoder_output, - attention_mask, - cache=origin_cache, - return_dict=self.parent.return_dict) + outputs = decoder( + decoder_input_ids, + decoder_attention_mask, + encoder_output, + attention_mask, + cache=origin_cache, + return_dict=self.parent.return_dict, + ) output, cache = outputs[:2] # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), - config["vocab_size"], - dtype="int64") - next_attn_mask = paddle.zeros([self.batch_size, 1, 1, 3], - dtype=paddle.get_default_dtype()) + next_tokens = ids_tensor((self.batch_size, 3), config["vocab_size"], dtype="int64") + next_attn_mask = paddle.zeros([self.batch_size, 1, 1, 3], dtype=paddle.get_default_dtype()) # append to next input_ids and - next_input_ids = paddle.concat([decoder_input_ids, next_tokens], - axis=-1) - next_attention_mask = paddle.concat( - [decoder_attention_mask, next_attn_mask], axis=-1) - - output_from_no_past = decoder(next_input_ids, - next_attention_mask, - encoder_output, - attention_mask, - return_dict=self.parent.return_dict) + next_input_ids = paddle.concat([decoder_input_ids, next_tokens], axis=-1) + next_attention_mask = paddle.concat([decoder_attention_mask, next_attn_mask], axis=-1) + + output_from_no_past = decoder( + next_input_ids, next_attention_mask, encoder_output, attention_mask, return_dict=self.parent.return_dict + ) if self.parent.return_dict: output_from_no_past = output_from_no_past[0] - output_from_past, _ = decoder(next_tokens, - next_attention_mask, - encoder_output, - attention_mask, - cache=cache, - return_dict=self.parent.return_dict)[:2] + output_from_past, _ = decoder( + next_tokens, + next_attention_mask, + encoder_output, + attention_mask, + cache=cache, + 
return_dict=self.parent.return_dict, + )[:2] # select random slice - random_slice_idx = ids_tensor((1, ), - output_from_past.shape[-1], - dtype="int64").item() - output_from_no_past_slice = output_from_no_past[:, -3:, - random_slice_idx].detach( - ) - output_from_past_slice = output_from_past[:, :, - random_slice_idx].detach() - - self.parent.assertTrue( - output_from_past_slice.shape[1] == next_tokens.shape[1]) + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1], dtype="int64").item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) # test that outputs are equal for slice - self.parent.assertTrue( - paddle.allclose(output_from_past_slice, - output_from_no_past_slice, - atol=1e-3)) - - -@parameterized_class(("return_dict", "use_labels"), [ - [False, False], - [False, True], - [True, False], - [True, True], -]) + self.parent.assertTrue(paddle.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + +@parameterized_class( + ("return_dict", "use_labels"), + [ + [False, False], + [False, True], + [True, False], + [True, True], + ], +) class BartHeadTests(unittest.TestCase): vocab_size = 99 use_labels = False @@ -282,10 +262,7 @@ def test_sequence_classification_forward(self): num_labels = 2 labels = _long_tensor([1] * batch_size) if self.use_labels else None model = BartForSequenceClassification(bart_model, num_labels=num_labels) - outputs = model(input_ids=input_ids, - decoder_input_ids=input_ids, - labels=labels, - return_dict=self.return_dict) + outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=labels, return_dict=self.return_dict) expected_shape = [batch_size, num_labels] if self.use_labels: self.assertIsInstance(outputs[0].item(), float) # test loss @@ -297,14 +274,15 @@ def test_sequence_classification_forward(self): def test_question_answering_forward(self): config, input_ids, batch_size = self._get_config_and_data() - sequence_labels = ids_tensor([batch_size], - 2) if self.use_labels else None + sequence_labels = ids_tensor([batch_size], 2) if self.use_labels else None bart_model = BartModel(**config) model = BartForQuestionAnswering(bart_model) - outputs = model(input_ids=input_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - return_dict=self.return_dict) + outputs = model( + input_ids=input_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + return_dict=self.return_dict, + ) if self.use_labels: loss, start_logits, end_logits = outputs[:3] @@ -317,12 +295,9 @@ def test_question_answering_forward(self): def test_lm_forward(self): config, input_ids, batch_size = self._get_config_and_data() bart_model = BartModel(**config) - lm_labels = ids_tensor([batch_size, input_ids.shape[1]], - self.vocab_size) if self.use_labels else None + lm_labels = ids_tensor([batch_size, input_ids.shape[1]], self.vocab_size) if self.use_labels else None lm_model = BartForConditionalGeneration(bart_model) - outputs = lm_model(input_ids=input_ids, - labels=lm_labels, - return_dict=self.return_dict) + outputs = lm_model(input_ids=input_ids, labels=lm_labels, return_dict=self.return_dict) expected_shape = [batch_size, input_ids.shape[1], config["vocab_size"]] if self.use_labels: self.assertIsInstance(outputs[0].item(), float) @@ -346,15 +321,14 @@ def test_lm_uneven_forward(self): } bart_model = BartModel(**config) 
lm_model = BartForConditionalGeneration(bart_model) - context = paddle.to_tensor( - [[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]], - dtype="int64") - summary = paddle.to_tensor([[82, 71, 82, 18, 2], [58, 68, 2, 1, 1]], - dtype="int64") - outputs = lm_model(input_ids=context, - decoder_input_ids=summary, - labels=summary if self.use_labels else None, - return_dict=self.return_dict) + context = paddle.to_tensor([[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]], dtype="int64") + summary = paddle.to_tensor([[82, 71, 82, 18, 2], [58, 68, 2, 1, 1]], dtype="int64") + outputs = lm_model( + input_ids=context, + decoder_input_ids=summary, + labels=summary if self.use_labels else None, + return_dict=self.return_dict, + ) expected_shape = summary.shape expected_shape.append(config["vocab_size"]) if self.use_labels: @@ -395,9 +369,7 @@ def test_generate_beam_search(self): self.assertEqual(generated_ids.shape, [input_ids.shape[0], max_length]) def test_shift_tokens_right(self): - input_ids = paddle.to_tensor( - [[71, 82, 18, 33, 2, 1, 1], [68, 34, 26, 58, 30, 82, 2]], - dtype="int64") + input_ids = paddle.to_tensor([[71, 82, 18, 33, 2, 1, 1], [68, 34, 26, 58, 30, 82, 2]], dtype="int64") shifted = shift_tokens_right(input_ids, 2) n_pad_before = paddle.equal(input_ids, 1).sum().numpy() n_pad_after = paddle.equal(shifted, 1).sum().numpy() @@ -408,28 +380,27 @@ def test_shift_tokens_right(self): @slow def test_tokenization(self): tokenizer = BartTokenizer.from_pretrained("bart-large") - examples = [" Hello world", - " DomDramg"] # need leading spaces for equality + examples = [" Hello world", " DomDramg"] # need leading spaces for equality fairseq_results = [ paddle.to_tensor([0, 20920, 232, 2]), paddle.to_tensor([0, 11349, 495, 4040, 571, 2]), ] for ex, desired_result in zip(examples, fairseq_results): - bart_toks = tokenizer.encode( - ex, return_tensors="pd")["input_ids"].squeeze() + bart_toks = tokenizer.encode(ex, return_tensors="pd")["input_ids"].squeeze() assert_tensors_close(desired_result, bart_toks, prefix=ex) class BartModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): base_model_class = BartModel - all_model_classes = (BartModel, BartForConditionalGeneration, - BartForSequenceClassification, - BartForQuestionAnswering) + all_model_classes = ( + BartModel, + BartForConditionalGeneration, + BartForSequenceClassification, + BartForQuestionAnswering, + ) - all_generative_model_classes = { - BartForConditionalGeneration: (BartModel, "bart") - } + all_generative_model_classes = {BartForConditionalGeneration: (BartModel, "bart")} is_encoder_decoder = True fx_compatible = True test_pruning = False @@ -446,8 +417,7 @@ def setUp(self): def test_decoder_model_past_with_large_inputs(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past_large_inputs( - *config_and_inputs) + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) def assert_tensors_close(a, b, atol=1e-12, prefix=""): @@ -524,17 +494,12 @@ def test_bart_base_generation(self): " will include alleged war crimes committed since June. The International Criminal Court was set up in" " 2002 to prosecute genocide, crimes against humanity and war crimes." ) - EXPECTED = ( - 'The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. 
The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians\' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday\'s ceremony, said it was a move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice," he said, according to an ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice," he said, according to an ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Statute. These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights Watch welcomed the development. "Governments seeking to penalize Palestine for joining the ICC should immediately end their pressure, and countries that support universal acceptance of the court\'s treaty should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the group. "What\'s objectionable is the attempts to undermine international justice, not Palestine\'s decision to join a treaty to which over 100 countries around the world are members." In January, when the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an outrage, saying the court was overstepping its boundaries. The United States also said it "strongly" disagreed with the court\'s decision. "As we have said repeatedly, we do not believe that Palestine is a state and therefore we do not believe that it is eligible to join the ICC," the State Department said in a statement. It urged the warring sides to resolve their differences through direct negotiations. "We will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace," it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the court to review evidence and determine whether to investigate suspects on both sides. 
Prosecutor Fatou Bensouda said her office would "conduct its analysis in full independence and impartiality." The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry will include alleged war crimes committed since June. The International Criminal Court was set up in 2002 to prosecute genocide, crimes against humanity and war crimes.' - ) + EXPECTED = 'The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians\' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday\'s ceremony, said it was a move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice," he said, according to an ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice," he said, according to an ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Statute. These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights Watch welcomed the development. "Governments seeking to penalize Palestine for joining the ICC should immediately end their pressure, and countries that support universal acceptance of the court\'s treaty should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the group. "What\'s objectionable is the attempts to undermine international justice, not Palestine\'s decision to join a treaty to which over 100 countries around the world are members." In January, when the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an outrage, saying the court was overstepping its boundaries. The United States also said it "strongly" disagreed with the court\'s decision. 
"As we have said repeatedly, we do not believe that Palestine is a state and therefore we do not believe that it is eligible to join the ICC," the State Department said in a statement. It urged the warring sides to resolve their differences through direct negotiations. "We will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace," it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou Bensouda said her office would "conduct its analysis in full independence and impartiality." The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry will include alleged war crimes committed since June. The International Criminal Court was set up in 2002 to prosecute genocide, crimes against humanity and war crimes.' dct = tok(ARTICLE, return_tensors="pd") dct.pop("token_type_ids") - generated_ids, _ = model.generate(**dct, - num_beams=4, - decode_strategy="beam_search", - max_length=1024) + generated_ids, _ = model.generate(**dct, num_beams=4, decode_strategy="beam_search", max_length=1024) result = tok.batch_decode(generated_ids, skip_special_tokens=True)[0] assert EXPECTED == result, f"{EXPECTED}\n{result}" @@ -660,23 +625,19 @@ def test_xsum_1_1_batch_generation(self): model = self.bart_base() model.eval() - generated_ids, _ = model.generate(**batch, - num_beams=4, - decode_strategy="beam_search") - result = self.tok().batch_decode(generated_ids, - skip_special_tokens=True) + generated_ids, _ = model.generate(**batch, num_beams=4, decode_strategy="beam_search") + result = self.tok().batch_decode(generated_ids, skip_special_tokens=True) assert ( - result[0] == - "The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a" + result[0] + == "The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a" ) assert ( - result[1] == - "The French prosecutor leading an investigation into the crash of Germanwings Flight 9525 insisted Wednesday that" + result[1] + == "The French prosecutor leading an investigation into the crash of Germanwings Flight 9525 insisted Wednesday that" ) class BartModelIntegrationTests(unittest.TestCase): - def default_tokenizer(self): return BartTokenizer.from_pretrained("bart-large") @@ -684,13 +645,12 @@ def default_tokenizer(self): def test_inference_no_head(self): model = BartModel.from_pretrained("bart-large") model.eval() - input_ids = paddle.to_tensor( - [[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]], - dtype="int64") + input_ids = paddle.to_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]], dtype="int64") - attention_mask = paddle.cast( - input_ids == model.config["pad_token_id"], - dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e4 + attention_mask = ( + paddle.cast(input_ids == model.config["pad_token_id"], dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) + * -1e4 + ) with paddle.no_grad(): output = model(input_ids=input_ids, attention_mask=attention_mask) expected_shape = [1, 11, 1024] @@ -774,7 +734,8 @@ def test_cnn_summarization_same_as_fairseq(self): ' problems." Germanwings crash compensation: What we know . Who was the captain of Germanwings Flight' " 9525? 
CNN's Margot Haddad reported from Marseille and Pamela Brown from Dusseldorf, while Laura" " Smith-Spark wrote from London. CNN's Frederik Pleitgen, Pamela Boykoff, Antonia Mortensen, Sandrine" - " Amiel and Anna-Maja Rappard contributed to this report.") + " Amiel and Anna-Maja Rappard contributed to this report." + ) SHORTER_ARTICLE = ( " (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on" @@ -812,7 +773,8 @@ def test_cnn_summarization_same_as_fairseq(self): " between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry" " will include alleged war crimes committed since June. The International Criminal Court was set up in" " 2002 to prosecute genocide, crimes against humanity and war crimes. CNN's Vasco Cotovio, Kareem Khadder" - " and Faith Karimi contributed to this report.") + " and Faith Karimi contributed to this report." + ) # The below article tests that we don't add any hypotheses outside of the top n_beams IRAN_ARTICLE = ( @@ -911,25 +873,22 @@ def test_cnn_summarization_same_as_fairseq(self): max_length=1024, ) - EXPECTED = [ - "A French prosecutor says he is not aware of any video footage from on board the plane. Two German " - "magazines claim to have found a cell phone video showing the crash. The publications say they watched " - "the video, which was found by a source close to the investigation. All 150 on board Germanwings Flight " - "9525 were killed.", - "Palestinian Authority becomes 123rd member of the International Criminal Court. The move gives the court " - "jurisdiction over alleged crimes in Palestinian territories. Israel and the United States opposed the " - "Palestinians' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki said it was a " - "move toward greater justice.", - "U.S. and its negotiating partners reached a strong framework agreement with Iran. Peter Bergen: The " - "debate that has already begun will likely result in more heat than light. He says critics have made " - "dubious assumptions and doubtful assertions. Bergen says the goal was to block Iran from building a " - "nuclear weapon.", - "Liana Barrientos, 39, has been married 10 times, sometimes within two weeks of each other. Prosecutors " - "say the marriages were part of an immigration scam. She pleaded not guilty at State Supreme Court in the " - "Bronx on Friday. If convicted, she faces up to four years in prison.", - ] - - generated_summaries = tok.batch_decode( - hypotheses_batch.tolist(), - clean_up_tokenization_spaces=True, - skip_special_tokens=True) + # EXPECTED = [ + # "A French prosecutor says he is not aware of any video footage from on board the plane. Two German " + # "magazines claim to have found a cell phone video showing the crash. The publications say they watched " + # "the video, which was found by a source close to the investigation. All 150 on board Germanwings Flight " + # "9525 were killed.", + # "Palestinian Authority becomes 123rd member of the International Criminal Court. The move gives the court " + # "jurisdiction over alleged crimes in Palestinian territories. Israel and the United States opposed the " + # "Palestinians' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki said it was a " + # "move toward greater justice.", + # "U.S. and its negotiating partners reached a strong framework agreement with Iran. Peter Bergen: The " + # "debate that has already begun will likely result in more heat than light. 
He says critics have made " + # "dubious assumptions and doubtful assertions. Bergen says the goal was to block Iran from building a " + # "nuclear weapon.", + # "Liana Barrientos, 39, has been married 10 times, sometimes within two weeks of each other. Prosecutors " + # "say the marriages were part of an immigration scam. She pleaded not guilty at State Supreme Court in the " + # "Bronx on Friday. If convicted, she faces up to four years in prison.", + # ] + + tok.batch_decode(hypotheses_batch.tolist(), clean_up_tokenization_spaces=True, skip_special_tokens=True) diff --git a/tests/transformers/codegen/test_modeling.py b/tests/transformers/codegen/test_modeling.py index 8e9913ee3325..0912486c8a91 100644 --- a/tests/transformers/codegen/test_modeling.py +++ b/tests/transformers/codegen/test_modeling.py @@ -13,20 +13,28 @@ # See the License for the specific language governing permissions and # limitations under the License. -import datetime -import unittest -import numpy as np import random +import unittest +import numpy as np import paddle -from paddlenlp.transformers import (CODEGEN_PRETRAINED_MODEL_ARCHIVE_LIST, - AutoTokenizer, CodeGenForCausalLM, - CodeGenModel, CodeGenTokenizer) -from ...testing_utils import slow +from parameterized import parameterized_class + +from paddlenlp.transformers import ( + CODEGEN_PRETRAINED_MODEL_ARCHIVE_LIST, + AutoTokenizer, + CodeGenForCausalLM, + CodeGenModel, +) +from ...testing_utils import slow from ..test_generation_utils import GenerationTesterMixin -from ..test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask -from parameterized import parameterized_class +from ..test_modeling_common import ( + ModelTesterMixin, + floats_tensor, + ids_tensor, + random_attention_mask, +) class CodeGenModelTester: @@ -87,34 +95,23 @@ def __init__( random.seed(128) def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], - self.vocab_size, - dtype="int64") + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size, dtype="int64") input_mask = None if self.use_input_mask: - input_mask = random_attention_mask( - [self.batch_size, self.seq_length], dtype="int64") + input_mask = random_attention_mask([self.batch_size, self.seq_length], dtype="int64") mc_token_ids = None if self.use_mc_token_ids: - mc_token_ids = ids_tensor([self.batch_size, self.num_choices], - self.seq_length, - dtype="int64") + mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length, dtype="int64") sequence_labels = None token_labels = None choice_labels = None if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], - self.type_sequence_label_size, - dtype="int64") - token_labels = ids_tensor([self.batch_size, self.seq_length], - self.num_labels, - dtype="int64") - choice_labels = ids_tensor([self.batch_size], - self.num_choices, - dtype="int64") + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size, dtype="int64") + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels, dtype="int64") + choice_labels = ids_tensor([self.batch_size], self.num_choices, dtype="int64") config = self.get_config() @@ -157,11 +154,8 @@ def prepare_config_and_inputs_for_decoder(self): choice_labels, ) = self.prepare_config_and_inputs() - encoder_hidden_states = floats_tensor( - [self.batch_size, self.seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], - vocab_size=2, - dtype="int64") + 
encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2, dtype="int64") return ( config, @@ -174,69 +168,45 @@ def prepare_config_and_inputs_for_decoder(self): encoder_attention_mask, ) - def create_and_check_codegen_model(self, config, input_ids, input_mask, - *args): + def create_and_check_codegen_model(self, config, input_ids, input_mask, *args): model = CodeGenModel(**config) model.eval() - result = model(input_ids, - use_cache=True, - return_dict=self.parent.return_dict) + result = model(input_ids, use_cache=True, return_dict=self.parent.return_dict) - self.parent.assertEqual( - result[0].shape, - [self.batch_size, self.seq_length, self.hidden_size]) + self.parent.assertEqual(result[0].shape, [self.batch_size, self.seq_length, self.hidden_size]) self.parent.assertEqual(len(result[1]), config["n_layer"]) - def create_and_check_codegen_model_past(self, config, input_ids, input_mask, - *args): + def create_and_check_codegen_model_past(self, config, input_ids, input_mask, *args): model = CodeGenModel(**config) model.eval() # first forward pass - outputs = model(input_ids, - use_cache=True, - return_dict=self.parent.return_dict) - outputs_no_past = model(input_ids, - use_cache=False, - return_dict=self.parent.return_dict) + outputs = model(input_ids, use_cache=True, return_dict=self.parent.return_dict) + outputs_no_past = model(input_ids, use_cache=False, return_dict=self.parent.return_dict) self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) output, past = outputs[:2] # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), - config["vocab_size"], - dtype="int64") + next_tokens = ids_tensor((self.batch_size, 1), config["vocab_size"], dtype="int64") # append to next input_ids next_input_ids = paddle.concat([input_ids, next_tokens], axis=-1) - output_from_no_past = model(next_input_ids, - return_dict=self.parent.return_dict)[0] - output_from_past = model(next_tokens, - cache=past, - return_dict=self.parent.return_dict)[0] + output_from_no_past = model(next_input_ids, return_dict=self.parent.return_dict)[0] + output_from_past = model(next_tokens, cache=past, return_dict=self.parent.return_dict)[0] # select random slice - random_slice_idx = ids_tensor((1, ), - output_from_past.shape[-1], - dtype="int64").item() - output_from_no_past_slice = output_from_no_past[:, -1, - random_slice_idx].detach( - ) - output_from_past_slice = output_from_past[:, 0, - random_slice_idx].detach() + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1], dtype="int64").item() + output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() # test that outputs are equal for slice - self.parent.assertTrue( - paddle.allclose(output_from_past_slice, - output_from_no_past_slice, - atol=1e-3)) + self.parent.assertTrue(paddle.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - def create_and_check_codegen_model_attention_mask_past( - self, config, input_ids, input_mask, *args): + def create_and_check_codegen_model_attention_mask_past(self, config, input_ids, input_mask, *args): model = CodeGenModel(**config) model.eval() @@ -246,136 +216,94 @@ def create_and_check_codegen_model_attention_mask_past( attn_mask[:, half_seq_length:] = 0 # first forward pass - output, past = model(input_ids, - 
attention_mask=attn_mask, - use_cache=True, - return_dict=self.parent.return_dict)[:2] + output, past = model(input_ids, attention_mask=attn_mask, use_cache=True, return_dict=self.parent.return_dict)[ + :2 + ] # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), - config["vocab_size"], - dtype="int64") + next_tokens = ids_tensor((self.batch_size, 1), config["vocab_size"], dtype="int64") # change a random masked slice from input_ids - random_seq_idx_to_change = ids_tensor( - (1, ), half_seq_length, dtype="int64").item() + 1 - random_other_next_tokens = ids_tensor((self.batch_size, 1), - config["vocab_size"], - dtype="int64").squeeze(-1) + random_seq_idx_to_change = ids_tensor((1,), half_seq_length, dtype="int64").item() + 1 + random_other_next_tokens = ids_tensor((self.batch_size, 1), config["vocab_size"], dtype="int64").squeeze(-1) input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens # append to next input_ids and attn_mask next_input_ids = paddle.concat([input_ids, next_tokens], axis=-1) attn_mask = paddle.concat( - [attn_mask, - paddle.ones((attn_mask.shape[0], 1), dtype="int64")], + [attn_mask, paddle.ones((attn_mask.shape[0], 1), dtype="int64")], axis=1, ) # get two different outputs - output_from_no_past = model(next_input_ids, - attention_mask=attn_mask, - return_dict=self.parent.return_dict)[0] - output_from_past = model(next_tokens, - cache=past, - attention_mask=attn_mask, - return_dict=self.parent.return_dict)[0] + output_from_no_past = model(next_input_ids, attention_mask=attn_mask, return_dict=self.parent.return_dict)[0] + output_from_past = model( + next_tokens, cache=past, attention_mask=attn_mask, return_dict=self.parent.return_dict + )[0] # select random slice - random_slice_idx = ids_tensor((1, ), - output_from_past.shape[-1], - dtype="int64").item() - output_from_no_past_slice = output_from_no_past[:, -1, - random_slice_idx].detach( - ) - output_from_past_slice = output_from_past[:, 0, - random_slice_idx].detach() + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1], dtype="int64").item() + output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() # test that outputs are equal for slice - self.parent.assertTrue( - paddle.allclose(output_from_past_slice, - output_from_no_past_slice, - atol=1e-3)) + self.parent.assertTrue(paddle.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - def create_and_check_codegen_model_past_large_inputs( - self, config, input_ids, input_mask, *args): + def create_and_check_codegen_model_past_large_inputs(self, config, input_ids, input_mask, *args): model = CodeGenModel(**config) model.eval() # first forward pass - outputs = model(input_ids, - attention_mask=input_mask, - use_cache=True, - return_dict=self.parent.return_dict) + outputs = model(input_ids, attention_mask=input_mask, use_cache=True, return_dict=self.parent.return_dict) output, past = outputs[:2] # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), - config["vocab_size"], - dtype="int64") - next_mask = ids_tensor((self.batch_size, 3), - vocab_size=2, - dtype="int64") + next_tokens = ids_tensor((self.batch_size, 3), config["vocab_size"], dtype="int64") + next_mask = ids_tensor((self.batch_size, 3), vocab_size=2, dtype="int64") # append to next input_ids next_input_ids = paddle.concat([input_ids, next_tokens], axis=-1) 
next_attention_mask = paddle.concat([input_mask, next_mask], axis=-1) - output_from_no_past = model(next_input_ids, - attention_mask=next_attention_mask, - return_dict=self.parent.return_dict)[0] - output_from_past = model(next_tokens, - attention_mask=next_attention_mask, - cache=past, - return_dict=self.parent.return_dict)[0] - self.parent.assertTrue( - output_from_past.shape[1] == next_tokens.shape[1]) + output_from_no_past = model( + next_input_ids, attention_mask=next_attention_mask, return_dict=self.parent.return_dict + )[0] + output_from_past = model( + next_tokens, attention_mask=next_attention_mask, cache=past, return_dict=self.parent.return_dict + )[0] + self.parent.assertTrue(output_from_past.shape[1] == next_tokens.shape[1]) # select random slice - random_slice_idx = ids_tensor((1, ), - output_from_past.shape[-1], - dtype="int64").item() - output_from_no_past_slice = output_from_no_past[:, -3:, - random_slice_idx].detach( - ) - output_from_past_slice = output_from_past[:, :, - random_slice_idx].detach() + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1], dtype="int64").item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() # test that outputs are equal for slice - self.parent.assertTrue( - paddle.allclose(output_from_past_slice, - output_from_no_past_slice, - atol=1e-3)) + self.parent.assertTrue(paddle.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - def create_and_check_lm_head_model(self, config, input_ids, input_mask, - *args): + def create_and_check_lm_head_model(self, config, input_ids, input_mask, *args): base_model = CodeGenModel(**config) model = CodeGenForCausalLM(base_model) - outputs = model(input_ids, - labels=input_ids if self.parent.use_labels else None, - return_dict=self.parent.return_dict) + outputs = model( + input_ids, labels=input_ids if self.parent.use_labels else None, return_dict=self.parent.return_dict + ) if self.parent.use_labels: loss, logits = outputs[:2] self.parent.assertEqual(loss.shape, [1]) else: logits = outputs[0] - self.parent.assertEqual( - logits.shape, [self.batch_size, self.seq_length, self.vocab_size]) + self.parent.assertEqual(logits.shape, [self.batch_size, self.seq_length, self.vocab_size]) - def create_and_check_forward_and_backwards(self, config, input_ids, - input_mask, *args): + def create_and_check_forward_and_backwards(self, config, input_ids, input_mask, *args): base_model = CodeGenModel(**config) model = CodeGenForCausalLM(base_model) - loss, logits = model(input_ids, - return_dict=self.parent.return_dict, - labels=input_ids)[:2] + loss, logits = model(input_ids, return_dict=self.parent.return_dict, labels=input_ids)[:2] self.parent.assertEqual(loss.shape, [1]) - self.parent.assertEqual( - logits.shape, [self.batch_size, self.seq_length, self.vocab_size]) - result.loss.backward() + self.parent.assertEqual(logits.shape, [self.batch_size, self.seq_length, self.vocab_size]) + loss.backward() def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() @@ -395,20 +323,20 @@ def prepare_config_and_inputs_for_common(self): return config, inputs_dict -@parameterized_class(("return_dict", ), [ - [False, False], - [False, True], - [True, False], - [True, True], -]) -class CodeGenModelTest(ModelTesterMixin, GenerationTesterMixin, - unittest.TestCase): +@parameterized_class( + ("return_dict",), + [ + [False, False], + [False, True], + [True, 
False], + [True, True], + ], +) +class CodeGenModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): base_model_class = CodeGenModel all_model_classes = (CodeGenModel, CodeGenForCausalLM) - all_generative_model_classes = { - CodeGenForCausalLM: (CodeGenModel, "transformer") - } + all_generative_model_classes = {CodeGenForCausalLM: (CodeGenModel, "transformer")} fx_compatible = False test_pruning = False test_missing_keys = False @@ -421,8 +349,7 @@ class CodeGenModelTest(ModelTesterMixin, GenerationTesterMixin, # attention mask issue def _get_input_ids_and_config(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( - ) + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() input_ids = inputs_dict[self.input_name] attention_mask = paddle.zeros_like(input_ids, dtype=paddle.float32) @@ -430,15 +357,12 @@ def _get_input_ids_and_config(self): max_batch_size = 2 sequence_length = input_ids.shape[-1] // 2 input_ids = input_ids[:max_batch_size, :sequence_length] - attention_mask = attention_mask[:max_batch_size, : - sequence_length].unsqueeze([1, 2]) + attention_mask = attention_mask[:max_batch_size, :sequence_length].unsqueeze([1, 2]) # generate max 3 tokens max_length = 3 - if config.get( - "eos_token_id", - None) is not None and config.get("pad_token_id", None) is None: + if config.get("eos_token_id", None) is not None and config.get("pad_token_id", None) is None: # hack to allow generate for models such as GPT2 as is done in `generate()` config["pad_token_id"] = config["eos_token_id"] @@ -458,18 +382,15 @@ def test_codegen_model(self): def test_codegen_model_past(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_codegen_model_past( - *config_and_inputs) + self.model_tester.create_and_check_codegen_model_past(*config_and_inputs) def test_codegen_model_att_mask_past(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_codegen_model_attention_mask_past( - *config_and_inputs) + self.model_tester.create_and_check_codegen_model_attention_mask_past(*config_and_inputs) def test_codegen_model_past_large_inputs(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_codegen_model_past_large_inputs( - *config_and_inputs) + self.model_tester.create_and_check_codegen_model_past_large_inputs(*config_and_inputs) def test_codegen_lm_head_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() @@ -477,26 +398,20 @@ def test_codegen_lm_head_model(self): @slow def test_batch_generation(self): - tokenizer = AutoTokenizer.from_pretrained( - "Salesforce/codegen-350M-mono") - model = CodeGenForCausalLM.from_pretrained( - "Salesforce/codegen-350M-mono") + tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono") + model = CodeGenForCausalLM.from_pretrained("Salesforce/codegen-350M-mono") model.eval() tokenizer.padding_side = "left" # Define PAD Token = EOS Token = 50256 tokenizer.pad_token = tokenizer.eos_token - model.transformer.config["pad_token_id"] = model.transformer.config[ - "eos_token_id"] + model.transformer.config["pad_token_id"] = model.transformer.config["eos_token_id"] # use different length sentences to test batching sentences = ["def hellow_world():", "def greet(name):"] - inputs = tokenizer(sentences, - return_tensors="pd", - padding=True, - return_attention_mask=True) + inputs = tokenizer(sentences, 
return_tensors="pd", padding=True, return_attention_mask=True) input_ids = inputs["input_ids"] outputs, _ = model.generate( @@ -504,21 +419,16 @@ def test_batch_generation(self): attention_mask=inputs["attention_mask"], ) - inputs_non_padded = tokenizer(sentences[0], - return_tensors="pd")["input_ids"] + inputs_non_padded = tokenizer(sentences[0], return_tensors="pd")["input_ids"] output_non_padded, _ = model.generate(input_ids=inputs_non_padded) - inputs_padded = tokenizer(sentences[1], - return_tensors="pd")["input_ids"] + inputs_padded = tokenizer(sentences[1], return_tensors="pd")["input_ids"] output_padded, _ = model.generate(input_ids=inputs_padded) - batch_out_sentence = tokenizer.batch_decode(outputs, - skip_special_tokens=True) + batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True) - non_padded_sentence = tokenizer.decode(output_non_padded[0], - skip_special_tokens=True) - padded_sentence = tokenizer.decode(output_padded[0], - skip_special_tokens=True) + non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True) + padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True) expected_output_sentence = [ '\n print("Hello World")\n\nhellow_world()\n\n#', @@ -526,8 +436,7 @@ def test_batch_generation(self): ] self.assertListEqual(expected_output_sentence, batch_out_sentence) - self.assertListEqual(expected_output_sentence, - [non_padded_sentence, padded_sentence]) + self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence]) @slow def test_model_from_pretrained(self): @@ -542,70 +451,56 @@ def test_model_name_list(self): @slow def test_auto_tokenizer(self): for model_name in CODEGEN_PRETRAINED_MODEL_ARCHIVE_LIST: - tokenizer = AutoTokenizer.from_pretrained(model_name) + AutoTokenizer.from_pretrained(model_name) class CodeGenModelLanguageGenerationTest(unittest.TestCase): - @slow def test_lm_generate_codegen(self): - tokenizer = AutoTokenizer.from_pretrained( - "Salesforce/codegen-350M-mono") - model = CodeGenForCausalLM.from_pretrained( - "Salesforce/codegen-350M-mono") + tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono") + model = CodeGenForCausalLM.from_pretrained("Salesforce/codegen-350M-mono") model.eval() - inputs = tokenizer("def hello_world():", - return_tensors="pd", - return_attention_mask=True, - return_token_type_ids=False) + inputs = tokenizer( + "def hello_world():", return_tensors="pd", return_attention_mask=True, return_token_type_ids=False + ) expected_output = '\n print("Hello World")\n\nhello_world()\n\n#' - output_ids, _ = model.generate(**inputs, - decode_strategy="sampling", - top_k=1) + output_ids, _ = model.generate(**inputs, decode_strategy="sampling", top_k=1) output_str = tokenizer.batch_decode(output_ids)[0] self.assertEqual(output_str, expected_output) @slow def test_codegen_sample(self): - tokenizer = AutoTokenizer.from_pretrained( - "Salesforce/codegen-350M-mono") - model = CodeGenForCausalLM.from_pretrained( - "Salesforce/codegen-350M-mono") + tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono") + model = CodeGenForCausalLM.from_pretrained("Salesforce/codegen-350M-mono") model.eval() - tokenized = tokenizer("def hello_world():", - return_tensors="pd", - return_token_type_ids=True, - return_attention_mask=True) + tokenized = tokenizer( + "def hello_world():", return_tensors="pd", return_token_type_ids=True, return_attention_mask=True + ) input_ids = tokenized["input_ids"] - output_ids, _ = model.generate(input_ids, 
- decode_strategy="sampling", - top_k=1) + output_ids, _ = model.generate(input_ids, decode_strategy="sampling", top_k=1) output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True) token_type_ids = tokenized.token_type_ids - output_seq, _ = model.generate(input_ids=input_ids, - decode_strategy="sampling", - top_k=1, - num_return_sequences=5) - output_seq_tt, _ = model.generate(input_ids=input_ids, - token_type_ids=token_type_ids, - decode_strategy="sampling", - top_k=1, - num_return_sequences=5) - output_seq_strs = tokenizer.batch_decode(output_seq, - skip_special_tokens=True) - output_seq_tt_strs = tokenizer.batch_decode(output_seq_tt, - skip_special_tokens=True) + output_seq, _ = model.generate( + input_ids=input_ids, decode_strategy="sampling", top_k=1, num_return_sequences=5 + ) + output_seq_tt, _ = model.generate( + input_ids=input_ids, + token_type_ids=token_type_ids, + decode_strategy="sampling", + top_k=1, + num_return_sequences=5, + ) + output_seq_strs = tokenizer.batch_decode(output_seq, skip_special_tokens=True) + output_seq_tt_strs = tokenizer.batch_decode(output_seq_tt, skip_special_tokens=True) EXPECTED_OUTPUT_STR = '\n print("Hello World")\n\nhello_world()\n\n#' self.assertEqual(output_str, EXPECTED_OUTPUT_STR) self.assertTrue( - all([ - output_seq_strs[idx] != output_seq_tt_strs[idx] - for idx in range(len(output_seq_tt_strs)) - ])) # token_type_ids should change output + all([output_seq_strs[idx] != output_seq_tt_strs[idx] for idx in range(len(output_seq_tt_strs))]) + ) # token_type_ids should change output diff --git a/tests/transformers/mbart/test_modeling.py b/tests/transformers/mbart/test_modeling.py index 982da6b5d9f4..c2fda789587b 100644 --- a/tests/transformers/mbart/test_modeling.py +++ b/tests/transformers/mbart/test_modeling.py @@ -13,15 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import tempfile import copy -from tests.testing_utils import slow, PaddleNLPModelTest - -from ..test_generation_utils import GenerationTesterMixin -from ..test_modeling_common import ModelTesterMixin, ids_tensor -from parameterized import parameterized_class +import tempfile import paddle +from parameterized import parameterized_class from paddlenlp.transformers import ( AutoTokenizer, @@ -30,7 +26,11 @@ MBartForSequenceClassification, MBartModel, ) -from paddlenlp.transformers.mbart.modeling import MBartDecoder, MBartEncoder +from paddlenlp.transformers.mbart.modeling import MBartDecoder +from tests.testing_utils import PaddleNLPModelTest, slow + +from ..test_generation_utils import GenerationTesterMixin +from ..test_modeling_common import ModelTesterMixin, ids_tensor def prepare_mbart_inputs_dict( @@ -41,11 +41,11 @@ def prepare_mbart_inputs_dict( decoder_attention_mask=None, ): if attention_mask is None: - attention_mask = (input_ids == config["pad_token_id"] - ).astype("float32").unsqueeze([1, 2]) * -1e4 + attention_mask = (input_ids == config["pad_token_id"]).astype("float32").unsqueeze([1, 2]) * -1e4 if decoder_attention_mask is None: - decoder_attention_mask = (decoder_input_ids == config["pad_token_id"] - ).astype("float32").unsqueeze([1, 2]) * -1e4 + decoder_attention_mask = (decoder_input_ids == config["pad_token_id"]).astype("float32").unsqueeze( + [1, 2] + ) * -1e4 return { "input_ids": input_ids, "decoder_input_ids": decoder_input_ids, @@ -55,7 +55,6 @@ def prepare_mbart_inputs_dict( class MBartModelTester: - def __init__( self, parent, @@ -108,22 +107,14 @@ def __init__( self.forced_eos_token_id = None def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], - self.vocab_size, - dtype="int64") - input_ids = paddle.clip( - ids_tensor([self.batch_size, self.seq_length], - self.vocab_size, - dtype="int64"), 3) + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size, dtype="int64") + input_ids = paddle.clip(ids_tensor([self.batch_size, self.seq_length], self.vocab_size, dtype="int64"), 3) input_ids[:, -1] = self.eos_token_id # Eos Token - decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], - self.vocab_size, - dtype="int64") + decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size, dtype="int64") config = self.get_config() - inputs_dict = prepare_mbart_inputs_dict(config, input_ids, - decoder_input_ids) + inputs_dict = prepare_mbart_inputs_dict(config, input_ids, decoder_input_ids) return config, inputs_dict def get_config(self): @@ -153,84 +144,69 @@ def prepare_config_and_inputs_for_common(self): config, inputs_dict = self.prepare_config_and_inputs() return config, inputs_dict - def create_and_check_decoder_model_past_large_inputs( - self, config, inputs_dict): + def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): model = MBartModel(**config).get_decoder() model.eval() input_ids = inputs_dict["input_ids"] attention_mask = inputs_dict["attention_mask"] cache = model.decoder.gen_cache( - paddle.randn(shape=[ - input_ids.shape[0], input_ids.shape[1], config["d_model"] - ])) + paddle.randn(shape=[input_ids.shape[0], input_ids.shape[1], config["d_model"]]) + ) # first forward pass - outputs = model(input_ids, - decoder_attention_mask=attention_mask, - cache=cache, - return_dict=self.parent.return_dict) + outputs = model( + input_ids, decoder_attention_mask=attention_mask, cache=cache, return_dict=self.parent.return_dict + ) output, past_key_values = 
outputs[:2] # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), - config["vocab_size"], - dtype="int64") - next_attn_mask = (1 - ids_tensor( - (self.batch_size, 3), 2, dtype="int64").unsqueeze( - [1, 2])).astype("float32") * -1e4 + next_tokens = ids_tensor((self.batch_size, 3), config["vocab_size"], dtype="int64") + next_attn_mask = (1 - ids_tensor((self.batch_size, 3), 2, dtype="int64").unsqueeze([1, 2])).astype( + "float32" + ) * -1e4 # append to next input_ids and next_input_ids = paddle.concat([input_ids, next_tokens], axis=-1) - next_attention_mask = paddle.concat([attention_mask, next_attn_mask], - axis=-1) + next_attention_mask = paddle.concat([attention_mask, next_attn_mask], axis=-1) - output_from_no_past = model(next_input_ids, - decoder_attention_mask=next_attention_mask, - cache=None, - return_dict=self.parent.return_dict) + output_from_no_past = model( + next_input_ids, decoder_attention_mask=next_attention_mask, cache=None, return_dict=self.parent.return_dict + ) if self.parent.return_dict: output_from_no_past = output_from_no_past[0] - output_from_past = model(next_tokens, - decoder_attention_mask=next_attention_mask, - cache=past_key_values)[0] + output_from_past = model(next_tokens, decoder_attention_mask=next_attention_mask, cache=past_key_values)[0] # select random slice - random_slice_idx = ids_tensor((1, ), - output_from_past.shape[-1], - dtype="int64").item() - output_from_no_past_slice = output_from_no_past[:, -3:, - random_slice_idx].detach( - ) - output_from_past_slice = output_from_past[:, :, - random_slice_idx].detach() - - self.parent.assertTrue( - output_from_past_slice.shape[1] == next_tokens.shape[1]) + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1], dtype="int64").item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) # test that outputs are equal for slice - self.parent.assertTrue( - paddle.allclose(output_from_past_slice, - output_from_no_past_slice, - atol=1e-3)) - - -@parameterized_class(("return_dict", ), [ - [False], - [True], -]) -class MBartModelTest(ModelTesterMixin, GenerationTesterMixin, - PaddleNLPModelTest): + self.parent.assertTrue(paddle.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + +@parameterized_class( + ("return_dict",), + [ + [False], + [True], + ], +) +class MBartModelTest(ModelTesterMixin, GenerationTesterMixin, PaddleNLPModelTest): base_model_class = MBartModel - all_model_classes = (MBartModel, MBartForConditionalGeneration, - MBartForSequenceClassification, - MBartForQuestionAnswering) + all_model_classes = ( + MBartModel, + MBartForConditionalGeneration, + MBartForSequenceClassification, + MBartForQuestionAnswering, + ) - all_generative_model_classes = { - MBartForConditionalGeneration: (MBartModel, "mbart") - } + all_generative_model_classes = {MBartForConditionalGeneration: (MBartModel, "mbart")} is_encoder_decoder = True test_missing_keys = False return_dict = False @@ -245,26 +221,23 @@ def test_save_load_strict(self): with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname) - model2 = model_class.from_pretrained(tmpdirname) + model_class.from_pretrained(tmpdirname) def test_decoder_model_past_with_large_inputs(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() - 
self.model_tester.create_and_check_decoder_model_past_large_inputs( - *config_and_inputs) + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) def test_inputs_embeds_for_mbart(self): # NOTE: rewrite test inputs embeds for mbart model since scaler not equal to 1.0 # get config for model and inputs_dict for model forward - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( - ) - scaler = config["d_model"]**0.5 + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + scaler = config["d_model"] ** 0.5 # test all model classes for model_class in self.all_model_classes: model = self._make_model_instance(config, model_class) model.eval() - inputs = copy.deepcopy( - self._prepare_for_class(inputs_dict, model_class)) + inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) with paddle.no_grad(): ids_output = model(**inputs) @@ -274,8 +247,7 @@ def test_inputs_embeds_for_mbart(self): del inputs["input_ids"] else: encoder_input_ids = inputs["input_ids"] - decoder_input_ids = inputs.get("decoder_input_ids", - encoder_input_ids) + decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) del inputs["input_ids"] inputs.pop("decoder_input_ids", None) @@ -284,15 +256,12 @@ def test_inputs_embeds_for_mbart(self): inputs["inputs_embeds"] = wte(input_ids) * scaler else: inputs["inputs_embeds"] = wte(encoder_input_ids) * scaler - inputs["decoder_inputs_embeds"] = wte( - decoder_input_ids) * scaler + inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) * scaler with paddle.no_grad(): embeds_output = model(**inputs) - self.assertTrue( - paddle.allclose(ids_output, embeds_output, rtol=1e-4, - atol=1e-4)) + self.assertTrue(paddle.allclose(ids_output, embeds_output, rtol=1e-4, atol=1e-4)) def assert_tensors_close(a, b, atol=1e-12, prefix=""): @@ -304,8 +273,7 @@ def assert_tensors_close(a, b, atol=1e-12, prefix=""): return True raise except Exception: - pct_different = (paddle.greater_than((a - b).abs(), - atol)).float().mean().item() + pct_different = (paddle.greater_than((a - b).abs(), atol)).float().mean().item() if a.numel() > 100: msg = f"tensor values are {pct_different:.1%} percent different." 
else: @@ -330,16 +298,18 @@ def setUpClass(cls): def model(self): """Only load the model if needed.""" - model = MBartForConditionalGeneration.from_pretrained( - self.checkpoint_name) + model = MBartForConditionalGeneration.from_pretrained(self.checkpoint_name) model.eval() return model -@parameterized_class(("return_dict", ), [ - [False], - [True], -]) +@parameterized_class( + ("return_dict",), + [ + [False], + [True], + ], +) class MBartEnroIntegrationTest(AbstractSeq2SeqIntegrationTest): checkpoint_name = "mbart-large-en-ro" src_text = [ @@ -347,40 +317,30 @@ class MBartEnroIntegrationTest(AbstractSeq2SeqIntegrationTest): """ Secretary-General Ban Ki-moon says his response to Russia's stepped up military support for Syria is that "there is no military solution" to the nearly five-year conflict and more weapons will only worsen the violence and misery for millions of people.""", ] tgt_text = [ - 'Şeful ONU declară că nu există o soluţie militară în Siria', + "Şeful ONU declară că nu există o soluţie militară în Siria", 'Secretarul General Ban Ki-moon declară că răspunsul său la intensificarea sprijinului militar acordat de Rusia Siriei este că "nu există o soluţie militară" la conflictul de aproape cinci ani şi că noi arme nu vor face decât să înrăutăţească violenţele şi mizeria a milioane de oameni.', ] - expected_src_tokens = [ - 8274, 127873, 25916, 7, 8622, 2071, 438, 67485, 53, 187895, 23, 51712, - 2, 250004 - ] + expected_src_tokens = [8274, 127873, 25916, 7, 8622, 2071, 438, 67485, 53, 187895, 23, 51712, 2, 250004] return_dict = False @slow def test_enro_generate_one(self): batch = self.tokenizer( - ["UN Chief Says There Is No Military Solution in Syria"], - return_tensors="pd", - return_token_type_ids=False) + ["UN Chief Says There Is No Military Solution in Syria"], return_tensors="pd", return_token_type_ids=False + ) model = self.model() translated_tokens = model.generate(**batch, max_length=128)[0] - decoded = self.tokenizer.batch_decode(translated_tokens, - skip_special_tokens=True) + decoded = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True) self.assertEqual(self.tgt_text[0], decoded[0]) @slow def test_enro_generate_batch(self): - batch = self.tokenizer(self.src_text, - return_tensors="pd", - padding=True, - truncation=True, - return_token_type_ids=False) + batch = self.tokenizer( + self.src_text, return_tensors="pd", padding=True, truncation=True, return_token_type_ids=False + ) model = self.model() - translated_tokens = model.generate(**batch, - max_length=128, - decode_strategy="greedy_search")[0] - decoded = self.tokenizer.batch_decode(translated_tokens, - skip_special_tokens=True) + translated_tokens = model.generate(**batch, max_length=128, decode_strategy="greedy_search")[0] + decoded = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True) for i in range(len(self.tgt_text)): assert str(self.tgt_text[i]) == str(decoded[i]), f"{i}" @@ -399,15 +359,11 @@ def test_mbart_fast_forward(self): } base_model = MBartModel(**config) lm_model = MBartForConditionalGeneration(base_model) - context = paddle.to_tensor( - [[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]], - dtype="int64") - summary = paddle.to_tensor([[82, 71, 82, 18, 2], [58, 68, 2, 1, 1]], - dtype="int64") - loss, logits = lm_model(input_ids=context, - decoder_input_ids=summary, - labels=summary, - return_dict=self.return_dict)[:2] + context = paddle.to_tensor([[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]], dtype="int64") + summary = paddle.to_tensor([[82, 
71, 82, 18, 2], [58, 68, 2, 1, 1]], dtype="int64") + loss, logits = lm_model( + input_ids=context, decoder_input_ids=summary, labels=summary, return_dict=self.return_dict + )[:2] expected_shape = [*summary.shape, config["vocab_size"]] self.assertIsInstance(loss.item(), float) self.assertEqual(logits.shape, expected_shape) @@ -419,28 +375,22 @@ class MBartCC25IntegrationTest(AbstractSeq2SeqIntegrationTest): " UN Chief Says There Is No Military Solution in Syria", " I ate lunch twice yesterday", ] - tgt_text = [ - "Şeful ONU declară că nu există o soluţie militară în Siria", - "to be padded" - ] + tgt_text = ["Şeful ONU declară că nu există o soluţie militară în Siria", "to be padded"] @slow def test_fill_mask(self): - inputs = self.tokenizer(["One of the best I ever read!"], - return_tensors="pd") + inputs = self.tokenizer(["One of the best I ever read!"], return_tensors="pd") model = self.model() - outputs = model.generate( - inputs["input_ids"], - decoder_start_token_id=self.tokenizer.lang_code_to_id["en_XX"])[0] - prediction = self.tokenizer.batch_decode( - outputs, - clean_up_tokenization_spaces=True, - skip_special_tokens=True)[0] + outputs = model.generate(inputs["input_ids"], decoder_start_token_id=self.tokenizer.lang_code_to_id["en_XX"])[ + 0 + ] + prediction = self.tokenizer.batch_decode(outputs, clean_up_tokenization_spaces=True, skip_special_tokens=True)[ + 0 + ] self.assertEqual(prediction, "of the best books I ever read!") class MBartStandaloneDecoderModelTester: - def __init__( self, parent, @@ -497,22 +447,15 @@ def __init__( self.decoder_attention_idx = 1 def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], - self.vocab_size, - dtype="int64") + input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size, dtype="int64") attention_mask = None if self.use_attention_mask: - attention_mask = ids_tensor( - [self.batch_size, 1, 1, self.decoder_seq_length], - vocab_size=2, - dtype="int64") + attention_mask = ids_tensor([self.batch_size, 1, 1, self.decoder_seq_length], vocab_size=2, dtype="int64") lm_labels = None if self.parent.use_labels: - lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], - self.vocab_size, - dtype="int64") + lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size, dtype="int64") config = { "embed_tokens": None, @@ -551,51 +494,35 @@ def create_and_check_decoder_model_past( origin_cache = model.decoder.gen_cache(encoder_output) # first forward pass - outputs = model(input_ids, - cache=origin_cache, - return_dict=self.parent.return_dict) - outputs_use_cache_conf = model(input_ids, - return_dict=self.parent.return_dict) - outputs_no_past = model(input_ids, - cache=None, - return_dict=self.parent.return_dict) + outputs = model(input_ids, cache=origin_cache, return_dict=self.parent.return_dict) + # outputs_use_cache_conf = model(input_ids, return_dict=self.parent.return_dict) + outputs_no_past = model(input_ids, cache=None, return_dict=self.parent.return_dict) # self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) # didn't support using cache by config yet if not self.parent.return_dict: - self.parent.assertTrue(len(outputs) == len((outputs_no_past, )) + 1) + self.parent.assertTrue(len(outputs) == len((outputs_no_past,)) + 1) else: self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) past_key_values = outputs[1] # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), - 
config["vocab_size"], - dtype="int64") + next_tokens = ids_tensor((self.batch_size, 1), config["vocab_size"], dtype="int64") # append to next input_ids and next_input_ids = paddle.concat([input_ids, next_tokens], axis=-1) - output_from_no_past = model(next_input_ids, - return_dict=self.parent.return_dict) + output_from_no_past = model(next_input_ids, return_dict=self.parent.return_dict) if self.parent.return_dict: output_from_no_past = output_from_no_past[0] - output_from_past = model(next_tokens, - cache=past_key_values, - return_dict=self.parent.return_dict)[0] + output_from_past = model(next_tokens, cache=past_key_values, return_dict=self.parent.return_dict)[0] # select random slice - random_slice_idx = ids_tensor((1, ), - output_from_past.shape[-1], - dtype="int64").item() - output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[ - -1] - 1, random_slice_idx].detach() - output_from_past_slice = output_from_past[:, 0, - random_slice_idx].detach() + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1], dtype="int64").item() + output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() # test that outputs are equal for slice - assert paddle.allclose(output_from_past_slice, - output_from_no_past_slice, - atol=1e-3) + assert paddle.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3) def create_and_check_decoder_model_attention_mask_past( self, @@ -617,10 +544,6 @@ def create_and_check_decoder_model_attention_mask_past( encoder_output = paddle.randn(shape=input_ids.shape + [self.d_model]) origin_cache = model.decoder.gen_cache(encoder_output) - cache = model.decoder.gen_cache( - paddle.randn(shape=[ - input_ids.shape[0], input_ids.shape[1], config["d_model"] - ])) # first forward pass past_key_values = model( @@ -628,54 +551,40 @@ def create_and_check_decoder_model_attention_mask_past( # attention_mask=attn_mask, decoder_attention_mask=attn_mask, cache=origin_cache, - return_dict=self.parent.return_dict)[1] + return_dict=self.parent.return_dict, + )[1] # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), - config["vocab_size"], - dtype="int64") + next_tokens = ids_tensor((self.batch_size, 1), config["vocab_size"], dtype="int64") # change a random masked slice from input_ids - random_seq_idx_to_change = ids_tensor( - (1, ), half_seq_length, dtype="int64").item() + 1 - random_other_next_tokens = ids_tensor((self.batch_size, 1), - config["vocab_size"], - dtype="int64").squeeze(-1) + random_seq_idx_to_change = ids_tensor((1,), half_seq_length, dtype="int64").item() + 1 + random_other_next_tokens = ids_tensor((self.batch_size, 1), config["vocab_size"], dtype="int64").squeeze(-1) input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens # append to next input_ids and attn_mask next_input_ids = paddle.concat([input_ids, next_tokens], axis=-1) attn_mask = paddle.concat( - [ - attn_mask, - paddle.ones((attn_mask.shape[0], 1, 1, 1), dtype="int64") - ], + [attn_mask, paddle.ones((attn_mask.shape[0], 1, 1, 1), dtype="int64")], axis=-1, ) # get two different outputs - output_from_no_past = model(next_input_ids, - decoder_attention_mask=attn_mask, - return_dict=self.parent.return_dict) + output_from_no_past = model( + next_input_ids, decoder_attention_mask=attn_mask, return_dict=self.parent.return_dict + ) if self.parent.return_dict: output_from_no_past = 
output_from_no_past[0] - output_from_past = model(next_tokens, - decoder_attention_mask=attn_mask, - cache=past_key_values, - return_dict=self.parent.return_dict)[0] + output_from_past = model( + next_tokens, decoder_attention_mask=attn_mask, cache=past_key_values, return_dict=self.parent.return_dict + )[0] # select random slice - random_slice_idx = ids_tensor((1, ), - output_from_past.shape[-1], - dtype="int64").item() - output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[ - -1] - 1, random_slice_idx].detach() - output_from_past_slice = output_from_past[:, 0, - random_slice_idx].detach() + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1], dtype="int64").item() + output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() # test that outputs are equal for slice - assert paddle.allclose(output_from_past_slice, - output_from_no_past_slice, - atol=1e-3) + assert paddle.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() @@ -693,14 +602,16 @@ def prepare_config_and_inputs_for_common(self): return config, inputs_dict -@parameterized_class(("return_dict", "use_labels"), [ - [False, False], - [False, True], - [True, False], - [True, True], -]) -class MBartStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, - PaddleNLPModelTest): +@parameterized_class( + ("return_dict", "use_labels"), + [ + [False, False], + [False, True], + [True, False], + [True, True], + ], +) +class MBartStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, PaddleNLPModelTest): base_model_class = MBartModel all_model_classes = () @@ -710,18 +621,15 @@ class MBartStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, use_labels = False def setUp(self): - self.model_tester = MBartStandaloneDecoderModelTester(self, - is_training=False) + self.model_tester = MBartStandaloneDecoderModelTester(self, is_training=False) def test_decoder_model_past(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past( - *config_and_inputs) + self.model_tester.create_and_check_decoder_model_past(*config_and_inputs) def test_decoder_model_attn_mask_past(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_attention_mask_past( - *config_and_inputs) + self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) def test_retain_grad_hidden_states_attentions(self): # decoder cannot keep gradients diff --git a/tests/transformers/unified_transformer/test_modeling.py b/tests/transformers/unified_transformer/test_modeling.py index 3759a08633ed..851b80fdb436 100644 --- a/tests/transformers/unified_transformer/test_modeling.py +++ b/tests/transformers/unified_transformer/test_modeling.py @@ -12,28 +12,24 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import datetime -import math -import unittest -import numpy as np import random +import unittest -from tests.testing_utils import slow -from parameterized import parameterized_class - -from ..test_generation_utils import GenerationTesterMixin -from ..test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask - +import numpy as np import paddle import paddle.nn as nn +from parameterized import parameterized_class + +from paddlenlp.data import Pad from paddlenlp.transformers import ( - UnifiedTransformerModel, UnifiedTransformerLMHeadModel, - UnifiedTransformerForMaskedLM, + UnifiedTransformerModel, UnifiedTransformerTokenizer, ) -from paddlenlp.data import Pad -from paddlenlp.data import DataCollatorWithPadding +from tests.testing_utils import slow + +from ..test_generation_utils import GenerationTesterMixin +from ..test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask UNIFIED_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ "unified_transformer-12L-cn", @@ -43,37 +39,31 @@ def batchify_fn(batch_examples, pad_val): - def pad_mask(batch_attention_mask): batch_size = len(batch_attention_mask) max_len = max(map(len, batch_attention_mask)) - attention_mask = np.ones( - (batch_size, max_len, max_len), dtype='float32') * -1e4 + attention_mask = np.ones((batch_size, max_len, max_len), dtype="float32") * -1e4 for i, mask_data in enumerate(attention_mask): seq_len = len(batch_attention_mask[i]) - mask_data[-seq_len:, -seq_len:] = np.array(batch_attention_mask[i], - dtype='float32') + mask_data[-seq_len:, -seq_len:] = np.array(batch_attention_mask[i], dtype="float32") # In order to ensure the correct broadcasting mechanism, expand one # dimension to the second dimension (n_head of Transformer). attention_mask = np.expand_dims(attention_mask, axis=1) return attention_mask - pad_func = Pad(pad_val=pad_val, pad_right=False, dtype='int64') + pad_func = Pad(pad_val=pad_val, pad_right=False, dtype="int64") - input_ids = pad_func([example['input_ids'] for example in batch_examples]) - token_type_ids = pad_func( - [example['token_type_ids'] for example in batch_examples]) - position_ids = pad_func( - [example['position_ids'] for example in batch_examples]) + input_ids = pad_func([example["input_ids"] for example in batch_examples]) + token_type_ids = pad_func([example["token_type_ids"] for example in batch_examples]) + position_ids = pad_func([example["position_ids"] for example in batch_examples]) - attention_mask = pad_mask( - [example['attention_mask'] for example in batch_examples]) + attention_mask = pad_mask([example["attention_mask"] for example in batch_examples]) return { "input_ids": paddle.to_tensor(input_ids, dtype="int64"), "token_type_ids": paddle.to_tensor(token_type_ids, dtype="int64"), "position_ids": paddle.to_tensor(position_ids, dtype="int64"), - "attention_mask": paddle.to_tensor(attention_mask, dtype="float32") + "attention_mask": paddle.to_tensor(attention_mask, dtype="float32"), } @@ -92,29 +82,30 @@ def postprocess_response(token_ids, tokenizer): class UnifiedTransformerModelTester: - - def __init__(self, - parent, - is_training=True, - batch_size=14, - seq_length=7, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - normalize_before=True, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - unk_token_id=0, - pad_token_id=0, - bos_token_id=1, - 
eos_token_id=2, - role_type_size=None): + def __init__( + self, + parent, + is_training=True, + batch_size=14, + seq_length=7, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + normalize_before=True, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + unk_token_id=0, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + role_type_size=None, + ): self.parent = parent self.is_training = is_training self.batch_size = batch_size @@ -139,27 +130,20 @@ def __init__(self, self.role_type_size = role_type_size def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], - self.vocab_size, - dtype="int64") - input_mask = random_attention_mask([self.batch_size, self.seq_length], - dtype="int64").unsqueeze([1, 2]) - token_type_ids = ids_tensor([self.batch_size, self.seq_length], - self.type_vocab_size, - dtype="int64") + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size, dtype="int64") + input_mask = random_attention_mask([self.batch_size, self.seq_length], dtype="int64").unsqueeze([1, 2]) + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size, dtype="int64") position_ids = paddle.tile( - paddle.arange(end=self.seq_length, dtype="int64").reshape([1, -1]), - [self.batch_size, 1]) + paddle.arange(end=self.seq_length, dtype="int64").reshape([1, -1]), [self.batch_size, 1] + ) lm_labels = None if self.parent.use_labels: - lm_labels = ids_tensor([self.batch_size, self.seq_length], - self.vocab_size) + lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) config = self.get_config() - return (config, input_ids, input_mask, token_type_ids, position_ids, - lm_labels) + return (config, input_ids, input_mask, token_type_ids, position_ids, lm_labels) def get_config(self): return { @@ -180,86 +164,81 @@ def get_config(self): "bos_token_id": self.bos_token_id, "eos_token_id": self.eos_token_id, "mask_token_id": self.mask_token_id, - "role_type_size": self.role_type_size + "role_type_size": self.role_type_size, } def prepare_config_and_inputs_for_decoder(self): - (config, input_ids, input_mask, token_type_ids, position_ids, - lm_labels) = self.prepare_config_and_inputs() - return (config, input_ids, input_mask, token_type_ids, position_ids, - lm_labels) - - def create_and_check_unified_transformer_model(self, config, input_ids, - input_mask, token_type_ids, - position_ids, *args): + (config, input_ids, input_mask, token_type_ids, position_ids, lm_labels) = self.prepare_config_and_inputs() + return (config, input_ids, input_mask, token_type_ids, position_ids, lm_labels) + + def create_and_check_unified_transformer_model( + self, config, input_ids, input_mask, token_type_ids, position_ids, *args + ): model = UnifiedTransformerModel(**config) model.eval() - result, cache = model(input_ids, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=input_mask, - use_cache=True, - return_dict=self.parent.return_dict)[:2] + result, cache = model( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=input_mask, + use_cache=True, + return_dict=self.parent.return_dict, + )[:2] - self.parent.assertEqual( - result.shape, [self.batch_size, self.seq_length, self.hidden_size]) + self.parent.assertEqual(result.shape, [self.batch_size, self.seq_length, self.hidden_size]) 
self.parent.assertEqual(len(cache), config["num_hidden_layers"]) - def create_and_check_unified_transformer_model_past(self, config, input_ids, - input_mask, - token_type_ids, - position_ids, *args): + def create_and_check_unified_transformer_model_past( + self, config, input_ids, input_mask, token_type_ids, position_ids, *args + ): model = UnifiedTransformerModel(**config) model.eval() # first forward pass - outputs = model(input_ids, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=input_mask, - use_cache=True, - return_dict=self.parent.return_dict) - outputs_use_cache_conf = model(input_ids, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=input_mask, - return_dict=self.parent.return_dict) - outputs_no_past = model(input_ids, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=input_mask, - use_cache=False, - return_dict=self.parent.return_dict) - - self.parent.assertTrue( - len(outputs_no_past) == len(outputs_use_cache_conf)) + outputs = model( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=input_mask, + use_cache=True, + return_dict=self.parent.return_dict, + ) + outputs_use_cache_conf = model( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=input_mask, + return_dict=self.parent.return_dict, + ) + outputs_no_past = model( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=input_mask, + use_cache=False, + return_dict=self.parent.return_dict, + ) + + self.parent.assertTrue(len(outputs_no_past) == len(outputs_use_cache_conf)) output, past = outputs[:2] # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), - config["vocab_size"], - dtype="int64") - next_token_types = ids_tensor([self.batch_size, 1], - self.type_vocab_size, - dtype="int64") + next_tokens = ids_tensor((self.batch_size, 1), config["vocab_size"], dtype="int64") + next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size, dtype="int64") next_position = position_ids[:, -1:] + 1 # append to next input_ids and token_type_ids next_input_ids = paddle.concat([input_ids, next_tokens], axis=-1) - next_token_type_ids = paddle.concat([token_type_ids, next_token_types], - axis=-1) - next_position_ids = paddle.concat([position_ids, next_position], - axis=-1) + next_token_type_ids = paddle.concat([token_type_ids, next_token_types], axis=-1) + next_position_ids = paddle.concat([position_ids, next_position], axis=-1) input_mask_t = paddle.transpose(input_mask, perm=[0, 1, 3, 2]) input_mask = input_mask * input_mask_t - next_attention_mask = nn.Pad2D([0, 0, 0, 1], - mode='replicate')(input_mask) - next_attention_mask = nn.Pad2D([0, 1, 0, 0], - value=0)(next_attention_mask) + next_attention_mask = nn.Pad2D([0, 0, 0, 1], mode="replicate")(input_mask) + next_attention_mask = nn.Pad2D([0, 1, 0, 0], value=0)(next_attention_mask) next_attention_mask[:, :, -1, -1] = 1 output_from_no_past, cache = model( @@ -268,69 +247,57 @@ def create_and_check_unified_transformer_model_past(self, config, input_ids, position_ids=next_position_ids, attention_mask=next_attention_mask, use_cache=True, - return_dict=self.parent.return_dict)[:2] - output_from_past = model(next_tokens, - token_type_ids=next_token_types, - position_ids=next_position, - attention_mask=next_attention_mask[:, :, - -1:, :], - use_cache=True, - cache=past, - return_dict=self.parent.return_dict)[0] + 
return_dict=self.parent.return_dict, + )[:2] + output_from_past = model( + next_tokens, + token_type_ids=next_token_types, + position_ids=next_position, + attention_mask=next_attention_mask[:, :, -1:, :], + use_cache=True, + cache=past, + return_dict=self.parent.return_dict, + )[0] # select random slice - random_slice_idx = ids_tensor((1, ), - output_from_past.shape[-1], - dtype="int64").item() - output_from_no_past_slice = output_from_no_past[:, -1, - random_slice_idx].detach( - ) - output_from_past_slice = output_from_past[:, 0, - random_slice_idx].detach() + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1], dtype="int64").item() + output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() # test that outputs are equal for slice - self.parent.assertTrue( - paddle.allclose(output_from_past_slice, - output_from_no_past_slice, - atol=1e-3)) + self.parent.assertTrue(paddle.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) def create_and_check_unified_transformer_model_past_large_inputs( - self, config, input_ids, input_mask, token_type_ids, position_ids, - *args): + self, config, input_ids, input_mask, token_type_ids, position_ids, *args + ): model = UnifiedTransformerModel(**config) model.eval() # first forward pass - output, past = model(input_ids, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=input_mask, - use_cache=True, - return_dict=self.parent.return_dict)[:2] + output, past = model( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=input_mask, + use_cache=True, + return_dict=self.parent.return_dict, + )[:2] # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), - config["vocab_size"], - dtype="int64") - next_token_types = ids_tensor([self.batch_size, 3], - self.type_vocab_size, - dtype="int64") + next_tokens = ids_tensor((self.batch_size, 3), config["vocab_size"], dtype="int64") + next_token_types = ids_tensor([self.batch_size, 3], self.type_vocab_size, dtype="int64") next_position = position_ids[:, -3:] + 3 # append to next input_ids and token_type_ids next_input_ids = paddle.concat([input_ids, next_tokens], axis=-1) - next_token_type_ids = paddle.concat([token_type_ids, next_token_types], - axis=-1) - next_position_ids = paddle.concat([position_ids, next_position], - axis=-1) + next_token_type_ids = paddle.concat([token_type_ids, next_token_types], axis=-1) + next_position_ids = paddle.concat([position_ids, next_position], axis=-1) input_mask_t = paddle.transpose(input_mask, perm=[0, 1, 3, 2]) input_mask = input_mask * input_mask_t - next_attention_mask = nn.Pad2D([0, 0, 0, 3], - mode='replicate')(input_mask) - next_attention_mask = nn.Pad2D([0, 3, 0, 0], - value=0)(next_attention_mask) + next_attention_mask = nn.Pad2D([0, 0, 0, 3], mode="replicate")(input_mask) + next_attention_mask = nn.Pad2D([0, 3, 0, 0], value=0)(next_attention_mask) next_attention_mask[:, :, -1, -1] = 1 next_attention_mask[:, :, -2, -2] = 1 next_attention_mask[:, :, -3, -3] = 1 @@ -338,110 +305,104 @@ def create_and_check_unified_transformer_model_past_large_inputs( next_attention_mask[:, :, -3, -1] = 1 next_attention_mask[:, :, -3, -2] = 1 - output_from_no_past = model(next_input_ids, - token_type_ids=next_token_type_ids, - attention_mask=next_attention_mask, - position_ids=next_position_ids, - use_cache=False, - 
return_dict=self.parent.return_dict) + output_from_no_past = model( + next_input_ids, + token_type_ids=next_token_type_ids, + attention_mask=next_attention_mask, + position_ids=next_position_ids, + use_cache=False, + return_dict=self.parent.return_dict, + ) if self.parent.return_dict: output_from_no_past = output_from_no_past[0] - output_from_past = model(next_tokens, - token_type_ids=next_token_types, - attention_mask=next_attention_mask[:, :, - -3:, :], - position_ids=next_position, - cache=past, - use_cache=True, - return_dict=self.parent.return_dict)[0] - self.parent.assertTrue( - output_from_past.shape[1] == next_tokens.shape[1]) + output_from_past = model( + next_tokens, + token_type_ids=next_token_types, + attention_mask=next_attention_mask[:, :, -3:, :], + position_ids=next_position, + cache=past, + use_cache=True, + return_dict=self.parent.return_dict, + )[0] + self.parent.assertTrue(output_from_past.shape[1] == next_tokens.shape[1]) # select random slice - random_slice_idx = ids_tensor((1, ), - output_from_past.shape[-1], - dtype="int64").item() - output_from_no_past_slice = output_from_no_past[:, -3:, - random_slice_idx].detach( - ) - output_from_past_slice = output_from_past[:, :, - random_slice_idx].detach() + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1], dtype="int64").item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() # test that outputs are equal for slice - self.parent.assertTrue( - paddle.allclose(output_from_past_slice, - output_from_no_past_slice, - atol=1e-3)) - - def create_and_check_lm_head_model(self, config, input_ids, input_mask, - token_type_ids, position_ids, lm_labels, - *args): + self.parent.assertTrue(paddle.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_lm_head_model( + self, config, input_ids, input_mask, token_type_ids, position_ids, lm_labels, *args + ): base_model = UnifiedTransformerModel(**config) model = UnifiedTransformerLMHeadModel(base_model) model.eval() - outputs = model(input_ids, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=input_mask, - labels=lm_labels, - return_dict=self.parent.return_dict) + outputs = model( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=input_mask, + labels=lm_labels, + return_dict=self.parent.return_dict, + ) if self.parent.use_labels: loss, result = outputs[:2] self.parent.assertIsInstance(loss.item(), float) else: result = outputs[0] if self.parent.return_dict else outputs - self.parent.assertEqual( - result.shape, [self.batch_size, self.seq_length, self.vocab_size]) + self.parent.assertEqual(result.shape, [self.batch_size, self.seq_length, self.vocab_size]) - def create_and_check_forward_and_backwards(self, config, input_ids, - input_mask, token_type_ids, - position_ids, lm_head, *args): + def create_and_check_forward_and_backwards( + self, config, input_ids, input_mask, token_type_ids, position_ids, lm_head, *args + ): base_model = UnifiedTransformerModel(**config) model = UnifiedTransformerLMHeadModel(base_model) - loss, logits = model(input_ids, - token_type_ids=token_type_ids, - attention_mask=input_mask, - position_ids=position_ids, - label=input_ids, - return_dict=self.parent.return_dict)[:2] + loss, logits = model( + input_ids, + token_type_ids=token_type_ids, + attention_mask=input_mask, + position_ids=position_ids, + label=input_ids, + 
return_dict=self.parent.return_dict, + )[:2] self.parent.assertIsInstance(loss.item(), float) - self.parent.assertEqual( - logits.shape, [self.batch_size, self.seq_length, self.vocab_size]) + self.parent.assertEqual(logits.shape, [self.batch_size, self.seq_length, self.vocab_size]) loss.backward() def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, input_mask, token_type_ids, position_ids, - lm_labels) = config_and_inputs + (config, input_ids, input_mask, token_type_ids, position_ids, lm_labels) = config_and_inputs inputs_dict = { "input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask, - "position_ids": position_ids + "position_ids": position_ids, } return config, inputs_dict -@parameterized_class(("return_dict", "use_labels"), [ - [False, False], - [False, True], - [True, False], - [True, True], -]) -class UnifiedTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, - unittest.TestCase): +@parameterized_class( + ("return_dict", "use_labels"), + [ + [False, False], + [False, True], + [True, False], + [True, True], + ], +) +class UnifiedTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): base_model_class = UnifiedTransformerModel all_model_classes = (UnifiedTransformerModel, UnifiedTransformerLMHeadModel) - all_generative_model_classes = { - UnifiedTransformerLMHeadModel: - (UnifiedTransformerModel, "unified_transformer") - } + all_generative_model_classes = {UnifiedTransformerLMHeadModel: (UnifiedTransformerModel, "unified_transformer")} test_missing_keys = False use_test_inputs_embeds = True use_labels = False @@ -462,18 +423,15 @@ def setUp(self): def test_unified_transformer_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_unified_transformer_model( - *config_and_inputs) + self.model_tester.create_and_check_unified_transformer_model(*config_and_inputs) def test_unified_transformer_model_past(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_unified_transformer_model_past( - *config_and_inputs) + self.model_tester.create_and_check_unified_transformer_model_past(*config_and_inputs) def test_unified_transformer_model_past_large_inputs(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_unified_transformer_model_past_large_inputs( - *config_and_inputs) + self.model_tester.create_and_check_unified_transformer_model_past_large_inputs(*config_and_inputs) def test_unified_transformer_lm_head_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() @@ -494,9 +452,7 @@ def test_batch_generation(self): ] inputs = [] for seq in sentences: - inputs.append( - tokenizer.dialogue_encode(history=seq, - add_start_token_as_response=True)) + inputs.append(tokenizer.dialogue_encode(history=seq, add_start_token_as_response=True)) data = batchify_fn(inputs, tokenizer.pad_token_id) @@ -505,44 +461,36 @@ def test_batch_generation(self): token_type_ids = data["token_type_ids"] attention_mask = data["attention_mask"] - outputs, _ = model.generate(input_ids=input_ids, - position_ids=position_ids, - token_type_ids=token_type_ids, - attention_mask=attention_mask, - decode_strategy="greedy_search") + outputs, _ = model.generate( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + 
decode_strategy="greedy_search", + ) - data_non_padded = tokenizer.dialogue_encode( - sentences[0], add_start_token_as_response=True) + data_non_padded = tokenizer.dialogue_encode(sentences[0], add_start_token_as_response=True) output_non_padded, _ = model.generate( - input_ids=paddle.to_tensor(data_non_padded["input_ids"], - dtype="int64").reshape([1, -1]), - position_ids=paddle.to_tensor(data_non_padded["position_ids"], - dtype="int64").reshape([1, -1]), - token_type_ids=paddle.to_tensor(data_non_padded["token_type_ids"], - dtype="int64").reshape([1, -1]), - attention_mask=paddle.to_tensor(data_non_padded["attention_mask"], - dtype="float32").unsqueeze([0, 1]), - decode_strategy="greedy_search") - - data_padded = tokenizer.dialogue_encode( - sentences[1], add_start_token_as_response=True) + input_ids=paddle.to_tensor(data_non_padded["input_ids"], dtype="int64").reshape([1, -1]), + position_ids=paddle.to_tensor(data_non_padded["position_ids"], dtype="int64").reshape([1, -1]), + token_type_ids=paddle.to_tensor(data_non_padded["token_type_ids"], dtype="int64").reshape([1, -1]), + attention_mask=paddle.to_tensor(data_non_padded["attention_mask"], dtype="float32").unsqueeze([0, 1]), + decode_strategy="greedy_search", + ) + + data_padded = tokenizer.dialogue_encode(sentences[1], add_start_token_as_response=True) output_padded, _ = model.generate( - input_ids=paddle.to_tensor(data_padded["input_ids"], - dtype="int64").reshape([1, -1]), - position_ids=paddle.to_tensor(data_padded["position_ids"], - dtype="int64").reshape([1, -1]), - token_type_ids=paddle.to_tensor(data_padded["token_type_ids"], - dtype="int64").reshape([1, -1]), - attention_mask=paddle.to_tensor(data_padded["attention_mask"], - dtype="float32").unsqueeze([0, 1]), - decode_strategy="greedy_search") + input_ids=paddle.to_tensor(data_padded["input_ids"], dtype="int64").reshape([1, -1]), + position_ids=paddle.to_tensor(data_padded["position_ids"], dtype="int64").reshape([1, -1]), + token_type_ids=paddle.to_tensor(data_padded["token_type_ids"], dtype="int64").reshape([1, -1]), + attention_mask=paddle.to_tensor(data_padded["attention_mask"], dtype="float32").unsqueeze([0, 1]), + decode_strategy="greedy_search", + ) batch_out_sentence = [] for i in range(len(outputs)): - batch_out_sentence.append( - postprocess_response(outputs[i].numpy(), tokenizer)) - non_padded_sentence = postprocess_response(output_non_padded[0], - tokenizer) + batch_out_sentence.append(postprocess_response(outputs[i].numpy(), tokenizer)) + non_padded_sentence = postprocess_response(output_non_padded[0], tokenizer) padded_sentence = postprocess_response(output_padded[0], tokenizer) expected_output_sentence = [ @@ -550,12 +498,10 @@ def test_batch_generation(self): "是 啊 , 我 也 很开心", ] self.assertListEqual(expected_output_sentence, batch_out_sentence) - self.assertListEqual(expected_output_sentence, - [non_padded_sentence, padded_sentence]) + self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence]) class UnifiedTransformerModelLanguageGenerationTest(unittest.TestCase): - def _test_lm_generate_unified_transformer_helper( self, verify_outputs=True, @@ -600,22 +546,18 @@ def test_unified_transformer_sample(self): sequence = ["今天天气真好!"] - tokenized = tokenizer.dialogue_encode(history=sequence, - add_start_token_as_response=True) + tokenized = tokenizer.dialogue_encode(history=sequence, add_start_token_as_response=True) output_ids, _ = model.generate( - paddle.to_tensor(tokenized["input_ids"], - dtype="int64").reshape([1, -1]), - 
position_ids=paddle.to_tensor(tokenized["position_ids"], - dtype="int64").reshape([1, -1]), - token_type_ids=paddle.to_tensor(tokenized["token_type_ids"], - dtype="int64").reshape([1, -1]), - attention_mask=paddle.to_tensor(tokenized["attention_mask"], - dtype="float32").unsqueeze([0, 1]), + paddle.to_tensor(tokenized["input_ids"], dtype="int64").reshape([1, -1]), + position_ids=paddle.to_tensor(tokenized["position_ids"], dtype="int64").reshape([1, -1]), + token_type_ids=paddle.to_tensor(tokenized["token_type_ids"], dtype="int64").reshape([1, -1]), + attention_mask=paddle.to_tensor(tokenized["attention_mask"], dtype="float32").unsqueeze([0, 1]), decode_strategy="sampling", - top_k=1) + top_k=1, + ) output_str = postprocess_response(output_ids[0].numpy(), tokenizer) - EXPECTED_OUTPUT_STR = ("你 在 做 什么 呢 ?") + EXPECTED_OUTPUT_STR = "你 在 做 什么 呢 ?" self.assertEqual(output_str, EXPECTED_OUTPUT_STR) def test_generate_without_input_ids(self): diff --git a/tests/transformers/unimo/test_modeling.py b/tests/transformers/unimo/test_modeling.py index 4378ab272c1f..b8bbd06f9910 100644 --- a/tests/transformers/unimo/test_modeling.py +++ b/tests/transformers/unimo/test_modeling.py @@ -12,28 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. -import datetime -import math +import random import unittest + import numpy as np -import random +import paddle +import paddle.nn as nn from parameterized import parameterized_class +from paddlenlp.data import Pad +from paddlenlp.transformers import UNIMOLMHeadModel, UNIMOModel, UNIMOTokenizer from tests.testing_utils import slow from ..test_generation_utils import GenerationTesterMixin -from ..test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask - -import paddle -import paddle.nn as nn -from paddlenlp.transformers import ( - UNIMOModel, - UNIMOLMHeadModel, - UNIMOForMaskedLM, - UNIMOTokenizer, -) -from paddlenlp.data import Pad -from paddlenlp.data import DataCollatorWithPadding +from ..test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask UNIMO_PRETRAINED_MODEL_ARCHIVE_LIST = [ "unimo-text-1.0", @@ -43,37 +35,31 @@ def batchify_fn(batch_examples, pad_val): - def pad_mask(batch_attention_mask): batch_size = len(batch_attention_mask) max_len = max(map(len, batch_attention_mask)) - attention_mask = np.ones( - (batch_size, max_len, max_len), dtype='float32') * -1e4 + attention_mask = np.ones((batch_size, max_len, max_len), dtype="float32") * -1e4 for i, mask_data in enumerate(attention_mask): seq_len = len(batch_attention_mask[i]) - mask_data[-seq_len:, -seq_len:] = np.array(batch_attention_mask[i], - dtype='float32') + mask_data[-seq_len:, -seq_len:] = np.array(batch_attention_mask[i], dtype="float32") # In order to ensure the correct broadcasting mechanism, expand one # dimension to the second dimension (n_head of Transformer). 
attention_mask = np.expand_dims(attention_mask, axis=1) return attention_mask - pad_func = Pad(pad_val=pad_val, pad_right=False, dtype='int64') + pad_func = Pad(pad_val=pad_val, pad_right=False, dtype="int64") - input_ids = pad_func([example['input_ids'] for example in batch_examples]) - token_type_ids = pad_func( - [example['token_type_ids'] for example in batch_examples]) - position_ids = pad_func( - [example['position_ids'] for example in batch_examples]) + input_ids = pad_func([example["input_ids"] for example in batch_examples]) + token_type_ids = pad_func([example["token_type_ids"] for example in batch_examples]) + position_ids = pad_func([example["position_ids"] for example in batch_examples]) - attention_mask = pad_mask( - [example['attention_mask'] for example in batch_examples]) + attention_mask = pad_mask([example["attention_mask"] for example in batch_examples]) return { "input_ids": paddle.to_tensor(input_ids, dtype="int64"), "token_type_ids": paddle.to_tensor(token_type_ids, dtype="int64"), "position_ids": paddle.to_tensor(position_ids, dtype="int64"), - "attention_mask": paddle.to_tensor(attention_mask, dtype="float32") + "attention_mask": paddle.to_tensor(attention_mask, dtype="float32"), } @@ -91,29 +77,30 @@ def postprocess_response(token_ids, tokenizer): class UNIMOModelTester: - - def __init__(self, - parent, - is_training=True, - batch_size=14, - seq_length=7, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - normalize_before=True, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - unk_token_id=0, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - mask_token_id=3): + def __init__( + self, + parent, + is_training=True, + batch_size=14, + seq_length=7, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + normalize_before=True, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + unk_token_id=0, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + mask_token_id=3, + ): self.parent = parent self.is_training = is_training self.batch_size = batch_size @@ -137,27 +124,20 @@ def __init__(self, self.mask_token_id = mask_token_id def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], - self.vocab_size, - dtype="int64") - input_mask = random_attention_mask([self.batch_size, self.seq_length], - dtype="int64").unsqueeze([1, 2]) - token_type_ids = ids_tensor([self.batch_size, self.seq_length], - self.type_vocab_size, - dtype="int64") + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size, dtype="int64") + input_mask = random_attention_mask([self.batch_size, self.seq_length], dtype="int64").unsqueeze([1, 2]) + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size, dtype="int64") position_ids = paddle.tile( - paddle.arange(end=self.seq_length, dtype="int64").reshape([1, -1]), - [self.batch_size, 1]) + paddle.arange(end=self.seq_length, dtype="int64").reshape([1, -1]), [self.batch_size, 1] + ) config = self.get_config() lm_labels = None if self.parent.use_labels: - lm_labels = ids_tensor([self.batch_size, self.seq_length], - self.vocab_size) + lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - return (config, input_ids, 
input_mask, token_type_ids, position_ids, - lm_labels) + return (config, input_ids, input_mask, token_type_ids, position_ids, lm_labels) def get_config(self): return { @@ -181,79 +161,73 @@ def get_config(self): } def prepare_config_and_inputs_for_decoder(self): - (config, input_ids, input_mask, token_type_ids, position_ids, - lm_labels) = self.prepare_config_and_inputs() - return (config, input_ids, input_mask, token_type_ids, position_ids, - lm_labels) + (config, input_ids, input_mask, token_type_ids, position_ids, lm_labels) = self.prepare_config_and_inputs() + return (config, input_ids, input_mask, token_type_ids, position_ids, lm_labels) - def create_and_check_unimo_model(self, config, input_ids, input_mask, - token_type_ids, position_ids, *args): + def create_and_check_unimo_model(self, config, input_ids, input_mask, token_type_ids, position_ids, *args): model = UNIMOModel(**config) model.eval() - result, cache = model(input_ids, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=input_mask, - use_cache=True, - return_dict=self.parent.return_dict)[:2] + result, cache = model( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=input_mask, + use_cache=True, + return_dict=self.parent.return_dict, + )[:2] - self.parent.assertEqual( - result.shape, [self.batch_size, self.seq_length, self.hidden_size]) + self.parent.assertEqual(result.shape, [self.batch_size, self.seq_length, self.hidden_size]) self.parent.assertEqual(len(cache), config["num_hidden_layers"]) - def create_and_check_unimo_model_past(self, config, input_ids, input_mask, - token_type_ids, position_ids, *args): + def create_and_check_unimo_model_past(self, config, input_ids, input_mask, token_type_ids, position_ids, *args): model = UNIMOModel(**config) model.eval() # first forward pass - outputs = model(input_ids, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=input_mask, - use_cache=True, - return_dict=self.parent.return_dict) - outputs_use_cache_conf = model(input_ids, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=input_mask, - return_dict=self.parent.return_dict) - outputs_no_past = model(input_ids, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=input_mask, - use_cache=False, - return_dict=self.parent.return_dict) - - self.parent.assertTrue( - len(outputs_no_past) == len(outputs_use_cache_conf)) + outputs = model( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=input_mask, + use_cache=True, + return_dict=self.parent.return_dict, + ) + outputs_use_cache_conf = model( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=input_mask, + return_dict=self.parent.return_dict, + ) + outputs_no_past = model( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=input_mask, + use_cache=False, + return_dict=self.parent.return_dict, + ) + + self.parent.assertTrue(len(outputs_no_past) == len(outputs_use_cache_conf)) output, past = outputs[:2] # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), - config["vocab_size"], - dtype="int64") - next_token_types = ids_tensor([self.batch_size, 1], - self.type_vocab_size, - dtype="int64") + next_tokens = ids_tensor((self.batch_size, 1), config["vocab_size"], dtype="int64") + next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size, 
dtype="int64") next_position = position_ids[:, -1:] + 1 # append to next input_ids and token_type_ids next_input_ids = paddle.concat([input_ids, next_tokens], axis=-1) - next_token_type_ids = paddle.concat([token_type_ids, next_token_types], - axis=-1) - next_position_ids = paddle.concat([position_ids, next_position], - axis=-1) + next_token_type_ids = paddle.concat([token_type_ids, next_token_types], axis=-1) + next_position_ids = paddle.concat([position_ids, next_position], axis=-1) input_mask_t = paddle.transpose(input_mask, perm=[0, 1, 3, 2]) input_mask = input_mask * input_mask_t - next_attention_mask = nn.Pad2D([0, 0, 0, 1], - mode='replicate')(input_mask) - next_attention_mask = nn.Pad2D([0, 1, 0, 0], - value=0)(next_attention_mask) + next_attention_mask = nn.Pad2D([0, 0, 0, 1], mode="replicate")(input_mask) + next_attention_mask = nn.Pad2D([0, 1, 0, 0], value=0)(next_attention_mask) next_attention_mask[:, :, -1, -1] = 1 output_from_no_past, cache = model( @@ -262,70 +236,57 @@ def create_and_check_unimo_model_past(self, config, input_ids, input_mask, position_ids=next_position_ids, attention_mask=next_attention_mask, use_cache=True, - return_dict=self.parent.return_dict)[:2] - output_from_past = model(next_tokens, - token_type_ids=next_token_types, - position_ids=next_position, - attention_mask=next_attention_mask[:, :, - -1:, :], - use_cache=True, - cache=past, - return_dict=self.parent.return_dict)[0] + return_dict=self.parent.return_dict, + )[:2] + output_from_past = model( + next_tokens, + token_type_ids=next_token_types, + position_ids=next_position, + attention_mask=next_attention_mask[:, :, -1:, :], + use_cache=True, + cache=past, + return_dict=self.parent.return_dict, + )[0] # select random slice - random_slice_idx = ids_tensor((1, ), - output_from_past.shape[-1], - dtype="int64").item() - output_from_no_past_slice = output_from_no_past[:, -1, - random_slice_idx].detach( - ) - output_from_past_slice = output_from_past[:, 0, - random_slice_idx].detach() + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1], dtype="int64").item() + output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() # test that outputs are equal for slice - self.parent.assertTrue( - paddle.allclose(output_from_past_slice, - output_from_no_past_slice, - atol=1e-3)) - - def create_and_check_unimo_model_past_large_inputs(self, config, input_ids, - input_mask, - token_type_ids, - position_ids, *args): + self.parent.assertTrue(paddle.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_unimo_model_past_large_inputs( + self, config, input_ids, input_mask, token_type_ids, position_ids, *args + ): model = UNIMOModel(**config) model.eval() # first forward pass - output, past = model(input_ids, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=input_mask, - use_cache=True, - return_dict=self.parent.return_dict)[:2] + output, past = model( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=input_mask, + use_cache=True, + return_dict=self.parent.return_dict, + )[:2] # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), - config["vocab_size"], - dtype="int64") - next_token_types = ids_tensor([self.batch_size, 3], - self.type_vocab_size, - dtype="int64") + next_tokens = ids_tensor((self.batch_size, 3), config["vocab_size"], dtype="int64") 
+ next_token_types = ids_tensor([self.batch_size, 3], self.type_vocab_size, dtype="int64") next_position = position_ids[:, -3:] + 3 # append to next input_ids and token_type_ids next_input_ids = paddle.concat([input_ids, next_tokens], axis=-1) - next_token_type_ids = paddle.concat([token_type_ids, next_token_types], - axis=-1) - next_position_ids = paddle.concat([position_ids, next_position], - axis=-1) + next_token_type_ids = paddle.concat([token_type_ids, next_token_types], axis=-1) + next_position_ids = paddle.concat([position_ids, next_position], axis=-1) input_mask_t = paddle.transpose(input_mask, perm=[0, 1, 3, 2]) input_mask = input_mask * input_mask_t - next_attention_mask = nn.Pad2D([0, 0, 0, 3], - mode='replicate')(input_mask) - next_attention_mask = nn.Pad2D([0, 3, 0, 0], - value=0)(next_attention_mask) + next_attention_mask = nn.Pad2D([0, 0, 0, 3], mode="replicate")(input_mask) + next_attention_mask = nn.Pad2D([0, 3, 0, 0], value=0)(next_attention_mask) next_attention_mask[:, :, -1, -1] = 1 next_attention_mask[:, :, -2, -2] = 1 next_attention_mask[:, :, -3, -3] = 1 @@ -341,8 +302,7 @@ def create_and_check_unimo_model_past_large_inputs(self, config, input_ids, use_cache=False, return_dict=self.parent.return_dict, ) - output_from_no_past = output_from_no_past[ - 0] if self.parent.return_dict else output_from_no_past + output_from_no_past = output_from_no_past[0] if self.parent.return_dict else output_from_no_past output_from_past = model( next_tokens, token_type_ids=next_token_types, @@ -352,90 +312,84 @@ def create_and_check_unimo_model_past_large_inputs(self, config, input_ids, use_cache=True, return_dict=self.parent.return_dict, )[0] - self.parent.assertTrue( - output_from_past.shape[1] == next_tokens.shape[1]) + self.parent.assertTrue(output_from_past.shape[1] == next_tokens.shape[1]) # select random slice - random_slice_idx = ids_tensor((1, ), - output_from_past.shape[-1], - dtype="int64").item() - output_from_no_past_slice = output_from_no_past[:, -3:, - random_slice_idx].detach( - ) - output_from_past_slice = output_from_past[:, :, - random_slice_idx].detach() + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1], dtype="int64").item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() # test that outputs are equal for slice - self.parent.assertTrue( - paddle.allclose(output_from_past_slice, - output_from_no_past_slice, - atol=1e-3)) - - def create_and_check_lm_head_model(self, config, input_ids, input_mask, - token_type_ids, position_ids, lm_labels, - *args): + self.parent.assertTrue(paddle.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_lm_head_model( + self, config, input_ids, input_mask, token_type_ids, position_ids, lm_labels, *args + ): base_model = UNIMOModel(**config) model = UNIMOLMHeadModel(base_model) model.eval() - outputs = model(input_ids, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=input_mask, - labels=lm_labels, - return_dict=self.parent.return_dict) + outputs = model( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=input_mask, + labels=lm_labels, + return_dict=self.parent.return_dict, + ) if self.parent.use_labels: loss, result = outputs[:2] self.parent.assertIsInstance(loss.item(), float) else: result = outputs[0] if self.parent.return_dict else outputs - self.parent.assertEqual( - result.shape, 
[self.batch_size, self.seq_length, self.vocab_size]) + self.parent.assertEqual(result.shape, [self.batch_size, self.seq_length, self.vocab_size]) - def create_and_check_forward_and_backwards(self, config, input_ids, - input_mask, token_type_ids, - position_ids, *args): + def create_and_check_forward_and_backwards( + self, config, input_ids, input_mask, token_type_ids, position_ids, *args + ): base_model = UNIMOModel(**config) model = UNIMOLMHeadModel(base_model) - outputs = model(input_ids, - token_type_ids=token_type_ids, - attention_mask=input_mask, - position_ids=position_ids, - labels=input_ids, - return_dict=self.parent.return_dict) + outputs = model( + input_ids, + token_type_ids=token_type_ids, + attention_mask=input_mask, + position_ids=position_ids, + labels=input_ids, + return_dict=self.parent.return_dict, + ) loss, result = outputs[:2] self.parent.assertIsInstance(loss.item(), float) - self.parent.assertEqual( - result.shape, [self.batch_size, self.seq_length, self.vocab_size]) + self.parent.assertEqual(result.shape, [self.batch_size, self.seq_length, self.vocab_size]) loss.backward() def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, input_mask, token_type_ids, position_ids, - lm_labels) = config_and_inputs + (config, input_ids, input_mask, token_type_ids, position_ids, lm_labels) = config_and_inputs inputs_dict = { "input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask, - "position_ids": position_ids + "position_ids": position_ids, } return config, inputs_dict -@parameterized_class(("return_dict", "use_labels"), [ - [False, False], - [False, True], - [True, False], - [True, True], -]) -class UNIMOModelTest(ModelTesterMixin, GenerationTesterMixin, - unittest.TestCase): +@parameterized_class( + ("return_dict", "use_labels"), + [ + [False, False], + [False, True], + [True, False], + [True, True], + ], +) +class UNIMOModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): base_model_class = UNIMOModel all_model_classes = (UNIMOModel, UNIMOLMHeadModel) @@ -468,8 +422,7 @@ def test_unimo_model_past(self): def test_unimo_model_past_large_inputs(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_unimo_model_past_large_inputs( - *config_and_inputs) + self.model_tester.create_and_check_unimo_model_past_large_inputs(*config_and_inputs) def test_unimo_lm_head_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() @@ -485,16 +438,12 @@ def test_batch_generation(self): # use different length sentences to test batching sentences = [ - [ - "深度学习是人工智能的核心技术领域。百度飞桨作为中国首个自主研发、功能丰富、开源开放的产业级深度学习平台,将从多层次技术产品、产业AI人才培养和强大的生态资源支持三方面全面护航企业实现快速AI转型升级。" - ], + ["深度学习是人工智能的核心技术领域。百度飞桨作为中国首个自主研发、功能丰富、开源开放的产业级深度学习平台,将从多层次技术产品、产业AI人才培养和强大的生态资源支持三方面全面护航企业实现快速AI转型升级。"], ["深度学习是人工智能的核心技术领域。百度飞桨很厉害。"], ] inputs = [] for seq in sentences: - inputs.append( - tokenizer.gen_encode(source=seq[0], - add_start_token_for_decoding=True)) + inputs.append(tokenizer.gen_encode(source=seq[0], add_start_token_for_decoding=True)) data = batchify_fn(inputs, tokenizer.pad_token_id) @@ -503,44 +452,36 @@ def test_batch_generation(self): token_type_ids = data["token_type_ids"] attention_mask = data["attention_mask"] - outputs, _ = model.generate(input_ids=input_ids, - position_ids=position_ids, - token_type_ids=token_type_ids, - attention_mask=attention_mask, - decode_strategy="greedy_search") + outputs, _ = model.generate( + 
input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + decode_strategy="greedy_search", + ) - data_non_padded = tokenizer.gen_encode( - sentences[0][0], add_start_token_for_decoding=True) + data_non_padded = tokenizer.gen_encode(sentences[0][0], add_start_token_for_decoding=True) output_non_padded, _ = model.generate( - input_ids=paddle.to_tensor(data_non_padded["input_ids"], - dtype="int64").reshape([1, -1]), - position_ids=paddle.to_tensor(data_non_padded["position_ids"], - dtype="int64").reshape([1, -1]), - token_type_ids=paddle.to_tensor(data_non_padded["token_type_ids"], - dtype="int64").reshape([1, -1]), - attention_mask=paddle.to_tensor(data_non_padded["attention_mask"], - dtype="float32").unsqueeze([0, 1]), - decode_strategy="greedy_search") - - data_padded = tokenizer.gen_encode(sentences[1][0], - add_start_token_for_decoding=True) + input_ids=paddle.to_tensor(data_non_padded["input_ids"], dtype="int64").reshape([1, -1]), + position_ids=paddle.to_tensor(data_non_padded["position_ids"], dtype="int64").reshape([1, -1]), + token_type_ids=paddle.to_tensor(data_non_padded["token_type_ids"], dtype="int64").reshape([1, -1]), + attention_mask=paddle.to_tensor(data_non_padded["attention_mask"], dtype="float32").unsqueeze([0, 1]), + decode_strategy="greedy_search", + ) + + data_padded = tokenizer.gen_encode(sentences[1][0], add_start_token_for_decoding=True) output_padded, _ = model.generate( - input_ids=paddle.to_tensor(data_padded["input_ids"], - dtype="int64").reshape([1, -1]), - position_ids=paddle.to_tensor(data_padded["position_ids"], - dtype="int64").reshape([1, -1]), - token_type_ids=paddle.to_tensor(data_padded["token_type_ids"], - dtype="int64").reshape([1, -1]), - attention_mask=paddle.to_tensor(data_padded["attention_mask"], - dtype="float32").unsqueeze([0, 1]), - decode_strategy="greedy_search") + input_ids=paddle.to_tensor(data_padded["input_ids"], dtype="int64").reshape([1, -1]), + position_ids=paddle.to_tensor(data_padded["position_ids"], dtype="int64").reshape([1, -1]), + token_type_ids=paddle.to_tensor(data_padded["token_type_ids"], dtype="int64").reshape([1, -1]), + attention_mask=paddle.to_tensor(data_padded["attention_mask"], dtype="float32").unsqueeze([0, 1]), + decode_strategy="greedy_search", + ) batch_out_sentence = [] for i in range(len(outputs)): - batch_out_sentence.append( - postprocess_response(outputs[i].numpy(), tokenizer)) - non_padded_sentence = postprocess_response(output_non_padded[0], - tokenizer) + batch_out_sentence.append(postprocess_response(outputs[i].numpy(), tokenizer)) + non_padded_sentence = postprocess_response(output_non_padded[0], tokenizer) padded_sentence = postprocess_response(output_padded[0], tokenizer) expected_output_sentence = [ @@ -548,12 +489,10 @@ def test_batch_generation(self): "百 度 飞 桨 : 人 工 智 能 的 核 心 技 术", ] self.assertListEqual(expected_output_sentence, batch_out_sentence) - self.assertListEqual(expected_output_sentence, - [non_padded_sentence, padded_sentence]) + self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence]) class UNIMOModelLanguageGenerationTest(unittest.TestCase): - def _test_lm_generate_unimo_helper( self, verify_outputs=True, @@ -591,24 +530,20 @@ def test_unimo_sample(self): "深度学习是人工智能的核心技术领域。百度飞桨作为中国首个自主研发、功能丰富、开源开放的产业级深度学习平台,将从多层次技术产品、产业AI人才培养和强大的生态资源支持三方面全面护航企业实现快速AI转型升级。" ] - tokenized = tokenizer.gen_encode(source=sequence[0], - add_start_token_for_decoding=True) + tokenized = 
tokenizer.gen_encode(source=sequence[0], add_start_token_for_decoding=True) output_ids, _ = model.generate( - paddle.to_tensor(tokenized["input_ids"], - dtype="int64").reshape([1, -1]), - position_ids=paddle.to_tensor(tokenized["position_ids"], - dtype="int64").reshape([1, -1]), - token_type_ids=paddle.to_tensor(tokenized["token_type_ids"], - dtype="int64").reshape([1, -1]), - attention_mask=paddle.to_tensor(tokenized["attention_mask"], - dtype="float32").unsqueeze([0, 1]), + paddle.to_tensor(tokenized["input_ids"], dtype="int64").reshape([1, -1]), + position_ids=paddle.to_tensor(tokenized["position_ids"], dtype="int64").reshape([1, -1]), + token_type_ids=paddle.to_tensor(tokenized["token_type_ids"], dtype="int64").reshape([1, -1]), + attention_mask=paddle.to_tensor(tokenized["attention_mask"], dtype="float32").unsqueeze([0, 1]), decode_strategy="sampling", - top_k=1) + top_k=1, + ) output_str = postprocess_response(output_ids[0].numpy(), tokenizer) print(output_str) - EXPECTED_OUTPUT_STR = ("百 度 飞 桨 : 深 度 学 习 助 力 企 业 转 型 升 级") + EXPECTED_OUTPUT_STR = "百 度 飞 桨 : 深 度 学 习 助 力 企 业 转 型 升 级" self.assertEqual(output_str, EXPECTED_OUTPUT_STR) def test_generate_without_input_ids(self): From 8b5766f8b67b049c8bf75d62eb0f797786335ad7 Mon Sep 17 00:00:00 2001 From: Yam0214 Date: Tue, 6 Dec 2022 08:48:25 +0000 Subject: [PATCH 09/10] change tensor.shape to paddle.shape(tensor) --- paddlenlp/transformers/bart/modeling.py | 10 +++++----- paddlenlp/transformers/mbart/modeling.py | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/paddlenlp/transformers/bart/modeling.py b/paddlenlp/transformers/bart/modeling.py index d0fb8d796e61..ce61c2947b05 100644 --- a/paddlenlp/transformers/bart/modeling.py +++ b/paddlenlp/transformers/bart/modeling.py @@ -233,10 +233,10 @@ def forward( if input_ids is None and inputs_embeds is None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: - inputs_shape = input_ids.shape + inputs_shape = paddle.shape(input_ids) input_ids = input_ids.reshape((-1, inputs_shape[-1])) elif inputs_embeds is not None: - inputs_shape = inputs_embeds.shape[:-1] + inputs_shape = paddle.shape(inputs_embeds)[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") @@ -361,10 +361,10 @@ def forward( if decoder_input_ids is not None and decoder_inputs_embeds is not None: raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") elif decoder_input_ids is not None: - inputs_shape = decoder_input_ids.shape + inputs_shape = paddle.shape(decoder_input_ids) decoder_input_ids = decoder_input_ids.reshape((-1, inputs_shape[-1])) elif decoder_inputs_embeds is not None: - inputs_shape = decoder_inputs_embeds.shape[:-1] + inputs_shape = paddle.shape(decoder_inputs_embeds)[:-1] else: raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") @@ -1050,7 +1050,7 @@ def forward( if start_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.shape[1] + ignored_index = paddle.shape(start_logits)[1] start_positions = start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) diff --git a/paddlenlp/transformers/mbart/modeling.py b/paddlenlp/transformers/mbart/modeling.py index 6ffb618e26cf..c7f00548b148 100644 --- a/paddlenlp/transformers/mbart/modeling.py +++ 
b/paddlenlp/transformers/mbart/modeling.py
@@ -298,9 +298,9 @@ def forward(
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
         elif input_ids is not None:
-            input_shape = input_ids.shape
+            input_shape = paddle.shape(input_ids)
         elif inputs_embeds is not None:
-            input_shape = inputs_embeds.shape[:-1]
+            input_shape = paddle.shape(inputs_embeds)[:-1]
         else:
             raise ValueError("You have to specify either input_ids or inputs_embeds")
 
@@ -427,10 +427,10 @@ def forward(
         if decoder_input_ids is not None and decoder_inputs_embeds is not None:
             raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
         elif decoder_input_ids is not None:
-            decoder_input_shape = decoder_input_ids.shape
+            decoder_input_shape = paddle.shape(decoder_input_ids)
             decoder_input_ids = decoder_input_ids.reshape((-1, decoder_input_shape[-1]))
         elif decoder_inputs_embeds is not None:
-            decoder_input_shape = decoder_inputs_embeds.shape[:-1]
+            decoder_input_shape = paddle.shape(decoder_inputs_embeds)[:-1]
         else:
             raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
 
@@ -1114,7 +1114,7 @@ def forward(
             if start_positions.ndim > 1:
                 end_positions = end_positions.squeeze(-1)
             # sometimes the start/end positions are outside our model inputs, we ignore these terms
-            ignored_index = start_logits.shape[1]
+            ignored_index = paddle.shape(start_logits)[1]
             start_positions = start_positions.clip(0, ignored_index)
             end_positions = end_positions.clip(0, ignored_index)
 

From 7ed06dff3c4ec8ab0b3a4cffe310325f7436ec01 Mon Sep 17 00:00:00 2001
From: Yam0214
Date: Thu, 8 Dec 2022 07:36:17 +0000
Subject: [PATCH 10/10] fix documents and change expand_as to expand

---
 paddlenlp/transformers/bart/modeling.py      |  8 +++---
 paddlenlp/transformers/codegen/modeling.py   |  8 ++++--
 paddlenlp/transformers/mbart/modeling.py     | 11 ++++----
 .../unified_transformer/modeling.py          | 28 ++++++++-----------
 paddlenlp/transformers/unimo/modeling.py     | 28 ++++++++-----------
 5 files changed, 38 insertions(+), 45 deletions(-)

diff --git a/paddlenlp/transformers/bart/modeling.py b/paddlenlp/transformers/bart/modeling.py
index ce61c2947b05..3112487b3658 100644
--- a/paddlenlp/transformers/bart/modeling.py
+++ b/paddlenlp/transformers/bart/modeling.py
@@ -557,7 +557,7 @@ def forward(
         The BartModel forward method, overrides the `__call__()` special method.
 
         Args:
-            input_ids (Tensor):
+            input_ids (Tensor, optional):
                 Indices of input sequence tokens in the vocabulary. They are
                 numerical representations of tokens that build the input sequence.
                 Its data type should be `int64` and it has a shape of [batch_size, sequence_length].
@@ -792,7 +792,7 @@ def forward(
         The BartForSequenceClassification forward method, overrides the __call__() special method.
 
         Args:
-            input_ids (Tensor):
+            input_ids (Tensor, optional):
                 See :class:`BartModel`.
             attention_mask (Tensor, optional):
                 See :class:`BartModel`.
@@ -950,7 +950,7 @@ def forward(
         The BartForQuestionAnswering forward method, overrides the __call__() special method.
 
         Args:
-            input_ids (Tensor):
+            input_ids (Tensor, optional):
                 See :class:`BartModel`.
             attention_mask (Tensor, optional):
                 See :class:`BartModel`.
@@ -1152,7 +1152,7 @@ def forward(
         The BartForConditionalGeneration forward method, overrides the __call__() special method.
 
         Args:
-            input_ids (Tensor):
+            input_ids (Tensor, optional):
                 See :class:`BartModel`.
             attention_mask (Tensor, optional):
                 See :class:`BartModel`.
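
Note: the tensor.shape -> paddle.shape(tensor) rewrite in PATCH 09 above matters when the model is
traced for static-graph export, where a Python-level .shape can carry -1 placeholders for dims that
are only known at run time, while paddle.shape(...) yields a runtime shape tensor. A minimal sketch
of the difference (illustrative only, not part of this patch; the helper name is made up):

    import paddle

    def default_position_ids(input_ids):
        # paddle.shape(...) returns a runtime shape tensor, so the sequence length is
        # resolved at execution time even if the graph was traced with a dynamic dim;
        # input_ids.shape would instead bake a Python int (or -1 placeholder) into the graph.
        seq_len = paddle.shape(input_ids)[-1]
        return paddle.arange(end=seq_len, dtype="int64")

    print(default_position_ids(paddle.randint(0, 100, shape=[2, 7])))  # Tensor([0, 1, ..., 6])
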
diff --git a/paddlenlp/transformers/codegen/modeling.py b/paddlenlp/transformers/codegen/modeling.py index 547119bd7394..8fd71ecb3bc8 100644 --- a/paddlenlp/transformers/codegen/modeling.py +++ b/paddlenlp/transformers/codegen/modeling.py @@ -424,7 +424,7 @@ def forward( r""" The CodeGenModel forward method, overrides the `__call__()` special method. Args: - input_ids (Tensor): + input_ids (Tensor, optional): Indices of input sequence tokens in the vocabulary. They are numerical representations of tokens that build the input sequence. Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. @@ -512,7 +512,9 @@ def forward( * -1e4 ) else: - logger.warning("provided inputs_embeds without attention_mask") + logger.warning( + "Provided inputs_embeds while attention_mask is None, attention weights will not be masked during forwarding." + ) # For 2D attention_mask from tokenizer elif attention_mask.ndim == 2: attention_mask = paddle.unsqueeze(attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype()) @@ -665,7 +667,7 @@ def forward( r""" The CodeGenForCausalLM forward method, overrides the __call__() special method. Args: - input_ids (Tensor): + input_ids (Tensor, optional): See :class:`CodeGenModel`. attention_mask (Tensor, optional): See :class:`CodeGenModel`. diff --git a/paddlenlp/transformers/mbart/modeling.py b/paddlenlp/transformers/mbart/modeling.py index c7f00548b148..38e12121fbb9 100644 --- a/paddlenlp/transformers/mbart/modeling.py +++ b/paddlenlp/transformers/mbart/modeling.py @@ -624,7 +624,7 @@ def forward( The MBartModel forward method, overrides the `__call__()` special method. Args: - input_ids (Tensor): + input_ids (Tensor, optional): Indices of input sequence tokens in the vocabulary. They are numerical representations of tokens that build the input sequence. Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. @@ -720,8 +720,7 @@ def forward( ) decoder_input_ids = shift_tokens_right(input_ids, self.pad_token_id) if attention_mask is None and input_ids is not None: - # assert input_ids is not None, "input_ids should be " \ - # "specified when generating attention_mask" + logger.warning("input_ids should be specified when generating attention_mask") attention_mask = ( paddle.cast(input_ids == self.pad_token_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e4 ) @@ -860,7 +859,7 @@ def forward( The MBartForSequenceClassification forward method, overrides the __call__() special method. Args: - input_ids (Tensor): + input_ids (Tensor, optional): See :class:`MBartModel`. attention_mask (Tensor, optional): See :class:`MBartModel`. @@ -1015,7 +1014,7 @@ def forward( The MBartForQuestionAnswering forward method, overrides the __call__() special method. Args: - input_ids (Tensor): + input_ids (Tensor, optional): See :class:`MBartModel`. attention_mask (Tensor, optional): See :class:`MBartModel`. @@ -1208,7 +1207,7 @@ def forward( The MBartForConditionalGeneration forward method, overrides the __call__() special method. Args: - input_ids (Tensor): + input_ids (Tensor, optional): See :class:`MBartModel`. attention_mask (Tensor, optional): See :class:`MBartModel`. 
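
Note: the warnings added above exist because the models can only derive a padding mask from
input_ids; with inputs_embeds alone there are no pad token ids to inspect. A minimal sketch of the
additive mask the encoders build from input_ids (illustrative only; pad id and token values are
made up):

    import paddle

    pad_token_id = 1
    input_ids = paddle.to_tensor([[5, 6, 7, pad_token_id]])

    # 0.0 for real tokens, -1e4 for padding; shape [batch, 1, 1, seq] so it broadcasts over heads
    attention_mask = (
        paddle.cast(input_ids == pad_token_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e4
    )
    print(attention_mask)  # [[[[0., 0., 0., -10000.]]]]
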
diff --git a/paddlenlp/transformers/unified_transformer/modeling.py b/paddlenlp/transformers/unified_transformer/modeling.py index f2b345122376..410c9d0fc320 100644 --- a/paddlenlp/transformers/unified_transformer/modeling.py +++ b/paddlenlp/transformers/unified_transformer/modeling.py @@ -173,9 +173,9 @@ def forward(self, input_ids, token_type_ids=None, position_ids=None, role_ids=No if input_ids is None and input_embeddings is None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: - inputs_sample = input_ids + inputs_shape = paddle.shape(input_ids) elif input_embeddings is not None: - inputs_sample = input_embeddings[:, :, -1] + inputs_shape = paddle.shape(input_embeddings)[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if input_embeddings is None: @@ -183,9 +183,7 @@ def forward(self, input_ids, token_type_ids=None, position_ids=None, role_ids=No if position_ids is None: if self.pad_token_id is None: - position_ids = paddle.expand_as( - paddle.arange(end=paddle.shape(inputs_sample)[1], dtype="int64"), inputs_sample - ) + position_ids = paddle.expand(paddle.arange(end=inputs_shape[1], dtype="int64"), inputs_shape) else: if input_ids is not None: # NOTE: If there is a unk_token_id in input_ids, the following logic is wrong. @@ -193,18 +191,14 @@ def forward(self, input_ids, token_type_ids=None, position_ids=None, role_ids=No # And this is for left padding input_ids. num_pad = paddle.sum((input_ids == self.pad_token_id).astype("float32"), axis=-1, keepdim=True) position_ids = F.relu( - paddle.expand_as( - paddle.arange(end=paddle.shape(inputs_sample)[1], dtype="float32"), inputs_sample - ) - - num_pad + paddle.expand(paddle.arange(end=inputs_shape[1], dtype="int64"), inputs_shape) - num_pad ).astype("int64") else: logger.warning( - "position_ids or pad_token_ids should be provided when input_embeds is specified, otherwise an unexpected result may be returned" - ) - position_ids = paddle.expand_as( - paddle.arange(end=paddle.shape(inputs_sample)[1], dtype="int64"), inputs_sample + "Position_ids or pad_token_ids should be provided when input_embeds is specified, " + "otherwise an unexpected result may be returned since `[0, 1, ..., sequence length - 1]` will be generated as a default position_ids." ) + position_ids = paddle.expand(paddle.arange(end=inputs_shape[1], dtype="int64"), inputs_shape) position_ids.stop_gradient = True position_embeddings = self.position_embeddings(position_ids) @@ -369,7 +363,7 @@ def forward( :meth:`__call__` method. Args: - input_ids (Tensor): + input_ids (Tensor, optional): Indices of input sequence tokens in the vocabulary. They are numerical representations of tokens that build the input sequence. It's data type should be `int64` and has a shape of @@ -460,7 +454,9 @@ def forward( (input_ids == self.pad_token_id).astype(paddle.get_default_dtype()) * -1e4 ).unsqueeze([1, 2]) else: - logger.warning("provided inputs_embeds without attention_mask") + logger.warning( + "Provided inputs_embeds while attention_mask is None, attention weights will not be masked during forwarding." + ) if attention_mask is not None: attention_mask.stop_gradient = True @@ -547,7 +543,7 @@ def forward( :meth:`__call__` method. Args: - input_ids (Tensor): + input_ids (Tensor, optional): See :class:`UnifiedTransformerModel`. token_type_ids (Tensor): See :class:`UnifiedTransformerModel`. 
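
Note: the relu(arange - num_pad) pattern kept above assigns position 0 to every left-padding slot
and restarts positions at the first real token. A toy check (illustrative only; pad id and token
values are made up, and float arithmetic is used before the final cast):

    import paddle
    import paddle.nn.functional as F

    pad_token_id = 0
    # two left-padded sequences: [PAD, PAD, 5, 6] and [7, 8, 9, 10]
    input_ids = paddle.to_tensor([[0, 0, 5, 6], [7, 8, 9, 10]])
    inputs_shape = paddle.shape(input_ids)

    num_pad = paddle.sum((input_ids == pad_token_id).astype("float32"), axis=-1, keepdim=True)
    position_ids = F.relu(
        paddle.expand(paddle.arange(end=inputs_shape[1], dtype="float32"), inputs_shape) - num_pad
    ).astype("int64")
    print(position_ids)  # [[0, 0, 0, 1], [0, 1, 2, 3]]
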
diff --git a/paddlenlp/transformers/unimo/modeling.py b/paddlenlp/transformers/unimo/modeling.py
index 2c55dc4a4f58..02cefa832369 100644
--- a/paddlenlp/transformers/unimo/modeling.py
+++ b/paddlenlp/transformers/unimo/modeling.py
@@ -246,9 +246,9 @@ def forward(self, input_ids=None, token_type_ids=None, position_ids=None, input_
         if input_ids is None and input_embeddings is None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
         elif input_ids is not None:
-            inputs_sample = input_ids
+            inputs_shape = paddle.shape(input_ids)
         elif input_embeddings is not None:
-            inputs_sample = input_embeddings[:, :, -1]
+            inputs_shape = paddle.shape(input_embeddings)[:-1]
         else:
             raise ValueError("You have to specify either input_ids or inputs_embeds")
         if input_embeddings is None:
@@ -256,25 +256,19 @@ def forward(self, input_ids=None, token_type_ids=None, position_ids=None, input_
 
         if position_ids is None:
             if self.pad_token_id is None:
-                position_ids = paddle.expand_as(
-                    paddle.arange(end=paddle.shape(inputs_sample)[1], dtype="int64"), inputs_sample
-                )
+                position_ids = paddle.expand(paddle.arange(end=inputs_shape[1], dtype="int64"), inputs_shape)
             else:
                 if input_ids is not None:
                     num_pad = paddle.sum((input_ids == self.pad_token_id).astype("float32"), axis=-1, keepdim=True)
                     position_ids = F.relu(
-                        paddle.expand_as(
-                            paddle.arange(end=paddle.shape(inputs_sample)[1], dtype="float32"), inputs_sample
-                        )
-                        - num_pad
+                        paddle.expand(paddle.arange(end=inputs_shape[1], dtype="int64"), inputs_shape) - num_pad
                     ).astype("int64")
                 else:
                     logger.warning(
-                        "position_ids or pad_token_ids should be provided when input_embeds is specified, otherwise an unexpected result may be returned"
-                    )
-                    position_ids = paddle.expand_as(
-                        paddle.arange(end=paddle.shape(inputs_sample)[1], dtype="int64"), inputs_sample
+                        "Position_ids or pad_token_ids should be provided when input_embeds is specified, "
+                        "otherwise an unexpected result may be returned since `[0, 1, ..., sequence length - 1]` will be generated as a default position_ids."
                     )
+                    position_ids = paddle.expand(paddle.arange(end=inputs_shape[1], dtype="int64"), inputs_shape)
         position_ids.stop_gradient = True
         position_embeddings = self.position_embeddings(position_ids)
 
@@ -437,7 +431,7 @@ def forward(
         The UNIMOModel forward method, overrides the special :meth:`__call__` method.
 
         Args:
-            input_ids (Tensor):
+            input_ids (Tensor, optional):
                 Indices of input sequence tokens in the vocabulary. They are
                 numerical representations of tokens that build the input sequence.
                 It's data type should be `int64` and has a shape of [batch_size, sequence_length].
@@ -518,7 +512,9 @@ def forward(
                     (input_ids == self.pad_token_id).astype(paddle.get_default_dtype()) * -1e4
                 ).unsqueeze([1, 2])
             else:
-                logger.warning("provided inputs_embeds without attention_mask")
+                logger.warning(
+                    "Provided inputs_embeds while attention_mask is None, attention weights will not be masked during forwarding."
+                )
 
         if attention_mask is not None:
             attention_mask.stop_gradient = True
@@ -606,7 +602,7 @@ def forward(
        :meth:`__call__` method.
 
         Args:
-            input_ids (Tensor):
+            input_ids (Tensor, optional):
                 See :class:`UNIMOModel`.
             token_type_ids (Tensor):
                 See :class:`UNIMOModel`.
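
Note: PATCH 10 switches from paddle.expand_as to paddle.expand because the embedding code now
carries a shape (from paddle.shape) rather than a sample tensor; expand takes a target shape while
expand_as takes a target tensor whose shape is copied. A minimal sketch of the two calls
(illustrative only, not part of this patch):

    import paddle

    row = paddle.arange(end=4, dtype="int64")        # shape [4]
    template = paddle.zeros([2, 4], dtype="int64")   # a tensor with the target shape

    tiled_by_shape = paddle.expand(row, paddle.shape(template))  # expand takes a shape (list/tuple/Tensor)
    tiled_by_tensor = paddle.expand_as(row, template)            # expand_as takes a tensor, not a shape

    print(tiled_by_shape.shape, tiled_by_tensor.shape)  # [2, 4] [2, 4]
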