@@ -949,16 +949,32 @@ def forward(self,
                attention_mask=None,
                encoder_hidden_states=None,
                encoder_attention_mask=None,
+               inputs_embeds=None,
                cache=None,
                use_cache=False,
                output_attentions=False,
                output_hidden_states=False,
                return_dict=False):
-        assert input_ids is not None, "input_ids can not be None"
-        input_shape = input_ids.shape
-        input_ids = input_ids.reshape(shape=[-1, input_shape[-1]])

-        inputs_embeds = self.embed_tokens(input_ids)
+        if input_ids is not None and inputs_embeds is not None:
+            err_msg_prefix = "decoder_" if self.is_decoder else ""
+            raise ValueError(
+                f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time"
+            )
+        elif input_ids is not None:
+            input_shape = input_ids.shape
+            input_ids = input_ids.reshape(shape=[-1, input_shape[-1]])
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.shape[:-1]
+        else:
+            err_msg_prefix = "decoder_" if self.is_decoder else ""
+            raise ValueError(
+                f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds"
+            )
+
+        if inputs_embeds is None:
+            assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings"
+            inputs_embeds = self.embed_tokens(input_ids)

        batch_size, seq_length = input_shape

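Both branches of the new dispatch leave `input_shape` as `(batch_size, seq_length)`, which the unchanged `batch_size, seq_length = input_shape` line relies on: the `input_ids` path keeps the full shape, while the `inputs_embeds` path drops the trailing hidden dimension. A minimal shape-contract sketch (illustration only, not part of the diff; 512 is just t5-small's `d_model`):

```python
import paddle

# Token-id path: shape is already (batch_size, seq_length).
input_ids = paddle.randint(0, 32100, shape=[2, 7])
print(input_ids.shape)           # [2, 7]

# Embedding path: drop the trailing hidden dimension to get the same contract.
inputs_embeds = paddle.randn([2, 7, 512])
print(inputs_embeds.shape[:-1])  # [2, 7]
```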
@@ -1309,6 +1325,8 @@ def forward(self,
                decoder_attention_mask=None,
                encoder_output=None,
                cache=None,
+               inputs_embeds=None,
+               decoder_inputs_embeds=None,
                use_cache=True,
                output_attentions=False,
                output_hidden_states=False,
@@ -1352,6 +1370,20 @@ def forward(self,
                The `input_ids` which have their past given to this model should not be
                passed as input ids as they have already been computed.
                Defaults to `None`.
+            inputs_embeds (Tensor, optional):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation
+                of shape `(batch_size, sequence_length, hidden_size)`. This is useful if you want more control over
+                how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
+                Defaults to `None`.
+            decoder_inputs_embeds (Tensor, optional):
+                Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
+                representation of shape `(batch_size, target_sequence_length, hidden_size)`. If `cache` is used,
+                optionally only the last `decoder_inputs_embeds` have to be input.
+                This is useful if you want more control over how to convert `decoder_input_ids` indices
+                into associated vectors than the model's internal embedding lookup matrix. Defaults to `None`.
+
+                If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value
+                of `inputs_embeds`.
            use_cache (bool, optional):
                Whether or not to use cache. If set to `True`, `past_buckets_states` states are returned
                and can be used to speed up decoding.
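A hedged usage sketch for the new `T5Model` arguments documented above, assuming the usual `paddlenlp.transformers` entry points and the `t5-small` checkpoint (`d_model` = 512); the random tensors stand in for embeddings you would normally compute yourself:

```python
import paddle
from paddlenlp.transformers import T5Model

model = T5Model.from_pretrained("t5-small")
model.eval()

# Pre-computed embeddings instead of token ids, shape [batch, seq_len, d_model].
encoder_embeds = paddle.randn([2, 8, 512])
decoder_embeds = paddle.randn([2, 4, 512])

with paddle.no_grad():
    outputs = model(inputs_embeds=encoder_embeds,
                    decoder_inputs_embeds=decoder_embeds)

# With the default return_dict=False, the first element is expected to be the
# decoder's last hidden state, shape [2, 4, 512].
print(outputs[0].shape)
```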
@@ -1445,6 +1477,7 @@ def forward(self,
            encoder_output = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
+               inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict)
@@ -1456,6 +1489,7 @@ def forward(self,
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
+           inputs_embeds=decoder_inputs_embeds,
            cache=cache,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
@@ -1530,6 +1564,8 @@ def forward(self,
                encoder_output=None,
                cache=None,
                labels=None,
+               inputs_embeds=None,
+               decoder_inputs_embeds=None,
                use_cache=True,
                output_attentions=False,
                output_hidden_states=False,
@@ -1555,6 +1591,20 @@ def forward(self,
                selected in `[-100, 0, ..., vocab_size]` All labels set to `-100` are
                ignored (masked), the loss is only computed for labels in `[0, ..., vocab_size]`.
                Shape is [batch_size, sequence_length] and dtype is int64.
+            inputs_embeds (Tensor, optional):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation
+                of shape `(batch_size, sequence_length, hidden_size)`. This is useful if you want more control over
+                how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
+                Defaults to `None`.
+            decoder_inputs_embeds (Tensor, optional):
+                Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
+                representation of shape `(batch_size, target_sequence_length, hidden_size)`. If `cache` is used,
+                optionally only the last `decoder_inputs_embeds` have to be input. This is useful
+                if you want more control over how to convert `decoder_input_ids` indices into associated vectors
+                than the model's internal embedding lookup matrix. Defaults to `None`.
+
+                If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value
+                of `inputs_embeds`.
            use_cache (bool, optional):
                See :class:`T5Model`.
            output_attentions (bool, optional):
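Similarly for `T5ForConditionalGeneration`: combined with the relaxed condition in the `_shift_right` hunk further below, passing only `inputs_embeds` and `labels` should still let the model derive `decoder_input_ids` from the labels and return a loss. A sketch under the same assumptions as above (paddlenlp `t5-small`, `d_model` = 512, random stand-in embeddings):

```python
import paddle
from paddlenlp.transformers import T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained("t5-small")
model.eval()

inputs_embeds = paddle.randn([2, 8, 512])        # [batch, src_len, d_model]
labels = paddle.randint(0, 32100, shape=[2, 5])  # target token ids

outputs = model(inputs_embeds=inputs_embeds, labels=labels)
# With labels given (and return_dict=False), the first element should be the loss.
print(float(outputs[0]))
```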
@@ -1630,6 +1680,7 @@ def forward(self,
            encoder_output = self.t5.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
+               inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict)
@@ -1641,7 +1692,7 @@ def forward(self,

        hidden_states = encoder_output[0]

-        if labels is not None and decoder_input_ids is None:
+        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
            # get decoder inputs from shifting lm labels to the right
            decoder_input_ids = self._shift_right(labels)

@@ -1658,6 +1709,7 @@ def forward(self,
        decoder_outputs = self.t5.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
+           inputs_embeds=decoder_inputs_embeds,
            cache=cache,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
@@ -1870,6 +1922,7 @@ def forward(
        encoder_hidden_states: Optional[Tuple[Tensor]] = None,
        encoder_attention_mask: Optional[Tensor] = None,
        cache=None,
+       inputs_embeds: Optional[Tensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
@@ -1878,6 +1931,7 @@ def forward(
        encoder_outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
+           inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            cache=cache,
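The last two hunks touch the encoder-only wrapper. If this diff targets PaddleNLP's `t5/modeling.py`, that class is `T5EncoderModel`; treat both the class name and the checkpoint below as assumptions. The same pattern applies:

```python
import paddle
from paddlenlp.transformers import T5EncoderModel  # assumed export name

encoder = T5EncoderModel.from_pretrained("t5-small")
encoder.eval()

embeds = paddle.randn([2, 16, 512])          # [batch, seq_len, d_model]
with paddle.no_grad():
    out = encoder(inputs_embeds=embeds)      # instead of input_ids
```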