Commit 99fbc41

Fix type promotion problem. (#8414)
* fix type promotion problem.
1 parent c6e5459 commit 99fbc41

12 files changed (+27, -23 lines)

paddlenlp/generation/utils.py

Lines changed: 3 additions & 1 deletion
@@ -511,7 +511,9 @@ def update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder
 def update_scores_for_generation(scores, next_scores, length, unfinished_flag):
     # update scores
 
-    unfinished_scores = (scores * length + next_scores) / (length + 1)
+    unfinished_scores = (scores * paddle.to_tensor(length, dtype=scores.dtype) + next_scores) / (
+        paddle.to_tensor(length, dtype=scores.dtype) + 1
+    )
     scores = paddle.where(unfinished_flag, unfinished_scores, scores)
     return scores

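For context, a minimal sketch of the pattern this hunk applies; the tensor names, shapes, and decoded length below are invented for illustration. Recent Paddle releases restrict implicit promotion between integer and floating dtypes in elementwise ops, so the integer length is materialized in the scores' dtype before the arithmetic:

import paddle

scores = paddle.rand([4, 1], dtype="float32")        # running average log-probs (illustrative)
next_scores = paddle.rand([4, 1], dtype="float32")   # log-probs of the newly decoded step
unfinished_flag = paddle.full([4, 1], True, dtype="bool")
length = paddle.to_tensor(8, dtype="int64")          # decoded length so far (hypothetical)

# Cast the integer length into the scores' float dtype so the whole expression
# stays in a single dtype instead of relying on int64 x float32 promotion.
length_f = paddle.to_tensor(length, dtype=scores.dtype)
unfinished_scores = (scores * length_f + next_scores) / (length_f + 1)
scores = paddle.where(unfinished_flag, unfinished_scores, scores)
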
paddlenlp/layers/crf.py

Lines changed: 1 addition & 1 deletion
@@ -165,7 +165,7 @@ def _point_score(self, inputs, labels, lengths):
     flattened_inputs = inputs.reshape([-1])
     offsets = paddle.unsqueeze(self._get_batch_index(batch_size) * seq_len * n_labels, 1)
     offsets += paddle.unsqueeze(self._get_seq_index(seq_len) * n_labels, 0)
-    flattened_tag_indices = paddle.reshape(offsets + labels, [-1])
+    flattened_tag_indices = paddle.reshape(offsets + labels.astype(offsets.dtype), [-1])
 
     scores = paddle.gather(flattened_inputs, flattened_tag_indices).reshape([batch_size, seq_len])

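A standalone sketch of the same fix, assuming int64 offsets (the paddle.arange default) and int32 labels; everything below is illustrative rather than the repo's actual _point_score inputs:

import paddle

batch_size, seq_len, n_labels = 2, 3, 4
inputs = paddle.rand([batch_size, seq_len, n_labels])
flattened_inputs = inputs.reshape([-1])

# Per-position offsets into the flattened emission scores (int64 by default).
offsets = paddle.arange(batch_size, dtype="int64").unsqueeze(1) * (seq_len * n_labels)
offsets = offsets + paddle.arange(seq_len, dtype="int64").unsqueeze(0) * n_labels
labels = paddle.zeros([batch_size, seq_len], dtype="int32")   # gold tag ids (illustrative)

# Align the label dtype with the offsets before the add, so the gather index uses one int dtype.
flattened_tag_indices = paddle.reshape(offsets + labels.astype(offsets.dtype), [-1])
scores = paddle.gather(flattened_inputs, flattened_tag_indices).reshape([batch_size, seq_len])
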
paddlenlp/metrics/perplexity.py

Lines changed: 1 addition & 1 deletion
@@ -92,7 +92,7 @@ def compute(self, pred, label, seq_mask=None):
     ce = F.cross_entropy(input=pred, label=label, reduction="none", soft_label=False)
     ce = paddle.squeeze(ce, axis=[2])
     if seq_mask is not None:
-        ce = ce * seq_mask
+        ce = ce * seq_mask.astype(ce.dtype)
         word_num = paddle.sum(seq_mask)
         return ce, word_num
     return ce

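The same masking pattern in isolation; the logits, labels, and sequence mask below are made up. The point is simply that an integer (or bool) mask is cast to the loss dtype before the elementwise multiply:

import paddle
import paddle.nn.functional as F

logits = paddle.rand([2, 5, 10])                           # [batch, seq_len, vocab] (illustrative)
labels = paddle.randint(0, 10, [2, 5, 1], dtype="int64")
seq_mask = paddle.to_tensor([[1, 1, 1, 0, 0],
                             [1, 1, 0, 0, 0]], dtype="int64")

ce = F.cross_entropy(input=logits, label=labels, reduction="none", soft_label=False)
ce = paddle.squeeze(ce, axis=[2])

# Cast the mask to the loss dtype so the product stays float instead of mixing int64 and float32.
ce = ce * seq_mask.astype(ce.dtype)
word_num = paddle.sum(seq_mask)
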
paddlenlp/prompt/verbalizer.py

Lines changed: 1 addition & 1 deletion
@@ -162,7 +162,7 @@ def aggregate(self, outputs: Tensor, mask: Tensor, atype: str):
     Aggregate multiple tokens/words for each word/label.
     """
     if atype == "mean":
-        outputs = outputs * mask
+        outputs = outputs * mask.astype(outputs.dtype)
         outputs = outputs.sum(axis=-1) / (mask.sum(axis=-1) + 1e-15)
     elif atype == "max":
         outputs = (outputs - 1e4 * (1 - mask)).max(axis=-1)

paddlenlp/transformers/convbert/modeling.py

Lines changed: 3 additions & 1 deletion
@@ -1137,7 +1137,9 @@ def update_inputs(self, sequence, updates, positions):
     N = positions.shape[1]
     assert N == L, "the dimension of inputs and mask should be same as [batch_size, sequence_length]"
 
-    updated_sequence = ((paddle.ones_like(sequence) - positions) * sequence) + (positions * updates)
+    updated_sequence = ((paddle.ones_like(sequence) - positions) * sequence) + (
+        positions * updates.astype(positions.dtype)
+    )
 
     return updated_sequence

paddlenlp/transformers/electra/modeling.py

Lines changed: 6 additions & 2 deletions
@@ -1051,7 +1051,9 @@ def get_discriminator_inputs(self, inputs, raw_inputs, generator_logits, generat
     mask_positions = paddle.where(generator_labels == -100, umask_positions, mask_positions)
     updated_inputs = self.update_inputs(inputs, sampled_tokids, mask_positions)
     # use inputs and updated_input to get discriminator labels
-    labels = mask_positions * (paddle.ones_like(inputs) - paddle.equal(updated_inputs, raw_inputs).astype("int64"))
+    labels = mask_positions * (
+        paddle.ones_like(inputs) - paddle.equal(updated_inputs, raw_inputs).astype(raw_inputs.dtype)
+    )
     return updated_inputs, labels, sampled_tokids
 
 def sample_from_softmax(self, logits, use_softmax_sample=True):
@@ -1073,7 +1075,9 @@ def update_inputs(self, sequence, updates, positions):
     N = positions.shape[1]
     assert N == L, "the dimension of inputs and mask should be same as [B, L]"
 
-    updated_sequence = ((paddle.ones_like(sequence) - positions) * sequence) + (positions * updates)
+    updated_sequence = ((paddle.ones_like(sequence) - positions) * sequence) + (
+        positions * updates.astype(positions.dtype)
+    )
 
     return updated_sequence

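Both ELECTRA hunks (and the ConvBERT one above) follow the same recipe; here is a minimal, self-contained sketch with invented token ids and mask positions:

import paddle

sequence = paddle.to_tensor([[5, 6, 7, 8]], dtype="int64")    # original token ids (illustrative)
updates = paddle.to_tensor([[0, 9, 0, 3]], dtype="int32")     # generator-sampled replacements
positions = paddle.to_tensor([[0, 1, 0, 1]], dtype="int64")   # 1 marks a masked position

# Cast the sampled ids to the positions'/sequence's dtype so every term of the blend is int64.
updated_sequence = ((paddle.ones_like(sequence) - positions) * sequence) + (
    positions * updates.astype(positions.dtype)
)

# Discriminator labels: paddle.equal returns bool, so cast it back to the id dtype before
# multiplying with the integer position mask.
raw_inputs = sequence
labels = positions * (
    paddle.ones_like(raw_inputs) - paddle.equal(updated_sequence, raw_inputs).astype(raw_inputs.dtype)
)
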
paddlenlp/transformers/funnel/modeling.py

Lines changed: 2 additions & 2 deletions
@@ -519,7 +519,7 @@ def relative_positional_attention(self, position_embeds, q_head, context_len, cl
     positional_attn = _relative_shift_gather(positional_attn, context_len, shift)
 
     if cls_mask is not None:
-        positional_attn *= cls_mask
+        positional_attn *= cls_mask.astype(positional_attn.dtype)
     return positional_attn
 
 def relative_token_type_attention(self, token_type_mat, q_head, cls_mask=None):
@@ -547,7 +547,7 @@ def relative_token_type_attention(self, token_type_mat, q_head, cls_mask=None):
     )
 
     if cls_mask is not None:
-        token_type_attn *= cls_mask
+        token_type_attn *= cls_mask.astype(token_type_attn.dtype)
     return token_type_attn
 
 def forward(self, query, key, value, attention_inputs, output_attentions=False):

paddlenlp/transformers/gptj/modeling.py

Lines changed: 1 addition & 1 deletion
@@ -158,7 +158,7 @@ def _attn(
 
     if attention_mask is not None:
         # Apply the attention mask
-        attn_weights = attn_weights + attention_mask
+        attn_weights = attn_weights + attention_mask.astype(attn_weights.dtype)
 
     attn_weights = paddle.nn.functional.softmax(attn_weights, axis=-1)
     attn_weights = attn_weights.astype(value.dtype)

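The GPT-J, MegatronBERT, and RemBERT hunks all add an attention mask whose dtype can differ from the attention scores (for example a float32 mask under fp16 autocast). A minimal sketch, assuming an environment where float16 kernels are available (typically GPU); the shapes and mask values are illustrative:

import paddle

attn_weights = paddle.rand([1, 2, 4, 4]).astype("float16")   # [batch, heads, query, key]
attention_mask = paddle.to_tensor([[[[0.0, 0.0, -1e4, -1e4]]]], dtype="float32")

# Cast the additive mask to the weights' dtype so the add happens in half precision
# rather than promoting the fp16 weights to float32.
attn_weights = attn_weights + attention_mask.astype(attn_weights.dtype)
attn_weights = paddle.nn.functional.softmax(attn_weights, axis=-1)
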
paddlenlp/transformers/mbart/modeling.py

Lines changed: 1 addition & 1 deletion
@@ -63,7 +63,7 @@ def shift_tokens_right(input_ids, pad_token_id):
     batch_size, seq_length = shifted_input_ids.shape
     index = paddle.arange(0, batch_size, 1, dtype="int32") * seq_length
     index_of_eos = paddle.cast(shifted_input_ids != pad_token_id, dtype="int32").sum(axis=-1) - 1
-    decoder_start_tokens = paddle.gather(input_flat, index + index_of_eos)
+    decoder_start_tokens = paddle.gather(input_flat, index + index_of_eos.astype(index.dtype))
     shifted_input_ids[:, 1:] = shifted_input_ids[:, :-1].clone()
     shifted_input_ids[:, 0] = decoder_start_tokens
     return shifted_input_ids

paddlenlp/transformers/megatronbert/modeling.py

Lines changed: 1 addition & 1 deletion
@@ -171,7 +171,7 @@ def forward(self, hidden_states, attention_mask=None):
     attention_scores = attention_scores / math.sqrt(self.attention_head_size)
     if attention_mask is not None:
         # Apply the attention mask is (precomputed for all layers in MegatronBertModel forward() function)
-        attention_scores = attention_scores + attention_mask
+        attention_scores = attention_scores + attention_mask.astype(attention_scores.dtype)
 
     # Normalize the attention scores to probabilities.
     attention_probs = nn.functional.softmax(attention_scores, axis=-1)

paddlenlp/transformers/prophetnet/modeling.py

Lines changed: 6 additions & 10 deletions
@@ -71,12 +71,9 @@ def compute_relative_buckets(num_buckets, max_distance, relative_positions, is_b
     )
     inv_relative_positions = paddle.abs(inv_relative_positions)
 else:
-    inv_relative_positions = (
-        paddle.cast(
-            paddle.less_than(paddle.zeros_like(inv_relative_positions), inv_relative_positions), dtype=paddle.int32
-        )
-        * inv_relative_positions
-    )
+    inv_relative_positions = paddle.cast(
+        paddle.less_than(paddle.zeros_like(inv_relative_positions), inv_relative_positions), dtype=paddle.int32
+    ) * inv_relative_positions.astype(paddle.int32)
 
 max_exact = num_buckets // 2
 is_small = paddle.less_than(inv_relative_positions, paddle.to_tensor(max_exact).cast(dtype=paddle.int32))
@@ -85,10 +82,9 @@ def compute_relative_buckets(num_buckets, max_distance, relative_positions, is_b
 ) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
 val_if_large_num_buckets = paddle.ones_like(val_if_large) * (num_buckets - 1)
 val_if_large_lt = paddle.cast(paddle.less_than(val_if_large, val_if_large_num_buckets), dtype=paddle.int32)
-val_if_large = (
-    paddle.cast(val_if_large_lt * val_if_large, dtype=paddle.int32)
-    + (1 - val_if_large_lt) * val_if_large_num_buckets
-)
+val_if_large = val_if_large_lt * val_if_large.astype(val_if_large_lt.dtype) + (
+    1 - val_if_large_lt
+) * val_if_large_num_buckets.astype(val_if_large_lt.dtype)
 rel_positions_bucket = rel_positions_bucket + paddle.where(
     is_small, paddle.cast(inv_relative_positions, dtype=paddle.int32), val_if_large
 )

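The bucket arithmetic boils down to the same idea: paddle.less_than yields a bool tensor, and after casting it to int32 the other operand is cast to the same integer dtype before the elementwise multiply. A small sketch with made-up relative positions:

import paddle

inv_relative_positions = paddle.to_tensor([[-2, 0, 3]], dtype="int64")   # illustrative

# Indicator of strictly positive positions, as int32.
is_positive = paddle.cast(
    paddle.less_than(paddle.zeros_like(inv_relative_positions), inv_relative_positions),
    dtype=paddle.int32,
)

# Cast the positions to the indicator's dtype so the product stays int32 throughout.
inv_relative_positions = is_positive * inv_relative_positions.astype(paddle.int32)
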
paddlenlp/transformers/rembert/modeling.py

Lines changed: 1 addition & 1 deletion
@@ -150,7 +150,7 @@ def forward(self, hidden_states, attention_mask=None):
     attention_scores = attention_scores / math.sqrt(self.attention_head_size)
     if attention_mask is not None:
         # Apply the attention mask is (precomputed for all layers in RemBertModel forward() function)
-        attention_scores = attention_scores + attention_mask
+        attention_scores = attention_scores + attention_mask.astype(attention_scores.dtype)
 
     # Normalize the attention scores to probabilities.
     attention_probs = F.softmax(attention_scores, axis=-1)
