From 775fed8172d7e3bbc64c1c69af8be416ff92856e Mon Sep 17 00:00:00 2001
From: Wang Huan
Date: Thu, 11 Apr 2024 08:43:00 +0000
Subject: [PATCH 1/3] use tensor.shape instead of paddle.shape(tensor)

---
 .../language_model/moe/dygraph/modeling.py | 4 +-
 .../task/senti/rnn/model.py | 12 ++--
 .../stacl/demo/model_demo.py | 2 +-
 .../simultaneous_translation/stacl/model.py | 10 ++--
 examples/text_classification/rnn/model.py | 10 ++--
 .../RAT-SQL/text2sql/utils/nn_utils.py | 2 +-
 llm/ernie-3.5-se/modeling.py | 4 +-
 .../language_model/gpt/auto/auto_model.py | 16 ++---
 .../gpt/dygraph/hybrid_model.py | 8 +--
 .../gpt/dygraph/single_model.py | 12 ++--
 .../transformers/bloom/modeling.py | 2 +-
 .../transformers/chatglm_v2/modeling.py | 2 +-
 .../experimental/transformers/gpt/modeling.py | 2 +-
 .../transformers/llama/modeling.py | 2 +-
 .../experimental/transformers/opt/modeling.py | 2 +-
 .../transformers/qwen/modeling.py | 2 +-
 paddlenlp/generation/utils.py | 14 ++---
 paddlenlp/layers/crf.py | 8 +--
 paddlenlp/layers/globalpointer.py | 2 +-
 .../fast_transformer/transformer/decoder.py | 6 +-
 .../fast_transformer/transformer/decoding.py | 14 ++---
 .../transformer/fast_transformer.py | 6 +-
 paddlenlp/prompt/prompt_utils.py | 4 +-
 paddlenlp/trainer/trainer_compress.py | 4 +-
 paddlenlp/transformers/albert/modeling.py | 2 +-
 paddlenlp/transformers/bart/modeling.py | 14 ++---
 paddlenlp/transformers/bert/modeling.py | 2 +-
 paddlenlp/transformers/bigbird/modeling.py | 8 +--
 paddlenlp/transformers/blenderbot/modeling.py | 2 +-
 .../transformers/blenderbot_small/modeling.py | 2 +-
 paddlenlp/transformers/bloom/modeling.py | 12 ++--
 paddlenlp/transformers/convbert/modeling.py | 8 +--
 paddlenlp/transformers/dallebart/modeling.py | 6 +-
 paddlenlp/transformers/deberta/modeling.py | 6 +-
 paddlenlp/transformers/deberta_v2/modeling.py | 6 +-
 paddlenlp/transformers/electra/modeling.py | 2 +-
 paddlenlp/transformers/ernie/modeling.py | 8 +--
 paddlenlp/transformers/ernie_code/modeling.py | 20 +++----
 paddlenlp/transformers/ernie_ctm/modeling.py | 2 +-
 paddlenlp/transformers/ernie_gen/modeling.py | 2 +-
 paddlenlp/transformers/ernie_gram/modeling.py | 4 +-
 .../transformers/ernie_layout/modeling.py | 20 +++----
 paddlenlp/transformers/ernie_m/modeling.py | 4 +-
 paddlenlp/transformers/gau_alpha/modeling.py | 12 ++--
 paddlenlp/transformers/gpt/modeling.py | 18 +++---
 paddlenlp/transformers/gpt/modeling_auto.py | 10 ++--
 paddlenlp/transformers/layoutlmv2/modeling.py | 6 +-
 paddlenlp/transformers/layoutxlm/modeling.py | 16 ++---
 paddlenlp/transformers/llama/modeling.py | 2 +-
 paddlenlp/transformers/llama/modeling_auto.py | 2 +-
 .../llama/modeling_auto_static.py | 2 +-
 paddlenlp/transformers/mbart/modeling.py | 16 ++---
 paddlenlp/transformers/minigpt4/modeling.py | 2 +-
 paddlenlp/transformers/mixtral/modeling.py | 2 +-
 paddlenlp/transformers/mobilebert/modeling.py | 4 +-
 paddlenlp/transformers/mt5/modeling.py | 20 +++----
 paddlenlp/transformers/nezha/modeling.py | 4 +-
 paddlenlp/transformers/opt/modeling.py | 6 +-
 paddlenlp/transformers/pegasus/modeling.py | 10 ++--
 paddlenlp/transformers/reformer/modeling.py | 4 +-
 paddlenlp/transformers/roberta/modeling.py | 10 ++--
 paddlenlp/transformers/roformer/modeling.py | 6 +-
 paddlenlp/transformers/roformerv2/modeling.py | 10 ++--
 paddlenlp/transformers/skep/modeling.py | 8 +--
 paddlenlp/transformers/t5/modeling.py | 20 +++----
 paddlenlp/transformers/tinybert/modeling.py | 2 +-
 .../transformers/transformer/modeling.py | 24 ++++----
 .../unified_transformer/modeling.py | 11 ++--
paddlenlp/transformers/unimo/modeling.py | 11 ++-- paddlenlp/transformers/xlm/modeling.py | 22 ++++--- paddlenlp/transformers/xlnet/modeling.py | 60 +++++++------------ .../modules/bert_for_question_answering.py | 2 +- tests/test_tipc/transformer/modeling.py | 14 ++--- tests/transformer/modeling.py | 14 ++--- 74 files changed, 298 insertions(+), 332 deletions(-) diff --git a/examples/language_model/moe/dygraph/modeling.py b/examples/language_model/moe/dygraph/modeling.py index b45c9a465e70..17a77245ee1e 100644 --- a/examples/language_model/moe/dygraph/modeling.py +++ b/examples/language_model/moe/dygraph/modeling.py @@ -748,8 +748,8 @@ def forward(self, input_ids, position_ids=None, attention_mask=None, use_cache=F if position_ids is None: past_length = 0 if cache is not None: - past_length = paddle.shape(cache[0].k)[-2] - position_ids = paddle.arange(past_length, paddle.shape(input_ids)[-1] + past_length, dtype="int64") + past_length = cache[0].k.shape[-2] + position_ids = paddle.arange(past_length, input_ids.shape[-1] + past_length, dtype="int64") position_ids = position_ids.unsqueeze(0) # .expand_as(input_ids) position_ids = paddle.expand_as(position_ids, input_ids) diff --git a/examples/model_interpretation/task/senti/rnn/model.py b/examples/model_interpretation/task/senti/rnn/model.py index 9c509e72432e..247a5f65bc5e 100644 --- a/examples/model_interpretation/task/senti/rnn/model.py +++ b/examples/model_interpretation/task/senti/rnn/model.py @@ -207,7 +207,7 @@ def forward(self, input, mask=None): # Shape: (batch_size, max_seq_len, hidden_size) h = paddle.add_n([forward_input, backward_input]) # Shape: (batch_size, hidden_size, 1) - att_weight = self.att_weight.tile(repeat_times=(paddle.shape(h)[0], 1, 1)) + att_weight = self.att_weight.tile(repeat_times=(h.shape[0], 1, 1)) # Shape: (batch_size, max_seq_len, 1) att_score = paddle.bmm(paddle.tanh(h), att_weight) if mask is not None: @@ -246,20 +246,18 @@ def forward(self, input, mask=None): Tensor is a bool tensor, whose each element identifies whether the input word id is pad token or not. 
Defaults to `None """ - weight = self.input_weight.tile( - repeat_times=(paddle.shape(input)[0], 1, 1) - ) # tensor[batch, hidden_size, hidden_size] - bias = self.bias.tile(repeat_times=(paddle.shape(input)[0], 1, 1)) # tensor[batch, 1, hidden_size] + weight = self.input_weight.tile(repeat_times=(input.shape[0], 1, 1)) # tensor[batch, hidden_size, hidden_size] + bias = self.bias.tile(repeat_times=(input.shape[0], 1, 1)) # tensor[batch, 1, hidden_size] word_squish = paddle.bmm(input, weight) + bias # Shape: (batch_size, seq_len, hidden_size) att_context_vector = self.att_context_vector.tile( - repeat_times=(paddle.shape(input)[0], 1, 1) + repeat_times=(input.shape[0], 1, 1) ) # Shape: (batch_size, hidden_size, 1) att_score = paddle.bmm(word_squish, att_context_vector) # tensor[batch_size, seq_len, 1] if mask is not None: # mask, remove the effect of 'PAD' mask = paddle.cast(mask, dtype="float32") mask = mask.unsqueeze(axis=-1) - inf_tensor = paddle.full(shape=paddle.shape(mask), dtype="float32", fill_value=-INF) + inf_tensor = paddle.full(shape=mask.shape, dtype="float32", fill_value=-INF) att_score = paddle.multiply(att_score, mask) + paddle.multiply(inf_tensor, (1 - mask)) att_weight = F.softmax(att_score, axis=1) # tensor[batch_size, seq_len, 1] diff --git a/examples/simultaneous_translation/stacl/demo/model_demo.py b/examples/simultaneous_translation/stacl/demo/model_demo.py index 6f7a2cc7dfb4..7f73dfe06a10 100644 --- a/examples/simultaneous_translation/stacl/demo/model_demo.py +++ b/examples/simultaneous_translation/stacl/demo/model_demo.py @@ -34,7 +34,7 @@ def greedy_search(self, src_word, max_len=256, waitk=-1, caches=None, bos_id=Non So, it needsprevious state(caches) and last one of generated tokens id last time. """ - src_max_len = paddle.shape(src_word)[-1] + src_max_len = src_word.shape[-1] base_attn_bias = ( paddle.cast(src_word == self.bos_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e9 ) diff --git a/examples/simultaneous_translation/stacl/model.py b/examples/simultaneous_translation/stacl/model.py index e987178dd87e..185156a89908 100644 --- a/examples/simultaneous_translation/stacl/model.py +++ b/examples/simultaneous_translation/stacl/model.py @@ -15,11 +15,11 @@ from __future__ import print_function import numpy as np - import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddlenlp.transformers import WordEmbedding, PositionalEmbedding + +from paddlenlp.transformers import PositionalEmbedding, WordEmbedding class CrossEntropyCriterion(nn.Layer): @@ -190,8 +190,8 @@ def __init__( self.linear = nn.Linear(in_features=d_model, out_features=trg_vocab_size, bias_attr=False) def forward(self, src_word, trg_word): - src_max_len = paddle.shape(src_word)[-1] - trg_max_len = paddle.shape(trg_word)[-1] + src_max_len = src_word.shape[-1] + trg_max_len = trg_word.shape[-1] base_attn_bias = ( paddle.cast(src_word == self.bos_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e9 ) @@ -236,7 +236,7 @@ def beam_search(self, src_word, beam_size=4, max_len=256, waitk=-1): raise NotImplementedError def greedy_search(self, src_word, max_len=256, waitk=-1): - src_max_len = paddle.shape(src_word)[-1] + src_max_len = src_word.shape[-1] base_attn_bias = ( paddle.cast(src_word == self.bos_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e9 ) diff --git a/examples/text_classification/rnn/model.py b/examples/text_classification/rnn/model.py index 7d2e4950db0b..04ce46cd62b5 100644 --- a/examples/text_classification/rnn/model.py +++ 
b/examples/text_classification/rnn/model.py @@ -253,7 +253,7 @@ def forward(self, input, mask=None): # Shape: (batch_size, max_seq_len, hidden_size) h = paddle.add_n([forward_input, backward_input]) # Shape: (batch_size, hidden_size, 1) - att_weight = self.att_weight.tile(repeat_times=(paddle.shape(h)[0], 1, 1)) + att_weight = self.att_weight.tile(repeat_times=(h.shape[0], 1, 1)) # Shape: (batch_size, max_seq_len, 1) att_score = paddle.bmm(paddle.tanh(h), att_weight) if mask is not None: @@ -292,19 +292,19 @@ def forward(self, input, mask=None): Tensor is a bool tensor, whose each element identifies whether the input word id is pad token or not. Defaults to `None """ - weight = self.input_weight.tile(repeat_times=(paddle.shape(input)[0], 1, 1)) - bias = self.bias.tile(repeat_times=(paddle.shape(input)[0], 1, 1)) + weight = self.input_weight.tile(repeat_times=(input.shape[0], 1, 1)) + bias = self.bias.tile(repeat_times=(input.shape[0], 1, 1)) # Shape: (batch_size, max_seq_len, hidden_size) word_squish = paddle.bmm(input, weight) + bias - att_context_vector = self.att_context_vector.tile(repeat_times=(paddle.shape(input)[0], 1, 1)) + att_context_vector = self.att_context_vector.tile(repeat_times=(input.shape[0], 1, 1)) # Shape: (batch_size, max_seq_len, 1) att_score = paddle.bmm(word_squish, att_context_vector) if mask is not None: # mask, remove the effect of 'PAD' mask = paddle.cast(mask, dtype="float32") mask = mask.unsqueeze(axis=-1) - inf_tensor = paddle.full(shape=paddle.shape(mask), dtype="float32", fill_value=-INF) + inf_tensor = paddle.full(shape=mask.shape, dtype="float32", fill_value=-INF) att_score = paddle.multiply(att_score, mask) + paddle.multiply(inf_tensor, (1 - mask)) att_weight = F.softmax(att_score, axis=1) diff --git a/examples/text_to_sql/RAT-SQL/text2sql/utils/nn_utils.py b/examples/text_to_sql/RAT-SQL/text2sql/utils/nn_utils.py index 02d04743d52c..fe13c7d489b7 100644 --- a/examples/text_to_sql/RAT-SQL/text2sql/utils/nn_utils.py +++ b/examples/text_to_sql/RAT-SQL/text2sql/utils/nn_utils.py @@ -74,7 +74,7 @@ def batch_gather_2d(var, indices): "shape of indices error. it should be a 2-D layers. 
" "but got shape = %s" % (str(indices.shape),) ) - batch_size = paddle.shape(indices)[0] + batch_size = indices.shape[0] zero = paddle.to_tensor([0], dtype="int64") one = paddle.to_tensor([1], dtype="int64") diff --git a/llm/ernie-3.5-se/modeling.py b/llm/ernie-3.5-se/modeling.py index 570433b994c9..9e1165e71a65 100644 --- a/llm/ernie-3.5-se/modeling.py +++ b/llm/ernie-3.5-se/modeling.py @@ -142,7 +142,7 @@ def scaled_dot_product_attention( query_states, key_states, value_states, attention_mask, output_attentions, config, is_causal=True ): - bsz, q_len, num_heads, _ = paddle.shape(query_states) + bsz, q_len, num_heads, _ = query_states.shape head_dim = config.hidden_size // config.num_attention_heads _, kv_seq_len, _, _ = value_states.shape @@ -1054,7 +1054,7 @@ def forward( seq_length_with_past = seq_length cache_length = 0 if past_key_values[0] is not None: - cache_length = paddle.shape(past_key_values[0][0])[1] + cache_length = past_key_values[0][0].shape[1] seq_length_with_past += cache_length if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids).astype(self.embed_tokens.weight.dtype) diff --git a/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/auto/auto_model.py b/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/auto/auto_model.py index a283bb7b46fe..45f8ed4e556d 100644 --- a/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/auto/auto_model.py +++ b/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/auto/auto_model.py @@ -735,8 +735,8 @@ def forward(self, input_ids, position_ids=None, attention_mask=None, use_cache=F if position_ids is None: past_length = 0 if cache is not None: - past_length = paddle.shape(attention_mask)[-1] - 1 - position_ids = paddle.arange(past_length, paddle.shape(input_ids)[-1] + past_length, dtype=input_ids.dtype) + past_length = attention_mask.shape[-1] - 1 + position_ids = paddle.arange(past_length, input_ids.shape[-1] + past_length, dtype=input_ids.dtype) position_ids = position_ids.unsqueeze(0) position_ids = paddle.expand_as(position_ids, input_ids) @@ -753,7 +753,7 @@ def forward(self, input_ids, position_ids=None, attention_mask=None, use_cache=F if not self.fused_softmax_with_triangular or not paddle.is_compiled_with_cuda(): # TODO, use registered buffer causal_mask = paddle.tensor.triu( - paddle.ones((paddle.shape(input_ids)[-1], paddle.shape(input_ids)[-1])) * -1e4, diagonal=1 + paddle.ones((input_ids.shape[-1], input_ids.shape[-1])) * -1e4, diagonal=1 ) if attention_mask is not None: if len(attention_mask.shape) == 2: @@ -972,7 +972,7 @@ def get_logits_processor( def expand_inputs_for_generation(self, input_ids, expand_size, attention_mask=None, **model_kwargs): - index = paddle.tile(paddle.arange(paddle.shape(input_ids)[0]).unsqueeze(-1), [1, expand_size]).reshape([-1]) + index = paddle.tile(paddle.arange(input_ids.shape[0]).unsqueeze(-1), [1, expand_size]).reshape([-1]) input_ids = paddle.gather(input_ids, index) @@ -1109,11 +1109,11 @@ def TopPProcess(probs, top_p, min_tokens_to_keep): probs = paddle.where(condition, paddle.full_like(probs, 0.0), probs) return probs - batch_size, cur_len = paddle.shape(input_ids) + batch_size, cur_len = input_ids.shape # used for compute on gpu, avoid memcpy D2H cur_len_gpu = paddle.full([1], cur_len, dtype="int64") - origin_len = paddle.shape(input_ids)[1] + origin_len = input_ids.shape[1] # used for compute on gpu, avoid memcpy D2H origin_len_gpu = paddle.full([1], origin_len, dtype="int64") @@ -1167,7 +1167,7 @@ def _post_process_(outputs, input_ids, cur_len, origin_len, scores, 
unfinished_f raise ImportError( "please install ppfleetx_ops by 'cd ppfleetx/ops && python setup_cuda.py install'!" ) - top_ps_tensor = paddle.full(shape=[paddle.shape(probs)[0]], fill_value=top_p, dtype=probs.dtype) + top_ps_tensor = paddle.full(shape=[probs.shape[0]], fill_value=top_p, dtype=probs.dtype) # TODO fake random seed here # Users should set the random seed dynamically when inference _, next_tokens = topp_sampling(probs, top_ps_tensor, random_seed=100) @@ -1299,7 +1299,7 @@ def forward(self, input_ids=None, **model_kwargs): if model_kwargs.get("position_ids", None) is None: model_kwargs["position_ids"] = paddle.arange( - 0, paddle.shape(model_kwargs["attention_mask"])[-1], dtype=input_ids.dtype + 0, model_kwargs["attention_mask"].shape[-1], dtype=input_ids.dtype ).unsqueeze(0) self.is_encoder_decoder = False diff --git a/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py b/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py index f47d800c5f15..38380d82f93b 100644 --- a/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py +++ b/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py @@ -834,8 +834,8 @@ def forward(self, input_ids, position_ids=None, attention_mask=None, use_cache=F if position_ids is None: past_length = 0 if cache is not None: - past_length = paddle.shape(attention_mask)[-1] - 1 - position_ids = paddle.arange(past_length, paddle.shape(input_ids)[-1] + past_length, dtype=input_ids.dtype) + past_length = attention_mask.shape[-1] - 1 + position_ids = paddle.arange(past_length, input_ids.shape[-1] + past_length, dtype=input_ids.dtype) position_ids = position_ids.unsqueeze(0) # .expand_as(input_ids) position_ids = paddle.expand_as(position_ids, input_ids) @@ -848,7 +848,7 @@ def forward(self, input_ids, position_ids=None, attention_mask=None, use_cache=F if not self.fused_softmax_with_triangular or not paddle.is_compiled_with_cuda(): # TODO, use registered buffer causal_mask = paddle.tensor.triu( - paddle.ones((paddle.shape(input_ids)[-1], paddle.shape(input_ids)[-1])) * -1e4, diagonal=1 + paddle.ones((input_ids.shape[-1], input_ids.shape[-1])) * -1e4, diagonal=1 ) if attention_mask is not None: if len(attention_mask.shape) == 2: @@ -1301,7 +1301,7 @@ def get_logits_processor( def expand_inputs_for_generation(self, input_ids, expand_size, attention_mask=None, **model_kwargs): - index = paddle.tile(paddle.arange(paddle.shape(input_ids)[0]).unsqueeze(-1), [1, expand_size]).reshape([-1]) + index = paddle.tile(paddle.arange(input_ids.shape[0]).unsqueeze(-1), [1, expand_size]).reshape([-1]) input_ids = paddle.gather(input_ids, index) diff --git a/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/dygraph/single_model.py b/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/dygraph/single_model.py index 80ca22b855ca..ccbe318790c2 100644 --- a/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/dygraph/single_model.py +++ b/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/dygraph/single_model.py @@ -602,8 +602,8 @@ def forward(self, input_ids, position_ids=None, attention_mask=None, use_cache=F if position_ids is None: past_length = 0 if cache is not None: - past_length = paddle.shape(attention_mask)[-1] - 1 - position_ids = paddle.arange(past_length, paddle.shape(input_ids)[-1] + past_length, dtype=input_ids.dtype) + past_length = attention_mask.shape[-1] - 1 + position_ids = paddle.arange(past_length, input_ids.shape[-1] + past_length, dtype=input_ids.dtype) position_ids = 
position_ids.unsqueeze(0) # .expand_as(input_ids) position_ids = paddle.expand_as(position_ids, input_ids) @@ -615,7 +615,7 @@ def forward(self, input_ids, position_ids=None, attention_mask=None, use_cache=F if not self.fused_softmax_with_triangular or not paddle.is_compiled_with_cuda(): # TODO, use registered buffer causal_mask = paddle.tensor.triu( - paddle.ones((paddle.shape(input_ids)[-1], paddle.shape(input_ids)[-1])) * -1e4, diagonal=1 + paddle.ones((input_ids.shape[-1], input_ids.shape[-1])) * -1e4, diagonal=1 ) if attention_mask is not None: if len(attention_mask.shape) == 2: @@ -848,7 +848,7 @@ def get_logits_processor( def expand_inputs_for_generation(self, input_ids, expand_size, attention_mask=None, **model_kwargs): - index = paddle.tile(paddle.arange(paddle.shape(input_ids)[0]).unsqueeze(-1), [1, expand_size]).reshape([-1]) + index = paddle.tile(paddle.arange(input_ids.shape[0]).unsqueeze(-1), [1, expand_size]).reshape([-1]) input_ids = paddle.gather(input_ids, index) @@ -1039,7 +1039,7 @@ def _post_process_(outputs, input_ids, cur_len, origin_len, scores, unfinished_f raise ImportError( "please install ppfleetx_ops by 'cd ppfleetx/ops && python setup_cuda.py install'!" ) - top_ps_tensor = paddle.full(shape=[paddle.shape(probs)[0]], fill_value=top_p, dtype=probs.dtype) + top_ps_tensor = paddle.full(shape=[probs.shape[0]], fill_value=top_p, dtype=probs.dtype) _, next_tokens = topp_sampling(probs, top_ps_tensor, random_seed=100) else: probs = TopPProcess(probs, top_p, min_tokens_to_keep) @@ -1194,7 +1194,7 @@ def forward(self, input_ids=None, **model_kwargs): if model_kwargs.get("position_ids", None) is None: model_kwargs["position_ids"] = paddle.arange( - 0, paddle.shape(model_kwargs["attention_mask"])[-1], dtype=input_ids.dtype + 0, model_kwargs["attention_mask"].shape[-1], dtype=input_ids.dtype ).unsqueeze(0) self.is_encoder_decoder = False diff --git a/paddlenlp/experimental/transformers/bloom/modeling.py b/paddlenlp/experimental/transformers/bloom/modeling.py index 659826fe6f1b..fbb983622fef 100644 --- a/paddlenlp/experimental/transformers/bloom/modeling.py +++ b/paddlenlp/experimental/transformers/bloom/modeling.py @@ -279,7 +279,7 @@ def forward( pre_caches=pre_caches, pre_caches_length=position_offset, seq_lens=seq_len, - time_step=paddle.increment(paddle.shape(attention_mask)[-1], -1) if is_decoder else None, + time_step=paddle.increment(attention_mask.shape[-1], -1) if is_decoder else None, ) # Add last hidden state diff --git a/paddlenlp/experimental/transformers/chatglm_v2/modeling.py b/paddlenlp/experimental/transformers/chatglm_v2/modeling.py index 75dd08396398..712f03dde2fe 100644 --- a/paddlenlp/experimental/transformers/chatglm_v2/modeling.py +++ b/paddlenlp/experimental/transformers/chatglm_v2/modeling.py @@ -285,7 +285,7 @@ def forward( seq_lens=seq_lens, rotary_embs=paddle.cast(rotary_pos_emb, "float32"), rotary_emb_dims=1, - time_step=paddle.increment(paddle.shape(attention_mask)[-1], -1) if is_decoder else None, + time_step=paddle.increment(attention_mask.shape[-1], -1) if is_decoder else None, ) hidden_states = self.final_layernorm(hidden_states) diff --git a/paddlenlp/experimental/transformers/gpt/modeling.py b/paddlenlp/experimental/transformers/gpt/modeling.py index 6627c9e42abb..b987c7a70974 100644 --- a/paddlenlp/experimental/transformers/gpt/modeling.py +++ b/paddlenlp/experimental/transformers/gpt/modeling.py @@ -265,7 +265,7 @@ def forward( attn_mask=paddle.cast(attention_mask, dtype=hidden_states.dtype), caches=cache_kvs, seq_lens=seq_lens, - 
time_step=paddle.increment(paddle.shape(attention_mask)[-1], -1) if is_decoder else None, + time_step=paddle.increment(attention_mask.shape[-1], -1) if is_decoder else None, ) hidden_states = self.norm(hidden_states) diff --git a/paddlenlp/experimental/transformers/llama/modeling.py b/paddlenlp/experimental/transformers/llama/modeling.py index f22eecb15d19..a67731916a92 100644 --- a/paddlenlp/experimental/transformers/llama/modeling.py +++ b/paddlenlp/experimental/transformers/llama/modeling.py @@ -451,7 +451,7 @@ def forward( seq_lens=seq_lens, rotary_embs=new_rope, rotary_emb_dims=1, - time_step=paddle.increment(paddle.shape(attention_mask)[-1], -1) if is_decoder else None, + time_step=paddle.increment(attention_mask.shape[-1], -1) if is_decoder else None, ) hidden_states = self.norm(hidden_states) diff --git a/paddlenlp/experimental/transformers/opt/modeling.py b/paddlenlp/experimental/transformers/opt/modeling.py index afcb1331b52c..2f8228a75947 100644 --- a/paddlenlp/experimental/transformers/opt/modeling.py +++ b/paddlenlp/experimental/transformers/opt/modeling.py @@ -247,7 +247,7 @@ def forward( seq_lens=seq_lens, rotary_embs=None, rotary_emb_dims=0, - time_step=paddle.increment(paddle.shape(attention_mask)[-1], -1) if is_decoder else None, + time_step=paddle.increment(attention_mask.shape[-1], -1) if is_decoder else None, ) output = hidden_states diff --git a/paddlenlp/experimental/transformers/qwen/modeling.py b/paddlenlp/experimental/transformers/qwen/modeling.py index fc6bb92a627d..975bf22abf15 100644 --- a/paddlenlp/experimental/transformers/qwen/modeling.py +++ b/paddlenlp/experimental/transformers/qwen/modeling.py @@ -340,7 +340,7 @@ def forward( seq_lens=seq_lens, rotary_embs=new_rope, rotary_emb_dims=1, - time_step=paddle.increment(paddle.shape(attention_mask)[-1], -1) if is_decoder else None, + time_step=paddle.increment(attention_mask.shape[-1], -1) if is_decoder else None, ) hidden_states = self.ln_f(hidden_states) diff --git a/paddlenlp/generation/utils.py b/paddlenlp/generation/utils.py index 625b81d765ff..ffd34b1d79cd 100644 --- a/paddlenlp/generation/utils.py +++ b/paddlenlp/generation/utils.py @@ -412,9 +412,9 @@ def get_logits_processor( @staticmethod def expand_inputs_for_generation(input_ids, expand_size, attention_mask=None, **model_kwargs): - index = paddle.tile( - paddle.arange(paddle.shape(input_ids)[0], dtype="int64").unsqueeze(-1), [1, expand_size] - ).reshape([-1]) + index = paddle.tile(paddle.arange(input_ids.shape[0], dtype="int64").unsqueeze(-1), [1, expand_size]).reshape( + [-1] + ) input_ids = paddle.gather(input_ids, index) @@ -1340,11 +1340,11 @@ def sample_d2s( "you should not specify InputSpec for top_k and top_p parameters, one of InputSpec is expected" ) - batch_size, cur_len = paddle.shape(input_ids) + batch_size, cur_len = input_ids.shape # used for compute on gpu, avoid memcpy D2H cur_len_gpu = paddle.full([1], cur_len, dtype="int64") - origin_len = paddle.shape(input_ids)[1] + origin_len = input_ids.shape[1] # used for compute on gpu, avoid memcpy D2H origin_len_gpu = paddle.full([1], origin_len, dtype="int64") @@ -1384,7 +1384,7 @@ def _post_process_(outputs, input_ids, cur_len, origin_len, scores, unfinished_f # compute next_tokens if use_top_p: logits = logits / temperature - top_ps_tensor = paddle.full(shape=[paddle.shape(probs)[0], 1], fill_value=top_p, dtype=probs.dtype) + top_ps_tensor = paddle.full(shape=[probs.shape[0], 1], fill_value=top_p, dtype=probs.dtype) _, next_tokens = paddle.tensor.top_p_sampling(probs, top_ps_tensor) else: 
probs = TopKProcess(probs, top_k, min_tokens_to_keep) @@ -1428,7 +1428,7 @@ def _post_process_(outputs, input_ids, cur_len, origin_len, scores, unfinished_f attn_mask = model_kwargs["attention_mask"] # make the shape of attention_mask = (-1, -1, -1, -1) in dy2static. - model_kwargs["attention_mask"] = paddle.reshape(attn_mask, paddle.shape(attn_mask)) + model_kwargs["attention_mask"] = paddle.reshape(attn_mask, attn_mask.shape) model_kwargs["cache"] = outputs[1] if isinstance(outputs, tuple) else None max_new_tokens = paddle.full([1], max_new_tokens + cur_len - 1, dtype="int64") diff --git a/paddlenlp/layers/crf.py b/paddlenlp/layers/crf.py index aaaec528ca5f..5dc6c6363afc 100644 --- a/paddlenlp/layers/crf.py +++ b/paddlenlp/layers/crf.py @@ -303,7 +303,7 @@ def __init__(self, transitions, with_start_stop_tag=True): if with_start_stop_tag: self.start_idx = -1 self.stop_idx = -2 - self.num_tags = paddle.shape(transitions)[0] + self.num_tags = transitions.shape[0] self._initial_alpha = None self._index = None @@ -312,7 +312,7 @@ def __init__(self, transitions, with_start_stop_tag=True): def _initialize_alpha(self, batch_size): # alpha accumulate the path value to get the different next tag - if self._initial_alpha is None or batch_size > paddle.shape(self._initial_alpha)[0]: + if self._initial_alpha is None or batch_size > self._initial_alpha.shape[0]: # Initialized by a small value. initial_alpha = paddle.full([batch_size, self.num_tags - 1], dtype="float32", fill_value=-10000.0) # alpha_start fill_value = 0. > -10000., means the first one step START gets the most score. @@ -336,7 +336,7 @@ def forward(self, inputs, lengths): The `paths` tensor containing the highest scoring tag indices. Its dtype is int64 and has a shape of `[batch_size, sequence_length]`. 
""" - input_shape = paddle.shape(inputs) + input_shape = inputs.shape batch_size = input_shape[0] n_label = input_shape[2] @@ -412,6 +412,6 @@ def forward(self, inputs, lengths): return scores, batch_path def _get_batch_index(self, batch_size): - if self._batch_index is None or batch_size != paddle.shape(self._batch_index)[0]: + if self._batch_index is None or batch_size != self._batch_index.shape[0]: self._batch_index = paddle.arange(end=batch_size, dtype="int64") return self._batch_index diff --git a/paddlenlp/layers/globalpointer.py b/paddlenlp/layers/globalpointer.py index d11aedc9ddb0..a76c606098b6 100644 --- a/paddlenlp/layers/globalpointer.py +++ b/paddlenlp/layers/globalpointer.py @@ -26,7 +26,7 @@ def __init__(self, dim, max_seq_len=512): self.register_buffer("cos", freqs.cos(), persistable=False) def forward(self, x, offset=0): - seqlen = paddle.shape(x)[-2] + seqlen = x.shape[-2] sin, cos = ( self.sin[offset : offset + seqlen, :], self.cos[offset : offset + seqlen, :], diff --git a/paddlenlp/ops/fast_transformer/transformer/decoder.py b/paddlenlp/ops/fast_transformer/transformer/decoder.py index 988b861e810d..82b0f2339aec 100644 --- a/paddlenlp/ops/fast_transformer/transformer/decoder.py +++ b/paddlenlp/ops/fast_transformer/transformer/decoder.py @@ -275,7 +275,7 @@ def forward( [ self_cache_key, paddle.zeros( - shape=[len(self.weights), 1, paddle.shape(memory_tensor)[0], self.n_head * self.size_per_head], + shape=[len(self.weights), 1, memory_tensor.shape[0], self.n_head * self.size_per_head], dtype=self_cache_key.dtype, ), ], @@ -285,7 +285,7 @@ def forward( [ self_cache_value, paddle.zeros( - shape=[len(self.weights), 1, paddle.shape(memory_tensor)[0], self.n_head * self.size_per_head], + shape=[len(self.weights), 1, memory_tensor.shape[0], self.n_head * self.size_per_head], dtype=self_cache_value.dtype, ), ], @@ -458,7 +458,7 @@ def __init__( self.linear = nn.Linear(in_features=d_model, out_features=trg_vocab_size, bias_attr=False) def forward(self, src_word): - src_max_len = paddle.shape(src_word)[-1] + src_max_len = src_word.shape[-1] mem_seq_lens = paddle.sum( paddle.cast(src_word != self.bos_id, dtype="int32"), axis=-1, keepdim=True, dtype="int32" ) diff --git a/paddlenlp/ops/fast_transformer/transformer/decoding.py b/paddlenlp/ops/fast_transformer/transformer/decoding.py index 8cac1f9026ba..28b30faebc2b 100644 --- a/paddlenlp/ops/fast_transformer/transformer/decoding.py +++ b/paddlenlp/ops/fast_transformer/transformer/decoding.py @@ -2572,7 +2572,7 @@ def parse_function(func_name): memory_seq_lens, self._beam_size ) else: - enc_output_shape = paddle.shape(enc_output) + enc_output_shape = enc_output.shape batch_size = enc_output_shape[0] max_seq_len = enc_output_shape[1] enc_output = enc_output.unsqueeze([1]) @@ -2995,7 +2995,7 @@ def forward( temperature=1, ): if attention_mask is None: - batch_size = paddle.shape(input_ids)[0] + batch_size = input_ids.shape[0] attention_mask = paddle.tril( paddle.ones( [batch_size, mem_seq_len, mem_seq_len], dtype="float16" if self.use_fp16_decoding else "float32" @@ -3042,7 +3042,7 @@ def forward( use_fp16_decoding=self.use_fp16_decoding, ) - output_ids = output_ids[paddle.shape(input_ids)[-1] :, :] + output_ids = output_ids[input_ids.shape[-1] :, :] if forced_eos_token_id is not None: output_ids[:, -1] = forced_eos_token_id return output_ids @@ -3100,7 +3100,7 @@ def forward( temperature=1, ): if attention_mask is None: - batch_size = paddle.shape(input_ids)[0] + batch_size = input_ids.shape[0] attention_mask = paddle.tril( 
paddle.ones( [batch_size, paddle.max(mem_seq_len), paddle.max(mem_seq_len)], @@ -3147,7 +3147,7 @@ def forward( use_fp16_decoding=self.use_fp16_decoding, ) - output_ids = output_ids[paddle.shape(input_ids)[-1] :, :] + output_ids = output_ids[input_ids.shape[-1] :, :] if forced_eos_token_id is not None: output_ids[:, -1] = forced_eos_token_id return output_ids @@ -4117,7 +4117,7 @@ def forward( min_length=0, ): if attention_mask is None: - batch_size, input_length = paddle.shape(input_ids) + batch_size, input_length = input_ids.shape attention_mask = paddle.unsqueeze((input_ids != pad_token_id).astype("float32"), axis=[1]) causal_mask = paddle.tril(paddle.ones([batch_size, input_length, input_length], dtype="float32")) attention_mask = paddle.logical_and(attention_mask, causal_mask) @@ -4161,7 +4161,7 @@ def forward( use_fp16_decoding=self.use_fp16_decoding, ) - output_ids = output_ids[paddle.shape(input_ids)[-1] :, :] + output_ids = output_ids[input_ids.shape[-1] :, :] if forced_eos_token_id is not None: output_ids[:, -1] = forced_eos_token_id return output_ids diff --git a/paddlenlp/ops/fast_transformer/transformer/fast_transformer.py b/paddlenlp/ops/fast_transformer/transformer/fast_transformer.py index af0ecd3f3101..b7b87c47a4c2 100644 --- a/paddlenlp/ops/fast_transformer/transformer/fast_transformer.py +++ b/paddlenlp/ops/fast_transformer/transformer/fast_transformer.py @@ -235,7 +235,7 @@ def __init__( ) def forward(self, src_word, trg_word=None): - src_max_len = paddle.shape(src_word)[-1] + src_max_len = src_word.shape[-1] src_slf_attn_bias = ( paddle.cast(src_word == self.pad_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e9 ) @@ -1619,7 +1619,7 @@ def forward( encoder_output = self.prepare_encoder_decoder_kwargs_for_generation(input_ids, model_kwargs)[ "encoder_output" ] - batch_size = paddle.shape(encoder_output)[0] + batch_size = encoder_output.shape[0] if seq_len is None: assert input_ids is not None, "You have to specify either input_ids when generating seq_len." 
seq_len = paddle.sum(paddle.cast(input_ids != self.pad_token_id, dtype="int32"), axis=-1, dtype="int32") @@ -1649,7 +1649,7 @@ def forward( else: forced_bos_token_id = paddle.zeros([0]) elif decode_strategy == "sampling": - num_samples = paddle.shape(encoder_output)[0] + num_samples = encoder_output.shape[0] forced_bos_token_id = paddle.expand(forced_bos_token_id, shape=[num_samples, 1]) return self.decoding( diff --git a/paddlenlp/prompt/prompt_utils.py b/paddlenlp/prompt/prompt_utils.py index d230fbf1ab41..f446154aa79e 100644 --- a/paddlenlp/prompt/prompt_utils.py +++ b/paddlenlp/prompt/prompt_utils.py @@ -198,9 +198,7 @@ def masked_lm_forward_with_past_key_values( masked_lm_loss = None if labels is not None: loss_fct = paddle.nn.CrossEntropyLoss() - masked_lm_loss = loss_fct( - prediction_scores.reshape((-1, paddle.shape(prediction_scores)[-1])), labels.reshape((-1,)) - ) + masked_lm_loss = loss_fct(prediction_scores.reshape((-1, prediction_scores.shape[-1])), labels.reshape((-1,))) return MaskedLMOutput( loss=masked_lm_loss, diff --git a/paddlenlp/trainer/trainer_compress.py b/paddlenlp/trainer/trainer_compress.py index 27629e2778e7..f2f945cd128f 100644 --- a/paddlenlp/trainer/trainer_compress.py +++ b/paddlenlp/trainer/trainer_compress.py @@ -871,9 +871,9 @@ def auto_model_dynabert_forward( if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time.") elif input_ids is not None: - input_shape = paddle.shape(input_ids) + input_shape = input_ids.shape elif inputs_embeds is not None: - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") diff --git a/paddlenlp/transformers/albert/modeling.py b/paddlenlp/transformers/albert/modeling.py index 362c9a5527c1..465bb0738b66 100644 --- a/paddlenlp/transformers/albert/modeling.py +++ b/paddlenlp/transformers/albert/modeling.py @@ -1408,7 +1408,7 @@ def forward( if start_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = paddle.shape(start_logits)[1] + ignored_index = start_logits.shape[1] start_positions = start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) diff --git a/paddlenlp/transformers/bart/modeling.py b/paddlenlp/transformers/bart/modeling.py index e9cfcd08c33f..7b62163d6f00 100644 --- a/paddlenlp/transformers/bart/modeling.py +++ b/paddlenlp/transformers/bart/modeling.py @@ -453,10 +453,10 @@ def forward( if input_ids is None and inputs_embeds is None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: - inputs_shape = paddle.shape(input_ids) + inputs_shape = input_ids.shape input_ids = input_ids.reshape((-1, inputs_shape[-1])) elif inputs_embeds is not None: - inputs_shape = paddle.shape(inputs_embeds)[:-1] + inputs_shape = inputs_embeds.shape[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") @@ -566,10 +566,10 @@ def forward( if decoder_input_ids is not None and decoder_inputs_embeds is not None: raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") elif decoder_input_ids is not None: - inputs_shape = paddle.shape(decoder_input_ids) + inputs_shape = decoder_input_ids.shape decoder_input_ids = decoder_input_ids.reshape((-1, 
inputs_shape[-1])) elif decoder_inputs_embeds is not None: - inputs_shape = paddle.shape(decoder_inputs_embeds)[:-1] + inputs_shape = decoder_inputs_embeds.shape[:-1] else: raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") @@ -582,7 +582,7 @@ def forward( if decoder_inputs_embeds is None: decoder_inputs_embeds = self.embed_tokens(decoder_input_ids) * self.embed_scale - past_key_values_length = paddle.shape(cache[0][0].k)[2] if cache is not None else 0 + past_key_values_length = cache[0][0].k.shape[2] if cache is not None else 0 decoder_inputs_embed_pos = self.decoder_embed_positions(inputs_shape, past_key_values_length) hidden_states = decoder_inputs_embeds + decoder_inputs_embed_pos hidden_states = self.decoder_layernorm_embedding(hidden_states) @@ -976,7 +976,7 @@ def forward( return_dict=return_dict, ) output = outputs[0] - output_shape = paddle.shape(output) + output_shape = output.shape if input_ids is not None: eos_mask = paddle.cast(input_ids == self.bart.config["eos_token_id"], dtype="int64") @@ -1168,7 +1168,7 @@ def forward( if start_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = paddle.shape(start_logits)[1] + ignored_index = start_logits.shape[1] start_positions = start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) diff --git a/paddlenlp/transformers/bert/modeling.py b/paddlenlp/transformers/bert/modeling.py index ed980805f904..03095def4cd3 100644 --- a/paddlenlp/transformers/bert/modeling.py +++ b/paddlenlp/transformers/bert/modeling.py @@ -599,7 +599,7 @@ def forward( if start_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = paddle.shape(start_logits)[1] + ignored_index = start_logits.shape[1] start_positions = start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) diff --git a/paddlenlp/transformers/bigbird/modeling.py b/paddlenlp/transformers/bigbird/modeling.py index 771367d333e2..0effe9d18137 100644 --- a/paddlenlp/transformers/bigbird/modeling.py +++ b/paddlenlp/transformers/bigbird/modeling.py @@ -238,10 +238,10 @@ def forward( inputs_embeds: Optional[Tensor] = None, ): if input_ids is not None: - input_shape = paddle.shape(input_ids) + input_shape = input_ids.shape inputs_embeds = self.word_embeddings(input_ids) else: - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] if position_ids is None: ones = paddle.ones(input_shape, dtype="int64") @@ -382,7 +382,7 @@ def _process_mask(self, input_ids, inputs_embeds, attention_mask=None): if input_ids is not None: attention_mask = (input_ids == self.pad_token_id).astype(self.pooler.dense.weight.dtype) else: - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] attention_mask = paddle.zeros(input_shape, dtype=self.pooler.dense.weight.dtype) # [B, 1, T, 1] @@ -1219,7 +1219,7 @@ def forward( if start_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = paddle.shape(start_logits)[1] + ignored_index = start_logits.shape[1] start_positions = start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) diff --git a/paddlenlp/transformers/blenderbot/modeling.py 
b/paddlenlp/transformers/blenderbot/modeling.py index 8a15bc75c173..fb1fcfcd78e0 100644 --- a/paddlenlp/transformers/blenderbot/modeling.py +++ b/paddlenlp/transformers/blenderbot/modeling.py @@ -339,7 +339,7 @@ def forward( if decoder_input_ids is None: raise ValueError("Decoder_input_ids cannot be None.") if decoder_attention_mask is None: - decoder_length = paddle.shape(decoder_input_ids)[-1] + decoder_length = decoder_input_ids.shape[-1] decoder_attention_mask = paddle.tensor.triu( (paddle.full((decoder_length, decoder_length), -np.inf, dtype=paddle.get_default_dtype())), 1 ) diff --git a/paddlenlp/transformers/blenderbot_small/modeling.py b/paddlenlp/transformers/blenderbot_small/modeling.py index feedfe5f6c67..74fe6b764426 100644 --- a/paddlenlp/transformers/blenderbot_small/modeling.py +++ b/paddlenlp/transformers/blenderbot_small/modeling.py @@ -341,7 +341,7 @@ def forward( if decoder_input_ids is None: raise ValueError("Decoder_input_ids cannot be None.") if decoder_attention_mask is None: - decoder_length = paddle.shape(decoder_input_ids)[-1] + decoder_length = decoder_input_ids.shape[-1] decoder_attention_mask = paddle.tensor.triu( (paddle.full((decoder_length, decoder_length), -np.inf, dtype=paddle.get_default_dtype())), 1 ) diff --git a/paddlenlp/transformers/bloom/modeling.py b/paddlenlp/transformers/bloom/modeling.py index 25f54f84d8dc..f18b88f406e0 100755 --- a/paddlenlp/transformers/bloom/modeling.py +++ b/paddlenlp/transformers/bloom/modeling.py @@ -1546,7 +1546,7 @@ def get_logits_processor( def expand_inputs_for_generation(self, input_ids, expand_size, attention_mask=None, **model_kwargs): - index = paddle.tile(paddle.arange(paddle.shape(input_ids)[0]).unsqueeze(-1), [1, expand_size]).reshape([-1]) + index = paddle.tile(paddle.arange(input_ids.shape[0]).unsqueeze(-1), [1, expand_size]).reshape([-1]) input_ids = paddle.gather(input_ids, index) @@ -1654,12 +1654,12 @@ def TopPProcess(probs, top_p, min_tokens_to_keep): probs = paddle.where(condition, paddle.full_like(probs, 0.0), probs) return probs - batch_size, cur_len = paddle.shape(input_ids) + batch_size, cur_len = input_ids.shape # used for compute on gpu, avoid memcpy D2H cur_len_gpu = paddle.full([1], cur_len) - origin_len = paddle.shape(input_ids)[1] + origin_len = input_ids.shape[1] # used for compute on gpu, avoid memcpy D2H origin_len_gpu = paddle.full([1], origin_len) @@ -1721,7 +1721,7 @@ def _post_process_(outputs, input_ids, cur_len, origin_len, scores, unfinished_f raise ImportError( "please install ppfleetx_ops by 'cd ppfleetx/ops && python setup_cuda.py install'!" ) - top_ps_tensor = paddle.full(shape=[paddle.shape(probs)[0]], fill_value=top_p, dtype=probs.dtype) + top_ps_tensor = paddle.full(shape=[probs.shape[0]], fill_value=top_p, dtype=probs.dtype) next_tokens = topp_sampling(probs, top_ps_tensor) else: probs = TopPProcess(probs, top_p, min_tokens_to_keep) @@ -1766,7 +1766,7 @@ def _post_process_(outputs, input_ids, cur_len, origin_len, scores, unfinished_f attn_mask = model_kwargs["attention_mask"] # make the shape of attention_mask = (-1, -1, -1, -1) in dy2static. 
- model_kwargs["attention_mask"] = paddle.reshape(attn_mask, paddle.shape(attn_mask)) + model_kwargs["attention_mask"] = paddle.reshape(attn_mask, attn_mask.shape) model_kwargs["cache"] = outputs[1] if isinstance(outputs, tuple) else None max_length = paddle.to_tensor(max_length) while cur_len < max_length: @@ -1855,7 +1855,7 @@ def forward(self, input_ids=None, **model_kwargs): if model_kwargs.get("position_ids", None) is None: model_kwargs["position_ids"] = paddle.arange( - 0, paddle.shape(model_kwargs["attention_mask"])[-1], dtype=input_ids.dtype + 0, model_kwargs["attention_mask"].shape[-1], dtype=input_ids.dtype ).unsqueeze(0) self.is_encoder_decoder = False diff --git a/paddlenlp/transformers/convbert/modeling.py b/paddlenlp/transformers/convbert/modeling.py index d5ec8e843c2a..fa64a09ae2b9 100644 --- a/paddlenlp/transformers/convbert/modeling.py +++ b/paddlenlp/transformers/convbert/modeling.py @@ -172,8 +172,8 @@ def forward(self, query, key=None, value=None, attn_mask=None, cache=None): v = self.v_proj(value) if self.conv_type == "sdconv": - bs = paddle.shape(q)[0] - seqlen = paddle.shape(q)[1] + bs = q.shape[0] + seqlen = q.shape[1] mixed_key_conv_attn_layer = self.key_conv_attn_layer(query) conv_attn_layer = mixed_key_conv_attn_layer * q @@ -290,7 +290,7 @@ def forward( if input_ids is not None: inputs_embeds = self.word_embeddings(input_ids) - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] ones = paddle.ones(input_shape, dtype="int64") seq_length = paddle.cumsum(ones, axis=1) @@ -1518,7 +1518,7 @@ def forward( if start_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = paddle.shape(start_logits)[1] + ignored_index = start_logits.shape[1] start_positions = start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) diff --git a/paddlenlp/transformers/dallebart/modeling.py b/paddlenlp/transformers/dallebart/modeling.py index 1e5d50009363..06ced887439e 100644 --- a/paddlenlp/transformers/dallebart/modeling.py +++ b/paddlenlp/transformers/dallebart/modeling.py @@ -400,7 +400,7 @@ def forward( Its data type should be float32 and has a shape of [batch_size, sequence_length, hidden_size]. 
""" if decoder_attention_mask is None: - decoder_length = paddle.shape(decoder_input_ids)[-1] + decoder_length = decoder_input_ids.shape[-1] decoder_attention_mask = paddle.triu( ( paddle.full( @@ -412,8 +412,8 @@ def forward( 1, ) decoder_inputs_embeds = self.embed_tokens(decoder_input_ids) - past_key_values_length = paddle.shape(cache[0][0].k)[2] if cache is not None else 0 - decoder_inputs_embed_pos = self.embed_positions(paddle.shape(decoder_input_ids), past_key_values_length) + past_key_values_length = cache[0][0].k.shape[2] if cache is not None else 0 + decoder_inputs_embed_pos = self.embed_positions(decoder_input_ids.shape, past_key_values_length) hidden_states = decoder_inputs_embeds + decoder_inputs_embed_pos hidden_states = self.layernorm_embedding(hidden_states) hidden_states = self.dropout(hidden_states) diff --git a/paddlenlp/transformers/deberta/modeling.py b/paddlenlp/transformers/deberta/modeling.py index 26a8e6d7789f..806e77d38cdb 100644 --- a/paddlenlp/transformers/deberta/modeling.py +++ b/paddlenlp/transformers/deberta/modeling.py @@ -1184,7 +1184,7 @@ def forward( if start_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = paddle.shape(start_logits)[1] + ignored_index = start_logits.shape[1] start_positions = start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) @@ -1331,9 +1331,9 @@ def forward( """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None: - num_choices = paddle.shape(input_ids)[1] + num_choices = input_ids.shape[1] elif inputs_embeds is not None: - num_choices = paddle.shape(inputs_embeds)[1] + num_choices = inputs_embeds.shape[1] input_ids = input_ids.reshape((-1, input_ids.shape[-1])) if input_ids is not None else None inputs_embeds = ( diff --git a/paddlenlp/transformers/deberta_v2/modeling.py b/paddlenlp/transformers/deberta_v2/modeling.py index 7f0aa4679e26..0779780feaf7 100644 --- a/paddlenlp/transformers/deberta_v2/modeling.py +++ b/paddlenlp/transformers/deberta_v2/modeling.py @@ -1288,7 +1288,7 @@ def forward( if start_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = paddle.shape(start_logits)[1] + ignored_index = start_logits.shape[1] start_positions = start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) @@ -1435,9 +1435,9 @@ def forward( """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None: - num_choices = paddle.shape(input_ids)[1] + num_choices = input_ids.shape[1] elif inputs_embeds is not None: - num_choices = paddle.shape(inputs_embeds)[1] + num_choices = inputs_embeds.shape[1] input_ids = input_ids.reshape((-1, input_ids.shape[-1])) if input_ids is not None else None inputs_embeds = ( diff --git a/paddlenlp/transformers/electra/modeling.py b/paddlenlp/transformers/electra/modeling.py index b3b0b67c1a3d..722795bf6dac 100644 --- a/paddlenlp/transformers/electra/modeling.py +++ b/paddlenlp/transformers/electra/modeling.py @@ -1783,7 +1783,7 @@ def forward( if start_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = paddle.shape(start_logits)[1] + ignored_index = start_logits.shape[1] start_positions = 
start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) diff --git a/paddlenlp/transformers/ernie/modeling.py b/paddlenlp/transformers/ernie/modeling.py index 754e220a883b..0833a5a8b387 100644 --- a/paddlenlp/transformers/ernie/modeling.py +++ b/paddlenlp/transformers/ernie/modeling.py @@ -97,7 +97,7 @@ def forward( if input_ids is not None: inputs_embeds = self.word_embeddings(input_ids) - input_shape = inputs_embeds.shape[:-1] if in_declarative_mode() else paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] if in_declarative_mode() else inputs_embeds.shape[:-1] if position_ids is None: # maybe need use shape op to unify static graph and dynamic graph @@ -611,7 +611,7 @@ def forward( if start_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = paddle.shape(start_logits)[1] + ignored_index = start_logits.shape[1] start_positions = start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) @@ -911,7 +911,7 @@ def forward( if labels is not None and next_sentence_label is not None: loss_fct = paddle.nn.CrossEntropyLoss() masked_lm_loss = loss_fct( - prediction_scores.reshape((-1, paddle.shape(prediction_scores)[-1])), labels.reshape((-1,)) + prediction_scores.reshape((-1, prediction_scores.shape[-1])), labels.reshape((-1,)) ) next_sentence_loss = loss_fct( seq_relationship_score.reshape((-1, 2)), next_sentence_label.reshape((-1,)) @@ -1088,7 +1088,7 @@ def forward( if labels is not None: loss_fct = paddle.nn.CrossEntropyLoss() # -100 index = padding token masked_lm_loss = loss_fct( - prediction_scores.reshape((-1, paddle.shape(prediction_scores)[-1])), labels.reshape((-1,)) + prediction_scores.reshape((-1, prediction_scores.shape[-1])), labels.reshape((-1,)) ) if not return_dict: output = (prediction_scores,) + outputs[2:] diff --git a/paddlenlp/transformers/ernie_code/modeling.py b/paddlenlp/transformers/ernie_code/modeling.py index 0649966c64f3..d83e1423b3be 100644 --- a/paddlenlp/transformers/ernie_code/modeling.py +++ b/paddlenlp/transformers/ernie_code/modeling.py @@ -286,15 +286,15 @@ def forward( # Input is (batch_size, seq_length, dim) # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) # cache[0] is (batch_size, n_heads, q_len - 1, dim_per_head) - batch_size, seq_length = paddle.shape(hidden_states)[:2] + batch_size, seq_length = hidden_states.shape[:2] real_seq_length = seq_length if cache is not None: assert len(cache) == 2, f"cache should have 2 past states: keys and values. 
Got { len(cache)} past states" - real_seq_length += paddle.shape(cache[0])[2] if query_length is None else query_length + real_seq_length += cache[0].shape[2] if query_length is None else query_length - key_length = real_seq_length if key_value_states is None else paddle.shape(key_value_states)[1] + key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] def shape(states): """projection""" @@ -361,7 +361,7 @@ def project(hidden_states, proj_layer, key_value_states, cache): # if key and values are already calculated # we want only the last query position bias if cache is not None: - position_bias = position_bias[:, :, -paddle.shape(hidden_states)[1] :, :] + position_bias = position_bias[:, :, -hidden_states.shape[1] :, :] if mask is not None: position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) @@ -514,7 +514,7 @@ def forward( # the actual query length is unknown for cross attention # if using past key value states. Need to inject it here if present_key_value_state is not None: - query_length = paddle.shape(present_key_value_state[0])[2] + query_length = present_key_value_state[0].shape[2] else: query_length = None @@ -875,10 +875,10 @@ def forward( f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" ) elif input_ids is not None: - input_shape = paddle.shape(input_ids) + input_shape = input_ids.shape # input_ids = input_ids.reshape(shape=[-1, input_shape[-1]]) elif inputs_embeds is not None: - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] else: err_msg_prefix = "decoder_" if self.is_decoder else "" raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds") @@ -890,7 +890,7 @@ def forward( batch_size, seq_length = input_shape # required mask seq length can be calculated via length of past - mask_seq_length = paddle.shape(cache[0][0])[2] + seq_length if cache is not None else seq_length + mask_seq_length = cache[0][0].shape[2] + seq_length if cache is not None else seq_length if use_cache is True: assert self.is_decoder, f"`use_cache` can only be set to `True` if {self.__class__} is used as a decoder" @@ -898,7 +898,7 @@ def forward( if attention_mask is None: attention_mask = paddle.ones(shape=[batch_size, mask_seq_length]) if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None: - encoder_seq_length = paddle.shape(encoder_hidden_states)[1] + encoder_seq_length = encoder_hidden_states.shape[1] encoder_attention_mask = paddle.ones([batch_size, encoder_seq_length], dtype=paddle.int64) # initialize caches with `None` if past does not exist @@ -912,7 +912,7 @@ def forward( # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] if self.is_decoder and encoder_hidden_states is not None: - encoder_batch_size, encoder_sequence_length, _ = paddle.shape(encoder_hidden_states) + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.shape encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) if encoder_attention_mask is None: encoder_attention_mask = paddle.ones(shape=encoder_hidden_shape) diff --git a/paddlenlp/transformers/ernie_ctm/modeling.py b/paddlenlp/transformers/ernie_ctm/modeling.py index b7db01b3f662..c3449ddc175f 100644 --- a/paddlenlp/transformers/ernie_ctm/modeling.py +++ b/paddlenlp/transformers/ernie_ctm/modeling.py @@ -100,7 
+100,7 @@ def __init__(self, config: ErnieCtmConfig): def forward(self, input_ids, token_type_ids=None, position_ids=None, inputs_embeds=None): if position_ids is None: - content_len = paddle.shape(input_ids)[1] - self.cls_num + content_len = input_ids.shape[1] - self.cls_num position_ids = paddle.concat( [ paddle.zeros(shape=[self.cls_num], dtype="int64"), diff --git a/paddlenlp/transformers/ernie_gen/modeling.py b/paddlenlp/transformers/ernie_gen/modeling.py index c0ac93636435..3a0a2f5fa3f4 100644 --- a/paddlenlp/transformers/ernie_gen/modeling.py +++ b/paddlenlp/transformers/ernie_gen/modeling.py @@ -493,7 +493,7 @@ def forward( assert ( attn_bias is not None if past_cache else True ), "if `past_cache` is specified; attn_bias should not be None" - d_seqlen = paddle.shape(src_ids)[1] + d_seqlen = src_ids.shape[1] if pos_ids is None: pos_ids = paddle.arange(0, d_seqlen, 1, dtype="int32").reshape([1, -1]).cast("int64") if attn_bias is None: diff --git a/paddlenlp/transformers/ernie_gram/modeling.py b/paddlenlp/transformers/ernie_gram/modeling.py index b4aef71dac04..438ee1b95c92 100644 --- a/paddlenlp/transformers/ernie_gram/modeling.py +++ b/paddlenlp/transformers/ernie_gram/modeling.py @@ -70,7 +70,7 @@ def forward( if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] if position_ids is None: # maybe need use shape op to unify static graph and dynamic graph @@ -556,7 +556,7 @@ def forward( if start_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = paddle.shape(start_logits)[1] + ignored_index = start_logits.shape[1] start_positions = start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) diff --git a/paddlenlp/transformers/ernie_layout/modeling.py b/paddlenlp/transformers/ernie_layout/modeling.py index bef6be324175..fde6f36bce2a 100644 --- a/paddlenlp/transformers/ernie_layout/modeling.py +++ b/paddlenlp/transformers/ernie_layout/modeling.py @@ -230,7 +230,7 @@ def __init__(self, config): self.dropout = nn.Dropout(config["attention_probs_dropout_prob"]) def transpose_for_scores(self, x): - x = x.reshape([paddle.shape(x)[0], paddle.shape(x)[1], self.num_attention_heads, self.attention_head_size]) + x = x.reshape([x.shape[0], x.shape[1], self.num_attention_heads, self.attention_head_size]) return x.transpose([0, 2, 1, 3]) def compute_qkv(self, hidden_states): @@ -268,7 +268,7 @@ def forward( attention_scores += rel_2d_pos bool_attention_mask = attention_mask.astype(paddle.bool) bool_attention_mask.stop_gradient = True - attention_scores_shape = paddle.shape(attention_scores) + attention_scores_shape = attention_scores.shape attention_scores = paddle.where( bool_attention_mask.expand(attention_scores_shape), paddle.ones(attention_scores_shape) * float("-1e10"), @@ -280,9 +280,7 @@ def forward( attention_probs = self.dropout(attention_probs) context_layer = paddle.matmul(attention_probs, value_layer) context_layer = context_layer.transpose([0, 2, 1, 3]) - context_layer = context_layer.reshape( - [paddle.shape(context_layer)[0], paddle.shape(context_layer)[1], self.all_head_size] - ) + context_layer = context_layer.reshape([context_layer.shape[0], context_layer.shape[1], self.all_head_size]) if output_attentions: outputs = [context_layer, attention_probs] @@ -689,7 +687,7 @@ def _calc_visual_bbox(self, image_feature_pool_shape, 
bbox, visual_shape): visual_bbox_y[1:].expand(expand_shape[::-1]).transpose([1, 0]), ], axis=-1, - ).reshape([expand_shape[0] * expand_shape[1], paddle.shape(bbox)[-1]]) + ).reshape([expand_shape[0] * expand_shape[1], bbox.shape[-1]]) visual_bbox = visual_bbox.expand([visual_shape[0], visual_bbox.shape[0], visual_bbox.shape[1]]) return visual_bbox @@ -737,7 +735,7 @@ def forward( output_hidden_states=False, output_attentions=False, ): - input_shape = paddle.shape(input_ids) + input_shape = input_ids.shape visual_shape = list(input_shape) visual_shape[1] = self.config["image_feature_pool_shape"][0] * self.config["image_feature_pool_shape"][1] visual_bbox = self._calc_visual_bbox(self.config["image_feature_pool_shape"], bbox, visual_shape) @@ -844,7 +842,7 @@ def forward( head_mask=None, labels=None, ): - input_shape = paddle.shape(input_ids) + input_shape = input_ids.shape visual_shape = list(input_shape) visual_shape[1] = ( self.ernie_layout.config["image_feature_pool_shape"][0] @@ -1040,7 +1038,7 @@ def forward( position_ids=position_ids, head_mask=head_mask, ) - seq_length = paddle.shape(input_ids)[1] + seq_length = input_ids.shape[1] sequence_output = outputs[0][:, :seq_length] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) @@ -1117,7 +1115,7 @@ def forward( position_ids=position_ids, head_mask=head_mask, ) - seq_length = paddle.shape(input_ids)[1] + seq_length = input_ids.shape[1] sequence_output = outputs[0][:, :seq_length] sequence_output = self.dropout(sequence_output) @@ -1174,7 +1172,7 @@ def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_m bbox=bbox, image=image, ) - seq_length = paddle.shape(input_ids)[1] + seq_length = input_ids.shape[1] sequence_output = sequence_output[:, :seq_length] start_logits = self.linear_start(sequence_output) start_logits = paddle.squeeze(start_logits, -1) diff --git a/paddlenlp/transformers/ernie_m/modeling.py b/paddlenlp/transformers/ernie_m/modeling.py index 9b7e89de9284..aead16f86cc5 100644 --- a/paddlenlp/transformers/ernie_m/modeling.py +++ b/paddlenlp/transformers/ernie_m/modeling.py @@ -71,7 +71,7 @@ def forward( inputs_embeds = self.word_embeddings(input_ids) if position_ids is None: - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] # maybe need use shape op to unify static graph and dynamic graph ones = paddle.ones(input_shape, dtype="int64") seq_length = paddle.cumsum(ones, axis=1) @@ -556,7 +556,7 @@ def forward( if start_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = paddle.shape(start_logits)[1] + ignored_index = start_logits.shape[1] start_positions = start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) diff --git a/paddlenlp/transformers/gau_alpha/modeling.py b/paddlenlp/transformers/gau_alpha/modeling.py index cb9d02ac755b..4a4ab981f0d2 100644 --- a/paddlenlp/transformers/gau_alpha/modeling.py +++ b/paddlenlp/transformers/gau_alpha/modeling.py @@ -58,7 +58,7 @@ def attention_normalize(a, mask=None, axis=-1, method="softmax"): if mask is not None: l = mask.sum(-1, keepdim=True) else: - l = paddle.ones_like(a) * paddle.shape(a)[-2] + l = paddle.ones_like(a) * a.shape[-2] if method == "squared_relu": return F.relu(a) ** 2 / l elif method == "softmax_plus": @@ -173,7 +173,7 @@ def initializer(tensor, num_hidden_layers=12, order=2, gain=1.0): """ 
https://github.com/bojone/bert4keras/blob/5572ed481a14f5a62be7107e3846c88a5d6b617d/bert4keras/models.py#L1226-L1235 """ - shape = paddle.shape(tensor) + shape = tensor.shape if shape[0] > 10000 or shape[0] < 10: hidden_size = shape[1] else: @@ -201,7 +201,7 @@ def __init__(self, config: GAUAlphaConfig): def forward(self, x, offset=0): # x shape [batch_size, seqlen, dim] - seqlen = paddle.shape(x)[-2] + seqlen = x.shape[-2] sin, cos = ( self.sin[offset : offset + seqlen, :], self.cos[offset : offset + seqlen, :], @@ -706,13 +706,13 @@ def forward(self, input_ids, token_type_ids=None, attention_mask=None): """ # input_ids: [bs, num_choice, seq_l] - input_ids = input_ids.reshape(shape=(-1, paddle.shape(input_ids)[-1])) # flat_input_ids: [bs*num_choice,seq_l] + input_ids = input_ids.reshape(shape=(-1, input_ids.shape[-1])) # flat_input_ids: [bs*num_choice,seq_l] if token_type_ids is not None: - token_type_ids = token_type_ids.reshape(shape=(-1, paddle.shape(token_type_ids)[-1])) + token_type_ids = token_type_ids.reshape(shape=(-1, token_type_ids.shape[-1])) if attention_mask is not None: - attention_mask = attention_mask.reshape(shape=(-1, paddle.shape(attention_mask)[-1])) + attention_mask = attention_mask.reshape(shape=(-1, attention_mask.shape[-1])) sequence_output = self.gau_alpha(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask) diff --git a/paddlenlp/transformers/gpt/modeling.py b/paddlenlp/transformers/gpt/modeling.py index 50cfc892d336..1bca9dcbfbc0 100644 --- a/paddlenlp/transformers/gpt/modeling.py +++ b/paddlenlp/transformers/gpt/modeling.py @@ -733,10 +733,10 @@ def __init__( def forward(self, input_ids, position_ids=None, inputs_embeddings=None): if input_ids is not None: - input_shape = paddle.shape(input_ids) + input_shape = input_ids.shape inputs_embeddings = self.word_embeddings(input_ids) else: - input_shape = paddle.shape(inputs_embeddings)[:-1] + input_shape = inputs_embeddings.shape[:-1] if position_ids is None: ones = paddle.ones(input_shape, dtype="int64") @@ -1167,10 +1167,10 @@ def forward( if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: - input_shape = paddle.shape(input_ids) + input_shape = input_ids.shape input_ids = input_ids.reshape((-1, input_shape[-1])) elif inputs_embeds is not None: - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") # input_shape => bs, seq_len @@ -1182,7 +1182,7 @@ def forward( past_length = 0 if past_key_values[0] is not None: # bs, seq_len, num_head, head_dim - past_length = paddle.shape(past_key_values[0][0])[1] + past_length = past_key_values[0][0].shape[1] position_ids = paddle.arange(past_length, input_shape[-1] + past_length, dtype="int64") position_ids = position_ids.unsqueeze(0) position_ids = paddle.expand(position_ids, input_shape) @@ -1193,7 +1193,7 @@ def forward( # TODO, use registered buffer length = input_shape[-1] if past_key_values[0] is not None: - cache_length = paddle.shape(past_key_values[0][0])[1] + cache_length = past_key_values[0][0].shape[1] length = length + cache_length else: cache_length = 0 @@ -1800,16 +1800,14 @@ def forward( if input_ids is not None: sequence_lengths = (input_ids != eos_token_id).astype("int64").sum(axis=-1) - 1 else: - inputs_shape = paddle.shape(inputs_embeds)[:-1] + inputs_shape = inputs_embeds.shape[:-1] sequence_lengths = 
paddle.ones(inputs_shape[:-1], dtype="int64") * (inputs_shape[1] - 1) logger.warning( f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be " "unexpected if using padding tokens in conjunction with `inputs_embeds.`" ) - pooled_logits = logits.gather_nd( - paddle.stack([paddle.arange(paddle.shape(logits)[0]), sequence_lengths], axis=-1) - ) + pooled_logits = logits.gather_nd(paddle.stack([paddle.arange(logits.shape[0]), sequence_lengths], axis=-1)) loss = None diff --git a/paddlenlp/transformers/gpt/modeling_auto.py b/paddlenlp/transformers/gpt/modeling_auto.py index 255763be395f..356fcb4b8442 100644 --- a/paddlenlp/transformers/gpt/modeling_auto.py +++ b/paddlenlp/transformers/gpt/modeling_auto.py @@ -626,14 +626,14 @@ def forward(self, input_ids, position_ids=None, inputs_embeddings=None): raise ValueError("You cannot specify both `inputs_embeddings` and `position_ids`)") # if input_ids is not None: - # input_shape = paddle.shape(input_ids) + # input_shape = input_ids.shape # inputs_embeddings = self.word_embeddings(input_ids) if input_ids is not None: input_shape = input_ids.shape inputs_embeddings = self.word_embeddings(input_ids) else: - input_shape = paddle.shape(inputs_embeddings)[:-1] + input_shape = inputs_embeddings.shape[:-1] if position_ids is None: ones = paddle.ones(input_shape, dtype="int64") @@ -1021,7 +1021,7 @@ def forward( input_shape = input_ids.shape input_ids = input_ids.reshape((-1, input_shape[-1])) elif inputs_embeds is not None: - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") # input_shape => bs, seq_len @@ -1033,7 +1033,7 @@ def forward( past_length = 0 if past_key_values[0] is not None: # bs, seq_len, num_head, head_dim - past_length = paddle.shape(past_key_values[0][0])[1] + past_length = past_key_values[0][0].shape[1] position_ids = paddle.arange(past_length, input_shape[-1] + past_length, dtype="int64") position_ids = position_ids.unsqueeze(0) position_ids = paddle.expand(position_ids, input_shape) @@ -1043,7 +1043,7 @@ def forward( # TODO, use registered buffer length = input_shape[-1] if past_key_values[0] is not None: - cache_length = paddle.shape(past_key_values[0][0])[1] + cache_length = past_key_values[0][0].shape[1] length = length + cache_length else: cache_length = 0 diff --git a/paddlenlp/transformers/layoutlmv2/modeling.py b/paddlenlp/transformers/layoutlmv2/modeling.py index ce6df9f9a2f2..83212f9fe933 100644 --- a/paddlenlp/transformers/layoutlmv2/modeling.py +++ b/paddlenlp/transformers/layoutlmv2/modeling.py @@ -296,7 +296,7 @@ def forward( bool_attention_mask = attention_mask.astype(paddle.bool) bool_attention_mask.stop_gradient = True - attention_scores_shape = paddle.shape(attention_scores) + attention_scores_shape = attention_scores.shape attention_scores = paddle.where( bool_attention_mask.expand(attention_scores_shape), paddle.ones(attention_scores_shape) * float("-1e10"), @@ -711,7 +711,7 @@ def forward( output_hidden_states=False, output_attentions=False, ): - input_shape = paddle.shape(input_ids) + input_shape = input_ids.shape visual_shape = list(input_shape) visual_shape[1] = self.config.image_feature_pool_shape[0] * self.config.image_feature_pool_shape[1] @@ -745,7 +745,7 @@ def forward( visual_bbox_y[1:].expand(expand_shape[::-1]).transpose([1, 0]), ], axis=-1, - ).reshape([expand_shape[0] * expand_shape[1], paddle.shape(bbox)[-1]]) + ).reshape([expand_shape[0] * 
expand_shape[1], bbox.shape[-1]]) visual_bbox = visual_bbox.expand([input_shape[0], visual_bbox.shape[0], visual_bbox.shape[1]]) final_bbox = paddle.concat([bbox, visual_bbox], axis=1) diff --git a/paddlenlp/transformers/layoutxlm/modeling.py b/paddlenlp/transformers/layoutxlm/modeling.py index a0f464416594..67a9881eec42 100644 --- a/paddlenlp/transformers/layoutxlm/modeling.py +++ b/paddlenlp/transformers/layoutxlm/modeling.py @@ -319,7 +319,7 @@ def forward( attention_scores += rel_2d_pos bool_attention_mask = attention_mask.astype(paddle.bool) bool_attention_mask.stop_gradient = True - attention_scores_shape = paddle.shape(attention_scores) + attention_scores_shape = attention_scores.shape attention_scores = paddle.where( bool_attention_mask.expand(attention_scores_shape), paddle.ones(attention_scores_shape) * float("-1e10"), @@ -699,7 +699,7 @@ def _calc_visual_bbox(self, image_feature_pool_shape, bbox, visual_shape): visual_bbox_y[1:].expand(expand_shape[::-1]).transpose([1, 0]), ], axis=-1, - ).reshape([expand_shape[0] * expand_shape[1], paddle.shape(bbox)[-1]]) + ).reshape([expand_shape[0] * expand_shape[1], bbox.shape[-1]]) visual_bbox = visual_bbox.expand([visual_shape[0], visual_bbox.shape[0], visual_bbox.shape[1]]) return visual_bbox @@ -763,7 +763,7 @@ def forward( output_hidden_states=False, output_attentions=False, ): - input_shape = paddle.shape(input_ids) + input_shape = input_ids.shape visual_shape = list(input_shape) visual_shape[1] = self.config.image_feature_pool_shape[0] * self.config.image_feature_pool_shape[1] visual_bbox = self._calc_visual_bbox(self.config.image_feature_pool_shape, bbox, visual_shape) @@ -963,7 +963,7 @@ def forward( head_mask=None, labels=None, ): - input_shape = paddle.shape(input_ids) + input_shape = input_ids.shape visual_shape = list(input_shape) visual_shape[1] = ( self.layoutxlm.config.image_feature_pool_shape[0] * self.layoutxlm.config.image_feature_pool_shape[1] @@ -1146,7 +1146,7 @@ def __init__(self, hidden_size=768, hidden_dropout_prob=0.1): self.loss_fct = CrossEntropyLoss() def build_relation(self, relations, entities): - batch_size, max_seq_len = paddle.shape(entities)[:2] + batch_size, max_seq_len = entities.shape[:2] new_relations = paddle.full( shape=[batch_size, max_seq_len * max_seq_len, 3], fill_value=-1, dtype=relations.dtype ) @@ -1195,7 +1195,7 @@ def build_relation(self, relations, entities): relation_per_doc_label[: len(positive_relations)] = 1 relation_per_doc = paddle.concat([reordered_relations, relation_per_doc_label], axis=1) assert len(relation_per_doc[:, 0]) != 0 - new_relations[b, 0] = paddle.shape(relation_per_doc)[0].astype(new_relations.dtype) + new_relations[b, 0] = relation_per_doc.shape[0].astype(new_relations.dtype) new_relations[b, 1 : len(relation_per_doc) + 1] = relation_per_doc # new_relations.append(relation_per_doc) return new_relations, entities @@ -1219,7 +1219,7 @@ def get_predicted_relations(self, logits, relations, entities): return pred_relations def forward(self, hidden_states, entities, relations): - batch_size, max_length, _ = paddle.shape(entities) + batch_size, max_length, _ = entities.shape relations, entities = self.build_relation(relations, entities) loss = 0 all_pred_relations = paddle.full( @@ -1257,7 +1257,7 @@ def forward(self, hidden_states, entities, relations): pred_relations = self.get_predicted_relations(logits, relation, entities[b]) if len(pred_relations) > 0: pred_relations = paddle.stack(pred_relations) - all_pred_relations[b, 0, :, :] = 
paddle.shape(pred_relations)[0].astype(all_pred_relations.dtype) + all_pred_relations[b, 0, :, :] = pred_relations.shape[0].astype(all_pred_relations.dtype) all_pred_relations[b, 1 : len(pred_relations) + 1, :, :] = pred_relations return loss, all_pred_relations diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index d4da1b195a94..33479efb3a4b 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -1503,7 +1503,7 @@ def forward( seq_length_with_past = seq_length cache_length = 0 if past_key_values[0] is not None: - cache_length = paddle.shape(past_key_values[0][0])[1] + cache_length = past_key_values[0][0].shape[1] seq_length_with_past += cache_length if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) diff --git a/paddlenlp/transformers/llama/modeling_auto.py b/paddlenlp/transformers/llama/modeling_auto.py index 21635da46cca..e096dd3e70f6 100644 --- a/paddlenlp/transformers/llama/modeling_auto.py +++ b/paddlenlp/transformers/llama/modeling_auto.py @@ -934,7 +934,7 @@ def forward( seq_length_with_past = seq_length cache_length = 0 if past_key_values[0] is not None: - cache_length = paddle.shape(past_key_values[0][0])[1] + cache_length = past_key_values[0][0].shape[1] seq_length_with_past += cache_length if inputs_embeds is None: diff --git a/paddlenlp/transformers/llama/modeling_auto_static.py b/paddlenlp/transformers/llama/modeling_auto_static.py index 61bf3daa2529..d9af478b808c 100644 --- a/paddlenlp/transformers/llama/modeling_auto_static.py +++ b/paddlenlp/transformers/llama/modeling_auto_static.py @@ -870,7 +870,7 @@ def forward( seq_length_with_past = seq_length cache_length = 0 if past_key_values[0] is not None: - cache_length = paddle.shape(past_key_values[0][0])[1] + cache_length = past_key_values[0][0].shape[1] seq_length_with_past += cache_length if inputs_embeds is None: diff --git a/paddlenlp/transformers/mbart/modeling.py b/paddlenlp/transformers/mbart/modeling.py index d401554fde3d..1f94cfd9e570 100644 --- a/paddlenlp/transformers/mbart/modeling.py +++ b/paddlenlp/transformers/mbart/modeling.py @@ -60,7 +60,7 @@ def shift_tokens_right(input_ids, pad_token_id): """ shifted_input_ids = input_ids.clone() input_flat = paddle.flatten(shifted_input_ids) - batch_size, seq_length = paddle.shape(shifted_input_ids) + batch_size, seq_length = shifted_input_ids.shape index = paddle.arange(0, batch_size, 1, dtype="int32") * seq_length index_of_eos = paddle.cast(shifted_input_ids != pad_token_id, dtype="int32").sum(axis=-1) - 1 decoder_start_tokens = paddle.gather(input_flat, index + index_of_eos) @@ -194,9 +194,9 @@ def forward( if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: - input_shape = paddle.shape(input_ids) + input_shape = input_ids.shape elif inputs_embeds is not None: - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") @@ -312,10 +312,10 @@ def forward( if decoder_input_ids is not None and decoder_inputs_embeds is not None: raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") elif decoder_input_ids is not None: - decoder_input_shape = paddle.shape(decoder_input_ids) + decoder_input_shape = decoder_input_ids.shape decoder_input_ids = decoder_input_ids.reshape((-1, 
decoder_input_shape[-1])) elif decoder_inputs_embeds is not None: - decoder_input_shape = paddle.shape(decoder_inputs_embeds)[:-1] + decoder_input_shape = decoder_inputs_embeds.shape[:-1] else: raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") @@ -328,7 +328,7 @@ def forward( if decoder_inputs_embeds is None: decoder_inputs_embeds = self.embed_tokens(decoder_input_ids) * self.embed_scale - past_key_values_length = paddle.shape(cache[0][0].k)[2] if cache is not None else 0 + past_key_values_length = cache[0][0].k.shape[2] if cache is not None else 0 decoder_inputs_embed_pos = self.decoder_embed_positions(decoder_input_shape, past_key_values_length) hidden_states = decoder_inputs_embeds + decoder_inputs_embed_pos @@ -730,7 +730,7 @@ def forward( return_dict=return_dict, ) output = outputs[0] - output_shape = paddle.shape(output) + output_shape = output.shape if input_ids is not None: eos_mask = paddle.cast(input_ids == self.mbart.config.eos_token_id, dtype="int64") if len(paddle.unique(paddle.sum(eos_mask, axis=1))) > 1: @@ -918,7 +918,7 @@ def forward( if start_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = paddle.shape(start_logits)[1] + ignored_index = start_logits.shape[1] start_positions = start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) diff --git a/paddlenlp/transformers/minigpt4/modeling.py b/paddlenlp/transformers/minigpt4/modeling.py index df100125d432..c64d49b5ab25 100644 --- a/paddlenlp/transformers/minigpt4/modeling.py +++ b/paddlenlp/transformers/minigpt4/modeling.py @@ -203,7 +203,7 @@ def forward(self, pixel_values: paddle.Tensor) -> paddle.Tensor: batch_size = pixel_values.shape[0] target_dtype = self.patch_embedding.weight.dtype patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] - patch_embeds_shape = paddle.shape(patch_embeds) + patch_embeds_shape = patch_embeds.shape patch_embeds = paddle.reshape( patch_embeds, shape=[patch_embeds_shape[0], patch_embeds_shape[1], -1] ).transpose([0, 2, 1]) diff --git a/paddlenlp/transformers/mixtral/modeling.py b/paddlenlp/transformers/mixtral/modeling.py index 592f9a47847a..43db27261606 100644 --- a/paddlenlp/transformers/mixtral/modeling.py +++ b/paddlenlp/transformers/mixtral/modeling.py @@ -1183,7 +1183,7 @@ def forward( seq_length_with_past = seq_length cache_length = 0 if past_key_values[0] is not None: - cache_length = paddle.shape(past_key_values[0][0])[1] + cache_length = past_key_values[0][0].shape[1] seq_length_with_past += cache_length if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) diff --git a/paddlenlp/transformers/mobilebert/modeling.py b/paddlenlp/transformers/mobilebert/modeling.py index af7cb52ba5ae..f2c0a086172a 100644 --- a/paddlenlp/transformers/mobilebert/modeling.py +++ b/paddlenlp/transformers/mobilebert/modeling.py @@ -674,7 +674,7 @@ def forward( if labels is not None: loss_fct = paddle.nn.CrossEntropyLoss() total_loss = loss_fct( - prediction_scores.reshape((-1, paddle.shape(prediction_scores)[-1])), labels.reshape((-1,)) + prediction_scores.reshape((-1, prediction_scores.shape[-1])), labels.reshape((-1,)) ) if not return_dict: @@ -1173,7 +1173,7 @@ def forward( if start_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = 
paddle.shape(start_logits)[1] + ignored_index = start_logits.shape[1] start_positions = start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) diff --git a/paddlenlp/transformers/mt5/modeling.py b/paddlenlp/transformers/mt5/modeling.py index 2defa4717912..a07079746fc5 100644 --- a/paddlenlp/transformers/mt5/modeling.py +++ b/paddlenlp/transformers/mt5/modeling.py @@ -286,15 +286,15 @@ def forward( # Input is (batch_size, seq_length, dim) # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) # cache[0] is (batch_size, n_heads, q_len - 1, dim_per_head) - batch_size, seq_length = paddle.shape(hidden_states)[:2] + batch_size, seq_length = hidden_states.shape[:2] real_seq_length = seq_length if cache is not None: assert len(cache) == 2, f"cache should have 2 past states: keys and values. Got { len(cache)} past states" - real_seq_length += paddle.shape(cache[0])[2] if query_length is None else query_length + real_seq_length += cache[0].shape[2] if query_length is None else query_length - key_length = real_seq_length if key_value_states is None else paddle.shape(key_value_states)[1] + key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] def shape(states): """projection""" @@ -361,7 +361,7 @@ def project(hidden_states, proj_layer, key_value_states, cache): # if key and values are already calculated # we want only the last query position bias if cache is not None: - position_bias = position_bias[:, :, -paddle.shape(hidden_states)[1] :, :] + position_bias = position_bias[:, :, -hidden_states.shape[1] :, :] if mask is not None: position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) @@ -514,7 +514,7 @@ def forward( # the actual query length is unknown for cross attention # if using past key value states. 
Need to inject it here if present_key_value_state is not None: - query_length = paddle.shape(present_key_value_state[0])[2] + query_length = present_key_value_state[0].shape[2] else: query_length = None @@ -875,10 +875,10 @@ def forward( f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" ) elif input_ids is not None: - input_shape = paddle.shape(input_ids) + input_shape = input_ids.shape # input_ids = input_ids.reshape(shape=[-1, input_shape[-1]]) elif inputs_embeds is not None: - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] else: err_msg_prefix = "decoder_" if self.is_decoder else "" raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds") @@ -890,7 +890,7 @@ def forward( batch_size, seq_length = input_shape # required mask seq length can be calculated via length of past - mask_seq_length = paddle.shape(cache[0][0])[2] + seq_length if cache is not None else seq_length + mask_seq_length = cache[0][0].shape[2] + seq_length if cache is not None else seq_length if use_cache is True: assert self.is_decoder, f"`use_cache` can only be set to `True` if {self.__class__} is used as a decoder" @@ -898,7 +898,7 @@ def forward( if attention_mask is None: attention_mask = paddle.ones(shape=[batch_size, mask_seq_length]) if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None: - encoder_seq_length = paddle.shape(encoder_hidden_states)[1] + encoder_seq_length = encoder_hidden_states.shape[1] encoder_attention_mask = paddle.ones([batch_size, encoder_seq_length], dtype=paddle.int64) # initialize caches with `None` if past does not exist @@ -912,7 +912,7 @@ def forward( # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] if self.is_decoder and encoder_hidden_states is not None: - encoder_batch_size, encoder_sequence_length, _ = paddle.shape(encoder_hidden_states) + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.shape encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) if encoder_attention_mask is None: encoder_attention_mask = paddle.ones(shape=encoder_hidden_shape) diff --git a/paddlenlp/transformers/nezha/modeling.py b/paddlenlp/transformers/nezha/modeling.py index dcc3e98fd649..7e078f3d5748 100644 --- a/paddlenlp/transformers/nezha/modeling.py +++ b/paddlenlp/transformers/nezha/modeling.py @@ -234,7 +234,7 @@ def forward( if input_ids is not None: inputs_embeds = self.word_embeddings(input_ids) - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] ones = paddle.ones(input_shape, dtype="int64") seq_length = paddle.cumsum(ones, axis=1) @@ -823,7 +823,7 @@ def forward( if end_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = paddle.shape(start_logits)[1] + ignored_index = start_logits.shape[1] start_positions = start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) diff --git a/paddlenlp/transformers/opt/modeling.py b/paddlenlp/transformers/opt/modeling.py index c9217f316415..41cc45482004 100644 --- a/paddlenlp/transformers/opt/modeling.py +++ b/paddlenlp/transformers/opt/modeling.py @@ -889,15 +889,15 @@ def forward( if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot 
specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: - input_shape = paddle.shape(input_ids) + input_shape = input_ids.shape input_ids = input_ids.reshape((-1, input_shape[-1])) elif inputs_embeds is not None: - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") self.checkpoints = [] - past_key_values_length = paddle.shape(cache[0].k)[2] if cache is not None else 0 + past_key_values_length = cache[0].k.shape[2] if cache is not None else 0 seq_length_with_past = input_shape[-1] + past_key_values_length diff --git a/paddlenlp/transformers/pegasus/modeling.py b/paddlenlp/transformers/pegasus/modeling.py index 630b75549272..406f703e0c9b 100644 --- a/paddlenlp/transformers/pegasus/modeling.py +++ b/paddlenlp/transformers/pegasus/modeling.py @@ -183,7 +183,7 @@ def forward(self, input_ids: Optional[Tensor] = None, attention_mask: Optional[T if input_ids is None: raise ValueError("Input_ids cannot be None.") inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale - inputs_embed_pos = self.encoder_embed_positions(paddle.shape(input_ids)) + inputs_embed_pos = self.encoder_embed_positions(input_ids.shape) hidden_states = inputs_embeds + inputs_embed_pos encoder_input = self.encoder_dropout(hidden_states) @@ -274,7 +274,7 @@ def forward( """ if decoder_attention_mask is None: - decoder_length = paddle.shape(decoder_input_ids)[-1] + decoder_length = decoder_input_ids.shape[-1] decoder_attention_mask = paddle.tensor.triu( (paddle.full((decoder_length, decoder_length), -np.inf, dtype=paddle.get_default_dtype())), 1 ) @@ -286,10 +286,8 @@ def forward( decoder_input_ids ) * self.embed_scale * mix_ratio + self.embed_scale * x * (1 - mix_ratio) - past_key_values_length = paddle.shape(cache[0][0].k)[2] if cache is not None else 0 - decoder_inputs_embed_pos = self.decoder_embed_positions( - paddle.shape(decoder_input_ids), past_key_values_length - ) + past_key_values_length = cache[0][0].k.shape[2] if cache is not None else 0 + decoder_inputs_embed_pos = self.decoder_embed_positions(decoder_input_ids.shape, past_key_values_length) hidden_states = decoder_inputs_embeds + decoder_inputs_embed_pos decoder_input = self.decoder_dropout(hidden_states) diff --git a/paddlenlp/transformers/reformer/modeling.py b/paddlenlp/transformers/reformer/modeling.py index 8d4f784cbac5..f94d220d5ee9 100644 --- a/paddlenlp/transformers/reformer/modeling.py +++ b/paddlenlp/transformers/reformer/modeling.py @@ -464,10 +464,10 @@ def forward( ): if input_ids is not None: - input_shape = paddle.shape(input_ids) + input_shape = input_ids.shape inputs_embeds = self.word_embeddings(input_ids) else: - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] if position_ids is None: ones = paddle.ones(input_shape, dtype="int64") diff --git a/paddlenlp/transformers/roberta/modeling.py b/paddlenlp/transformers/roberta/modeling.py index 23c7dce1da36..e6f42c582996 100644 --- a/paddlenlp/transformers/roberta/modeling.py +++ b/paddlenlp/transformers/roberta/modeling.py @@ -101,7 +101,7 @@ def forward( position_ids.stop_gradient = True if token_type_ids is None: - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] token_type_ids = paddle.zeros(input_shape, dtype="int64") position_embeddings = self.position_embeddings(position_ids) @@ -119,7 +119,7 @@ def create_position_ids_from_inputs_embeds(self, inputs_embeds): 
input_shape: paddle.Tensor Returns: paddle.Tensor """ - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] sequence_length = input_shape[1] position_ids = paddle.arange(self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype="int64") @@ -643,7 +643,7 @@ def forward( if start_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = paddle.shape(start_logits)[1] + ignored_index = start_logits.shape[1] start_positions = start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) @@ -1049,9 +1049,9 @@ def forward( """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None: - num_choices = paddle.shape(input_ids)[1] + num_choices = input_ids.shape[1] elif inputs_embeds is not None: - num_choices = paddle.shape(inputs_embeds)[1] + num_choices = inputs_embeds.shape[1] input_ids = input_ids.reshape((-1, input_ids.shape[-1])) if input_ids is not None else None inputs_embeds = ( diff --git a/paddlenlp/transformers/roformer/modeling.py b/paddlenlp/transformers/roformer/modeling.py index d594171f9537..95c6cbbd493b 100644 --- a/paddlenlp/transformers/roformer/modeling.py +++ b/paddlenlp/transformers/roformer/modeling.py @@ -75,7 +75,7 @@ def forward(self, input_ids=None, token_type_ids=None, inputs_embeds=None): inputs_embeds = self.word_embeddings(input_ids) if token_type_ids is None: - token_type_ids_shape = paddle.shape(inputs_embeds)[:-1] + token_type_ids_shape = inputs_embeds.shape[:-1] token_type_ids = paddle.zeros(token_type_ids_shape, dtype="int64") token_type_embeddings = self.token_type_embeddings(token_type_ids) @@ -97,7 +97,7 @@ def __init__(self, dim, max_position_embeddings=512): def forward(self, x, offset=0): # x shape [batch_size, num_heads, seqlen, head_dim] - seqlen = paddle.shape(x)[-2] + seqlen = x.shape[-2] sin, cos = ( self.sin[offset : offset + seqlen, :], self.cos[offset : offset + seqlen, :], @@ -683,7 +683,7 @@ def forward( if start_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = paddle.shape(start_logits)[1] + ignored_index = start_logits.shape[1] start_positions = start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) diff --git a/paddlenlp/transformers/roformerv2/modeling.py b/paddlenlp/transformers/roformerv2/modeling.py index 105a013e46ca..e727109ab3a1 100644 --- a/paddlenlp/transformers/roformerv2/modeling.py +++ b/paddlenlp/transformers/roformerv2/modeling.py @@ -50,7 +50,7 @@ def initializer(tensor, num_hidden_layers=12, order=2, gain=1.0): """ https://github.com/bojone/bert4keras/blob/5572ed481a14f5a62be7107e3846c88a5d6b617d/bert4keras/models.py#L1226-L1235 """ - shape = paddle.shape(tensor) + shape = tensor.shape if shape[0] > 10000 or shape[0] < 10: hidden_size = shape[1] else: @@ -82,7 +82,7 @@ def __init__(self, dim, max_position_embeddings=512): def forward(self, x, offset=0): # x shape [batch_size, num_heads, seqlen, head_dim] - seqlen = paddle.shape(x)[-2] + seqlen = x.shape[-2] sin, cos = ( self.sin[offset : offset + seqlen, :], self.cos[offset : offset + seqlen, :], @@ -706,13 +706,13 @@ def forward(self, input_ids, token_type_ids=None, attention_mask=None): """ # input_ids: [bs, num_choice, seq_l] - input_ids = input_ids.reshape(shape=(-1, 
paddle.shape(input_ids)[-1])) # flat_input_ids: [bs*num_choice,seq_l] + input_ids = input_ids.reshape(shape=(-1, input_ids.shape[-1])) # flat_input_ids: [bs*num_choice,seq_l] if token_type_ids is not None: - token_type_ids = token_type_ids.reshape(shape=(-1, paddle.shape(token_type_ids)[-1])) + token_type_ids = token_type_ids.reshape(shape=(-1, token_type_ids.shape[-1])) if attention_mask is not None: - attention_mask = attention_mask.reshape(shape=(-1, paddle.shape(attention_mask)[-1])) + attention_mask = attention_mask.reshape(shape=(-1, attention_mask.shape[-1])) sequence_output = self.roformerv2(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask) diff --git a/paddlenlp/transformers/skep/modeling.py b/paddlenlp/transformers/skep/modeling.py index 2a5c2b544760..ce4da4bff873 100644 --- a/paddlenlp/transformers/skep/modeling.py +++ b/paddlenlp/transformers/skep/modeling.py @@ -77,7 +77,7 @@ def forward( inputs_embeds = self.word_embeddings(input_ids) if position_ids is None: - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] # maybe need use shape op to unify static graph and dynamic graph ones = paddle.ones(input_shape, dtype="int64") seq_length = paddle.cumsum(ones, axis=1) @@ -92,7 +92,7 @@ def forward( embeddings = inputs_embeds + position_embeddings if self.type_vocab_size != 0: if token_type_ids is None: - token_type_ids_shape = paddle.shape(inputs_embeds)[:-1] + token_type_ids_shape = inputs_embeds.shape[:-1] token_type_ids = paddle.zeros(token_type_ids_shape, dtype="int64") token_type_embeddings = self.token_type_embeddings(token_type_ids) embeddings += token_type_embeddings @@ -330,7 +330,7 @@ def forward( axis=[1, 2], ) if past_key_values is not None: - batch_size = paddle.shape(past_key_values[0][0])[0] + batch_size = past_key_values[0][0].shape[0] past_mask = paddle.zeros([batch_size, 1, 1, past_key_values_length], dtype=attention_mask.dtype) attention_mask = paddle.concat([past_mask, attention_mask], axis=-1) @@ -737,7 +737,7 @@ def forward( if attention_mask is not None: seq_lens = paddle.sum(attention_mask, axis=1, dtype="int64") else: - input_ids_shape = paddle.shape(input_ids) + input_ids_shape = input_ids.shape seq_lens = paddle.ones(shape=[input_ids_shape[0]], dtype="int64") * input_ids_shape[1] loss, prediction = None, None diff --git a/paddlenlp/transformers/t5/modeling.py b/paddlenlp/transformers/t5/modeling.py index bc27b4c91d57..596276a522f4 100644 --- a/paddlenlp/transformers/t5/modeling.py +++ b/paddlenlp/transformers/t5/modeling.py @@ -337,15 +337,15 @@ def forward( # Input is (batch_size, seq_length, dim) # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) # cache[0] is (batch_size, n_heads, q_len - 1, dim_per_head) - batch_size, seq_length = paddle.shape(hidden_states)[:2] + batch_size, seq_length = hidden_states.shape[:2] real_seq_length = seq_length if cache is not None: assert len(cache) == 2, f"cache should have 2 past states: keys and values. 
Got { len(cache)} past states" - real_seq_length += paddle.shape(cache[0])[2] if query_length is None else query_length + real_seq_length += cache[0].shape[2] if query_length is None else query_length - key_length = real_seq_length if key_value_states is None else paddle.shape(key_value_states)[1] + key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] def shape(states): """projection""" @@ -412,7 +412,7 @@ def project(hidden_states, proj_layer, key_value_states, cache): # if key and values are already calculated # we want only the last query position bias if cache is not None: - position_bias = position_bias[:, :, -paddle.shape(hidden_states)[1] :, :] + position_bias = position_bias[:, :, -hidden_states.shape[1] :, :] if mask is not None: position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) @@ -566,7 +566,7 @@ def forward( # the actual query length is unknown for cross attention # if using past key value states. Need to inject it here if present_key_value_state is not None: - query_length = paddle.shape(present_key_value_state[0])[2] + query_length = present_key_value_state[0].shape[2] else: query_length = None @@ -990,10 +990,10 @@ def forward( f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" ) elif input_ids is not None: - input_shape = paddle.shape(input_ids) + input_shape = input_ids.shape # input_ids = input_ids.reshape(shape=[-1, input_shape[-1]]) elif inputs_embeds is not None: - input_shape = paddle.shape(inputs_embeds)[:-1] + input_shape = inputs_embeds.shape[:-1] else: err_msg_prefix = "decoder_" if self.is_decoder else "" raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds") @@ -1005,7 +1005,7 @@ def forward( batch_size, seq_length = input_shape # required mask seq length can be calculated via length of past - mask_seq_length = paddle.shape(cache[0][0])[2] + seq_length if cache is not None else seq_length + mask_seq_length = cache[0][0].shape[2] + seq_length if cache is not None else seq_length if use_cache is True: assert self.is_decoder, f"`use_cache` can only be set to `True` if {self.__class__} is used as a decoder" @@ -1013,7 +1013,7 @@ def forward( if attention_mask is None: attention_mask = paddle.ones(shape=[batch_size, mask_seq_length]) if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None: - encoder_seq_length = paddle.shape(encoder_hidden_states)[1] + encoder_seq_length = encoder_hidden_states.shape[1] encoder_attention_mask = paddle.ones([batch_size, encoder_seq_length], dtype=paddle.int64) # initialize caches with `None` if past does not exist @@ -1027,7 +1027,7 @@ def forward( # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] if self.is_decoder and encoder_hidden_states is not None: - encoder_batch_size, encoder_sequence_length, _ = paddle.shape(encoder_hidden_states) + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.shape encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) if encoder_attention_mask is None: encoder_attention_mask = paddle.ones(shape=encoder_hidden_shape) diff --git a/paddlenlp/transformers/tinybert/modeling.py b/paddlenlp/transformers/tinybert/modeling.py index 25061d998513..b853740790af 100644 --- a/paddlenlp/transformers/tinybert/modeling.py +++ b/paddlenlp/transformers/tinybert/modeling.py @@ 
-617,7 +617,7 @@ def forward( if start_positions.ndim > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = paddle.shape(start_logits)[1] + ignored_index = start_logits.shape[1] start_positions = start_positions.clip(0, ignored_index) end_positions = end_positions.clip(0, ignored_index) diff --git a/paddlenlp/transformers/transformer/modeling.py b/paddlenlp/transformers/transformer/modeling.py index 1adb4508b386..8c61459d9659 100644 --- a/paddlenlp/transformers/transformer/modeling.py +++ b/paddlenlp/transformers/transformer/modeling.py @@ -298,7 +298,7 @@ def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=None, reduce=T if target.dim() == lprobs.dim() - 1: target = target.unsqueeze(-1) - num_tokens = paddle.shape(lprobs)[0] + num_tokens = lprobs.shape[0] index = paddle.arange(0, num_tokens, dtype="int64").unsqueeze(-1) index = paddle.concat([index, target], axis=-1) index.stop_gradient = True @@ -498,7 +498,7 @@ def _merge_batch_beams_with_var_dim(self, c): return c def _split_batch_beams_with_var_dim(self, c): - var_dim_size = paddle.shape(c)[self.var_dim_in_state] + var_dim_size = c.shape[self.var_dim_in_state] c = paddle.reshape( c, [-1, self.beam_size] @@ -586,14 +586,14 @@ def step(self, time, inputs, states, **kwargs): if kwargs.get("trg_word", None) is not None: if paddle.in_dynamic_mode(): - if paddle.shape(kwargs.get("trg_word"))[1] > time: + if kwargs.get("trg_word").shape[1] > time: beam_search_output, beam_search_state = self.force_decoding( beam_search_output, beam_search_state, kwargs.get("trg_word"), kwargs.get("trg_length"), time ) else: def condition(trg_word, time): - return paddle.shape(trg_word)[1] > time + return trg_word.shape[1] > time def default_fn(beam_search_output, beam_search_state): return beam_search_output, beam_search_state @@ -624,8 +624,8 @@ def default_fn(beam_search_output, beam_search_state): return (beam_search_output, beam_search_state, next_inputs, finished) def force_decoding(self, beam_search_output, beam_search_state, trg_word, trg_length, time): - batch_size = paddle.shape(beam_search_output.predicted_ids)[0] - beam_size = paddle.shape(beam_search_output.predicted_ids)[1] + batch_size = beam_search_output.predicted_ids.shape[0] + beam_size = beam_search_output.predicted_ids.shape[1] ids_dtype = beam_search_output.predicted_ids.dtype scores_dtype = beam_search_output.scores.dtype @@ -842,8 +842,8 @@ def forward(self, src_word, trg_word): src_word=paddle.randint(low=3, high=30000, shape=[batch_size, seq_len]), trg_word=paddle.randint(low=3, high=30000, shape=[batch_size, seq_len])) """ - src_max_len = paddle.shape(src_word)[-1] - trg_max_len = paddle.shape(trg_word)[-1] + src_max_len = src_word.shape[-1] + trg_max_len = trg_word.shape[-1] src_slf_attn_bias = ( paddle.cast(src_word == self.pad_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e4 ) @@ -1050,7 +1050,7 @@ def forward(self, src_word, trg_word=None): trg_length = None if self.beam_search_version == "v1": - src_max_len = paddle.shape(src_word)[-1] + src_max_len = src_word.shape[-1] src_slf_attn_bias = ( paddle.cast(src_word == self.pad_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e4 ) @@ -1124,7 +1124,7 @@ def merge_beam_dim(tensor): return paddle.reshape(tensor, [shape[0] * shape[1]] + list(shape[2:])) # run encoder - src_max_len = paddle.shape(src_word)[-1] + src_max_len = src_word.shape[-1] src_slf_attn_bias = ( paddle.cast(src_word == 
self.pad_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e4 ) @@ -1252,7 +1252,7 @@ def grow_topk(i, logits, alive_seq, alive_log_probs, states): topk_seq = gather_2d(alive_seq, topk_coordinates, beam_size, batch_size) topk_seq = paddle.concat([topk_seq, paddle.reshape(topk_ids, list(topk_ids.shape[:]) + [1])], axis=2) states = update_states(states, topk_coordinates, beam_size, batch_size) - eos = paddle.full(shape=paddle.shape(topk_ids), dtype=alive_seq.dtype, fill_value=self.eos_id) + eos = paddle.full(shape=topk_ids.shape, dtype=alive_seq.dtype, fill_value=self.eos_id) topk_finished = paddle.cast(paddle.equal(topk_ids, eos), "float32") # topk_seq: [batch_size, 2*beam_size, i+1] @@ -1320,7 +1320,7 @@ def force_decoding_v2(topk_ids, topk_scores, time): return topk_ids, topk_scores def inner_loop(i, pre_word, alive_seq, alive_log_probs, finished_seq, finished_scores, finished_flags, caches): - trg_pos = paddle.full(shape=paddle.shape(pre_word), dtype=alive_seq.dtype, fill_value=i) + trg_pos = paddle.full(shape=pre_word.shape, dtype=alive_seq.dtype, fill_value=i) trg_emb = self.trg_word_embedding(pre_word) trg_pos_emb = self.trg_pos_embedding(trg_pos) trg_emb = trg_emb + trg_pos_emb diff --git a/paddlenlp/transformers/unified_transformer/modeling.py b/paddlenlp/transformers/unified_transformer/modeling.py index 505baa0c86e4..fb85fc9c86e9 100644 --- a/paddlenlp/transformers/unified_transformer/modeling.py +++ b/paddlenlp/transformers/unified_transformer/modeling.py @@ -97,9 +97,9 @@ def forward( if input_ids is None and input_embeddings is None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: - inputs_shape = paddle.shape(input_ids) + inputs_shape = input_ids.shape elif input_embeddings is not None: - inputs_shape = paddle.shape(input_embeddings)[:-1] + inputs_shape = input_embeddings.shape[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if input_embeddings is None: @@ -524,17 +524,14 @@ def prepare_inputs_for_generation( if position_ids is None: if self.pad_token_id is None: - position_ids = paddle.expand_as( - paddle.arange(end=paddle.shape(input_ids)[1], dtype="int64"), input_ids - ) + position_ids = paddle.expand_as(paddle.arange(end=input_ids.shape[1], dtype="int64"), input_ids) else: # NOTE: If there is a unk_token_id in input_ids, the following logic is wrong. # In that case, the position_ids must be provided. # And this is for left padding input_ids. 
num_pad = paddle.sum((input_ids == self.pad_token_id).astype("float32"), axis=-1, keepdim=True) position_ids = F.relu( - paddle.expand_as(paddle.arange(end=paddle.shape(input_ids)[1], dtype="float32"), input_ids) - - num_pad + paddle.expand_as(paddle.arange(end=input_ids.shape[1], dtype="float32"), input_ids) - num_pad ).astype("int64") position_ids.stop_gradient = True diff --git a/paddlenlp/transformers/unimo/modeling.py b/paddlenlp/transformers/unimo/modeling.py index fd633e5e4a67..fc5b0389d0db 100644 --- a/paddlenlp/transformers/unimo/modeling.py +++ b/paddlenlp/transformers/unimo/modeling.py @@ -89,9 +89,9 @@ def forward( if input_ids is None and input_embeddings is None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: - inputs_shape = paddle.shape(input_ids) + inputs_shape = input_ids.shape elif input_embeddings is not None: - inputs_shape = paddle.shape(input_embeddings)[:-1] + inputs_shape = input_embeddings.shape[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if input_embeddings is None: @@ -505,14 +505,11 @@ def prepare_inputs_for_generation( if position_ids is None: if self.pad_token_id is None: - position_ids = paddle.expand_as( - paddle.arange(end=paddle.shape(input_ids)[1], dtype="int64"), input_ids - ) + position_ids = paddle.expand_as(paddle.arange(end=input_ids.shape[1], dtype="int64"), input_ids) else: num_pad = paddle.sum((input_ids == self.pad_token_id).astype("float32"), axis=-1, keepdim=True) position_ids = F.relu( - paddle.expand_as(paddle.arange(end=paddle.shape(input_ids)[1], dtype="float32"), input_ids) - - num_pad + paddle.expand_as(paddle.arange(end=input_ids.shape[1], dtype="float32"), input_ids) - num_pad ).astype("int64") position_ids.stop_gradient = True diff --git a/paddlenlp/transformers/xlm/modeling.py b/paddlenlp/transformers/xlm/modeling.py index 8e28666891c0..4f8fb0b8585f 100644 --- a/paddlenlp/transformers/xlm/modeling.py +++ b/paddlenlp/transformers/xlm/modeling.py @@ -49,7 +49,7 @@ def __init__(self, num_embeddings, embedding_dim): @staticmethod def _init_weight(out): - n_pos, dim = paddle.shape(out) + n_pos, dim = out.shape out.stop_gradient = True position_ids = paddle.arange(0, n_pos, dtype=out.dtype).unsqueeze(1) indices = paddle.arange(0, dim // 2, dtype=out.dtype).unsqueeze(0) @@ -75,7 +75,7 @@ def get_masks(seqlen, lengths, causal, padding_mask=None): mask = alen < lengths[:, None] # attention mask is the same as mask, or triangular inferior attention (causal) - bs = paddle.shape(lengths)[0] + bs = lengths.shape[0] if causal: attn_mask = paddle.tile(alen[None, None, :], (bs, seqlen, 1)) <= alen[None, :, None] else: @@ -115,11 +115,11 @@ def forward(self, input, mask, kv=None, cache=None, output_attentions=False): """ # Input is (bs, qlen, dim) # Mask is (bs, klen) (non-causal) or (bs, klen, klen) - bs, qlen, dim = paddle.shape(input) + bs, qlen, dim = input.shape if kv is None: klen = qlen if cache is None else cache["seqlen"] + qlen else: - klen = paddle.shape(kv)[1] + klen = kv.shape[1] mask_reshape = (bs, 1, qlen, klen) if mask.ndim == 3 else (bs, 1, 1, klen) @@ -384,7 +384,7 @@ def forward( last_hidden_state = model(**inputs)[0] """ - bs, seqlen = paddle.shape(input_ids) + bs, seqlen = input_ids.shape if lengths is None: if input_ids is not None: @@ -448,7 +448,7 @@ def forward( # update cache length if cache is not None: - cache["seqlen"] += paddle.shape(tensor)[1] + cache["seqlen"] += tensor.shape[1] return tuple(v for v in 
[tensor, hidden_states, attentions] if v is not None)
@@ -864,18 +864,16 @@ def forward(self, input_ids=None, langs=None, attention_mask=None, position_ids=
         """
         num_choices = input_ids.shape[1]  # input_ids: [bs, num_choice, seqlen]

-        input_ids = input_ids.reshape(
-            shape=(-1, paddle.shape(input_ids)[-1])
-        )  # flat_input_ids: [bs*num_choice, seqlen]
+        input_ids = input_ids.reshape(shape=(-1, input_ids.shape[-1]))  # flat_input_ids: [bs*num_choice, seqlen]

         if langs is not None:
-            langs = langs.reshape(shape=(-1, paddle.shape(langs)[-1]))
+            langs = langs.reshape(shape=(-1, langs.shape[-1]))

         if attention_mask is not None:
-            attention_mask = attention_mask.reshape(shape=(-1, paddle.shape(attention_mask)[-1]))
+            attention_mask = attention_mask.reshape(shape=(-1, attention_mask.shape[-1]))

         if position_ids is not None:
-            position_ids = position_ids.reshape(shape=(-1, paddle.shape(position_ids)[-1]))
+            position_ids = position_ids.reshape(shape=(-1, position_ids.shape[-1]))

         if lengths is not None:
             lengths = lengths.reshape(shape=(-1,))
diff --git a/paddlenlp/transformers/xlnet/modeling.py b/paddlenlp/transformers/xlnet/modeling.py
index ff1cb2eefda8..608f300db3fc 100644
--- a/paddlenlp/transformers/xlnet/modeling.py
+++ b/paddlenlp/transformers/xlnet/modeling.py
@@ -74,7 +74,7 @@ def prune_heads(self, heads):

     @staticmethod
     def rel_shift_bnij(x, klen=-1):
         # Relative shift of the attention matrix from bd~ to bd (refer to Appendix B in the Transformer-XL paper)
-        x_size = paddle.shape(x)
+        x_size = x.shape
         x = paddle.reshape(x, [x_size[0], x_size[1], x_size[3], x_size[2]])
         x = x[:, :, 1:, :]
@@ -104,7 +104,7 @@ def rel_attn_core(
         # q_head = Exi * Wq; self.r_r_bias = v; k_head_r = Wkr * Rij
         # b = Exi * Wq * Wkr * Rij; d = v * Wkr * Rij; bd = b + d
         bd = paddle.einsum("ibnd,jbnd->bnij", q_head + self.r_r_bias, k_head_r)
-        bd = self.rel_shift_bnij(bd, klen=paddle.shape(ac)[3])
+        bd = self.rel_shift_bnij(bd, klen=ac.shape[3])

         # Segment based attention score
         if seg_mat is None:
@@ -139,7 +139,7 @@ def post_attention(self, h, attn_vec, residual=True):
         """Post-attention processing."""
         # Post-attention projection (back to 'd_model')
         # Compute einsum4x4("ibnd,hnd->ibh", attn_vec, self.o)
-        shape = paddle.shape(attn_vec)
+        shape = attn_vec.shape
         attn_vec = attn_vec.reshape([shape[0], shape[1], attn_vec.shape[2] * attn_vec.shape[3]])
         attn_out = paddle.einsum("ibm,hm->ibh", attn_vec, self.o)
@@ -174,31 +174,23 @@ def forward(

             # Content-based key head
             # Compute k_head_h = einsum4x4("ibh,h(n*d)->ibnd", cat, self.k)
             k_head_h = paddle.matmul(cat, self.k)
-            k_head_h = paddle.reshape(
-                k_head_h, shape=[paddle.shape(cat)[0], paddle.shape(cat)[1], self.n_head, self.d_head]
-            )
+            k_head_h = paddle.reshape(k_head_h, shape=[cat.shape[0], cat.shape[1], self.n_head, self.d_head])

             # Content-based value head
             # Compute v_head_h = einsum4x4("ibh,h(n*d)->ibnd", cat, self.v)
             v_head_h = paddle.matmul(cat, self.v)
-            v_head_h = paddle.reshape(
-                v_head_h, shape=[paddle.shape(cat)[0], paddle.shape(cat)[1], self.n_head, self.d_head]
-            )
+            v_head_h = paddle.reshape(v_head_h, shape=[cat.shape[0], cat.shape[1], self.n_head, self.d_head])

             # Position-based key head
             # Compute k_head_r = einsum4x4("ibh,h(n*d)->ibnd", r, self.r)
             k_head_r = paddle.matmul(r, self.r)
-            k_head_r = paddle.reshape(
-                k_head_r, shape=[paddle.shape(r)[0], paddle.shape(r)[1], self.n_head, self.d_head]
-            )
+            k_head_r = paddle.reshape(k_head_r, shape=[r.shape[0], r.shape[1], self.n_head, self.d_head])

             # H-stream
             # Content-stream query head
             # Compute q_head_h = einsum4x4("ibh,h(n*d)->ibnd", h, self.q)
             q_head_h = paddle.matmul(h, self.q)  # shape
-            q_head_h = paddle.reshape(
-                q_head_h, shape=[paddle.shape(h)[0], paddle.shape(h)[1], self.n_head, self.d_head]
-            )
+            q_head_h = paddle.reshape(q_head_h, shape=[h.shape[0], h.shape[1], self.n_head, self.d_head])

             # Core attention ops
             attn_vec_h = self.rel_attn_core(
@@ -276,26 +268,20 @@ def forward(

             # Content heads
             # Compute q_head_h = einsum4x4("ibh,hnd->ibnd", h, self.q)
             q_head_h = paddle.matmul(h, self.q)
-            q_head_h = paddle.reshape(
-                q_head_h, shape=[paddle.shape(h)[0], paddle.shape(h)[1], self.n_head, self.d_head]
-            )
+            q_head_h = paddle.reshape(q_head_h, shape=[h.shape[0], h.shape[1], self.n_head, self.d_head])

             # Compute k_head_h = einsum4x4("ibh,hnd->ibnd", cat, self.k)
             k_head_h = paddle.matmul(cat, self.k)
-            k_head_h = paddle.reshape(
-                k_head_h, shape=[paddle.shape(h)[0], paddle.shape(h)[1], self.n_head, self.d_head]
-            )
+            k_head_h = paddle.reshape(k_head_h, shape=[h.shape[0], h.shape[1], self.n_head, self.d_head])

             # Compute v_head_h = einsum4x4("ibh,hnd->ibnd", cat, self.v)
             v_head_h = paddle.matmul(cat, self.v)
-            v_head_h = paddle.reshape(
-                v_head_h, shape=[paddle.shape(h)[0], paddle.shape(h)[1], self.n_head, self.d_head]
-            )
+            v_head_h = paddle.reshape(v_head_h, shape=[h.shape[0], h.shape[1], self.n_head, self.d_head])

             # Position-based key head
             # Compute k_head_r = einsum4x4("ibh,hnd->ibnd", r, self.r)
             k_head_r = paddle.matmul(r, self.r)
-            k_head_r = paddle.reshape(k_head_r, shape=[paddle.shape(k_head_r)[0], -1, self.n_head, self.d_head])
+            k_head_r = paddle.reshape(k_head_r, shape=[k_head_r.shape[0], -1, self.n_head, self.d_head])

             # Core attention ops
             attn_vec = self.rel_attn_core(
@@ -1003,10 +989,10 @@ def forward(
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
         elif input_ids is not None:
             input_ids = paddle.transpose(input_ids, perm=[1, 0])
-            qlen, bsz = paddle.shape(input_ids)[0], paddle.shape(input_ids)[1]
+            qlen, bsz = input_ids.shape[0], input_ids.shape[1]
         elif inputs_embeds is not None:
             inputs_embeds = paddle.transpose(inputs_embeds, perm=[1, 0])
-            qlen, bsz = paddle.shape(inputs_embeds)[0], paddle.shape(inputs_embeds)[1]
+            qlen, bsz = inputs_embeds.shape[0], inputs_embeds.shape[1]
         else:
             raise ValueError("You have to specify either input_ids or inputs_embeds")
@@ -1016,7 +1002,7 @@ def forward(
         perm_mask = perm_mask.transpose([1, 2, 0]) if perm_mask is not None else None
         target_mapping = target_mapping.transpose([1, 2, 0]) if target_mapping is not None else None

-        mlen = paddle.shape(mems[0])[0] if mems is not None and mems[0] is not None else 0
+        mlen = mems[0].shape[0] if mems is not None and mems[0] is not None else 0
         klen = mlen + qlen

         # Attention mask
@@ -1046,7 +1032,7 @@ def forward(
         if data_mask is not None:
             # All mems can be attended to
             if mlen > 0:
-                mems_mask = paddle.cast(paddle.zeros([paddle.shape(data_mask)[0], mlen, bsz]), dtype=dtype_float)
+                mems_mask = paddle.cast(paddle.zeros([data_mask.shape[0], mlen, bsz]), dtype=dtype_float)
                 data_mask = paddle.concat([mems_mask, data_mask], axis=1)
             if attn_mask is None:
                 attn_mask = paddle.unsqueeze(data_mask, axis=-1)
@@ -1077,7 +1063,7 @@ def forward(
         output_h = self.dropout(word_emb_k)

         if target_mapping is not None:
-            word_emb_q = self.mask_emb.expand([paddle.shape(target_mapping)[0], bsz, -1])
+            word_emb_q = self.mask_emb.expand([target_mapping.shape[0], bsz, -1])
             output_g = self.dropout(word_emb_q)
         else:
             output_g = None
@@ -1743,19 +1729,17 @@ def forward(
             print(reshaped_logits.shape)  # [2, 2]
         """
-        num_choices = paddle.shape(input_ids)[1] if input_ids is not None else paddle.shape(inputs_embeds)[1]
-        input_ids = input_ids.reshape(shape=(-1, paddle.shape(input_ids)[-1]))  # flat_input_ids: [bs*num_choice,seq_l]
+        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
+        input_ids = input_ids.reshape(shape=(-1, input_ids.shape[-1]))  # flat_input_ids: [bs*num_choice,seq_l]

         if attention_mask is not None:
-            attention_mask = attention_mask.reshape(shape=(-1, paddle.shape(attention_mask)[-1]))
+            attention_mask = attention_mask.reshape(shape=(-1, attention_mask.shape[-1]))

         if token_type_ids is not None:
-            token_type_ids = token_type_ids.reshape(shape=(-1, paddle.shape(token_type_ids)[-1]))
+            token_type_ids = token_type_ids.reshape(shape=(-1, token_type_ids.shape[-1]))

         if inputs_embeds is not None:
-            inputs_embeds = inputs_embeds.reshape(
-                shape=(paddle.shape(inputs_embeds)[0], -1, paddle.shape(inputs_embeds)[-1])
-            )
+            inputs_embeds = inputs_embeds.reshape(shape=(inputs_embeds.shape[0], -1, inputs_embeds.shape[-1]))

         transformer_outputs = self.transformer(
             input_ids,
@@ -1920,7 +1904,7 @@ def forward(
             if start_positions.ndim > 1:
                 end_positions = end_positions.squeeze(-1)
             # sometimes the start/end positions are outside our model inputs, we ignore these terms
-            ignored_index = paddle.shape(start_logits)[1]
+            ignored_index = start_logits.shape[1]
             start_positions = start_positions.clip(0, ignored_index)
             end_positions = end_positions.clip(0, ignored_index)
diff --git a/tests/test_tipc/benchmark/modules/bert_for_question_answering.py b/tests/test_tipc/benchmark/modules/bert_for_question_answering.py
index d99a81f8898d..1005a0dccea2 100644
--- a/tests/test_tipc/benchmark/modules/bert_for_question_answering.py
+++ b/tests/test_tipc/benchmark/modules/bert_for_question_answering.py
@@ -75,7 +75,7 @@ def forward(self, model, args, input_data=None, **kwargs):
         if start_positions.ndim > 1:
             end_positions = end_positions.squeeze(-1)
         # sometimes the start/end positions are outside our model inputs, we ignore these terms
-        ignored_index = paddle.shape(start_logits)[1]
+        ignored_index = start_logits.shape[1]
         start_positions = start_positions.clip(0, ignored_index)
         end_positions = end_positions.clip(0, ignored_index)
diff --git a/tests/test_tipc/transformer/modeling.py b/tests/test_tipc/transformer/modeling.py
index 7ecb8849f8fc..c263b5a8b7f7 100644
--- a/tests/test_tipc/transformer/modeling.py
+++ b/tests/test_tipc/transformer/modeling.py
@@ -446,7 +446,7 @@ def _merge_batch_beams_with_var_dim(self, c):
         return c

     def _split_batch_beams_with_var_dim(self, c):
-        var_dim_size = paddle.shape(c)[self.var_dim_in_state]
+        var_dim_size = c.shape[self.var_dim_in_state]
         c = paddle.reshape(
             c,
             [-1, self.beam_size]
@@ -509,14 +509,14 @@ def step(self, time, inputs, states, **kwargs):

         if kwargs.get("trg_word", None) is not None:
             if paddle.in_dynamic_mode():
-                if paddle.shape(kwargs.get("trg_word"))[1] > time:
+                if kwargs.get("trg_word").shape[1] > time:
                     beam_search_output, beam_search_state = self.force_decoding(
                         beam_search_output, beam_search_state, kwargs.get("trg_word"), kwargs.get("trg_length"), time
                     )
             else:

                 def condition(trg_word, time):
-                    return paddle.shape(trg_word)[1] > time
+                    return trg_word.shape[1] > time

                 def default_fn(beam_search_output, beam_search_state):
                     return beam_search_output, beam_search_state
@@ -547,8 +547,8 @@ def default_fn(beam_search_output, beam_search_state):
         return (beam_search_output, beam_search_state, next_inputs, finished)

     def force_decoding(self, beam_search_output, beam_search_state, trg_word, trg_length, time):
-        batch_size = paddle.shape(beam_search_output.predicted_ids)[0]
-        beam_size = paddle.shape(beam_search_output.predicted_ids)[1]
+        batch_size = beam_search_output.predicted_ids.shape[0]
+        beam_size = beam_search_output.predicted_ids.shape[1]

         ids_dtype = beam_search_output.predicted_ids.dtype
         scores_dtype = beam_search_output.scores.dtype
@@ -735,8 +735,8 @@ def forward(self, src_word, trg_word):
                 src_word=paddle.randint(low=3, high=30000, shape=[batch_size, seq_len]),
                 trg_word=paddle.randint(low=3, high=30000, shape=[batch_size, seq_len]))
         """
-        src_max_len = paddle.shape(src_word)[-1]
-        trg_max_len = paddle.shape(trg_word)[-1]
+        src_max_len = src_word.shape[-1]
+        trg_max_len = trg_word.shape[-1]
         src_slf_attn_bias = (
             paddle.cast(src_word == self.bos_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e9
         )
diff --git a/tests/transformer/modeling.py b/tests/transformer/modeling.py
index 7ecb8849f8fc..c263b5a8b7f7 100644
--- a/tests/transformer/modeling.py
+++ b/tests/transformer/modeling.py
@@ -446,7 +446,7 @@ def _merge_batch_beams_with_var_dim(self, c):
         return c

     def _split_batch_beams_with_var_dim(self, c):
-        var_dim_size = paddle.shape(c)[self.var_dim_in_state]
+        var_dim_size = c.shape[self.var_dim_in_state]
         c = paddle.reshape(
             c,
             [-1, self.beam_size]
@@ -509,14 +509,14 @@ def step(self, time, inputs, states, **kwargs):

         if kwargs.get("trg_word", None) is not None:
             if paddle.in_dynamic_mode():
-                if paddle.shape(kwargs.get("trg_word"))[1] > time:
+                if kwargs.get("trg_word").shape[1] > time:
                     beam_search_output, beam_search_state = self.force_decoding(
                         beam_search_output, beam_search_state, kwargs.get("trg_word"), kwargs.get("trg_length"), time
                     )
             else:

                 def condition(trg_word, time):
-                    return paddle.shape(trg_word)[1] > time
+                    return trg_word.shape[1] > time

                 def default_fn(beam_search_output, beam_search_state):
                     return beam_search_output, beam_search_state
@@ -547,8 +547,8 @@ def default_fn(beam_search_output, beam_search_state):
         return (beam_search_output, beam_search_state, next_inputs, finished)

     def force_decoding(self, beam_search_output, beam_search_state, trg_word, trg_length, time):
-        batch_size = paddle.shape(beam_search_output.predicted_ids)[0]
-        beam_size = paddle.shape(beam_search_output.predicted_ids)[1]
+        batch_size = beam_search_output.predicted_ids.shape[0]
+        beam_size = beam_search_output.predicted_ids.shape[1]

         ids_dtype = beam_search_output.predicted_ids.dtype
         scores_dtype = beam_search_output.scores.dtype
@@ -735,8 +735,8 @@ def forward(self, src_word, trg_word):
                 src_word=paddle.randint(low=3, high=30000, shape=[batch_size, seq_len]),
                 trg_word=paddle.randint(low=3, high=30000, shape=[batch_size, seq_len]))
         """
-        src_max_len = paddle.shape(src_word)[-1]
-        trg_max_len = paddle.shape(trg_word)[-1]
+        src_max_len = src_word.shape[-1]
+        trg_max_len = trg_word.shape[-1]
         src_slf_attn_bias = (
             paddle.cast(src_word == self.bos_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e9
         )

From 5ce51a3dc769be1ea409c51ce5d80e0cc7c6ccd2 Mon Sep 17 00:00:00 2001
From: Wang Huan
Date: Mon, 15 Apr 2024 08:00:37 +0000
Subject: [PATCH 2/3] refine

---
 paddlenlp/experimental/transformers/llama/modeling.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddlenlp/experimental/transformers/llama/modeling.py b/paddlenlp/experimental/transformers/llama/modeling.py
index a67731916a92..f22eecb15d19 100644
--- a/paddlenlp/experimental/transformers/llama/modeling.py
+++ b/paddlenlp/experimental/transformers/llama/modeling.py
@@ -451,7 +451,7 @@ def forward(
             seq_lens=seq_lens,
             rotary_embs=new_rope,
             rotary_emb_dims=1,
-            time_step=paddle.increment(attention_mask.shape[-1], -1) if is_decoder else None,
+            time_step=paddle.increment(paddle.shape(attention_mask)[-1], -1) if is_decoder else None,
         )

         hidden_states = self.norm(hidden_states)

From 4b044ef09a551edcd98563ee041297dcc4f255fc Mon Sep 17 00:00:00 2001
From: Wang Huan
Date: Mon, 15 Apr 2024 08:48:17 +0000
Subject: [PATCH 3/3] refine

---
 paddlenlp/experimental/transformers/bloom/modeling.py      | 2 +-
 paddlenlp/experimental/transformers/chatglm_v2/modeling.py | 2 +-
 paddlenlp/experimental/transformers/gpt/modeling.py        | 2 +-
 paddlenlp/experimental/transformers/opt/modeling.py        | 2 +-
 paddlenlp/experimental/transformers/qwen/modeling.py       | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/paddlenlp/experimental/transformers/bloom/modeling.py b/paddlenlp/experimental/transformers/bloom/modeling.py
index fbb983622fef..659826fe6f1b 100644
--- a/paddlenlp/experimental/transformers/bloom/modeling.py
+++ b/paddlenlp/experimental/transformers/bloom/modeling.py
@@ -279,7 +279,7 @@ def forward(
             pre_caches=pre_caches,
             pre_caches_length=position_offset,
             seq_lens=seq_len,
-            time_step=paddle.increment(attention_mask.shape[-1], -1) if is_decoder else None,
+            time_step=paddle.increment(paddle.shape(attention_mask)[-1], -1) if is_decoder else None,
         )

         # Add last hidden state
diff --git a/paddlenlp/experimental/transformers/chatglm_v2/modeling.py b/paddlenlp/experimental/transformers/chatglm_v2/modeling.py
index 712f03dde2fe..75dd08396398 100644
--- a/paddlenlp/experimental/transformers/chatglm_v2/modeling.py
+++ b/paddlenlp/experimental/transformers/chatglm_v2/modeling.py
@@ -285,7 +285,7 @@ def forward(
             seq_lens=seq_lens,
             rotary_embs=paddle.cast(rotary_pos_emb, "float32"),
             rotary_emb_dims=1,
-            time_step=paddle.increment(attention_mask.shape[-1], -1) if is_decoder else None,
+            time_step=paddle.increment(paddle.shape(attention_mask)[-1], -1) if is_decoder else None,
         )

         hidden_states = self.final_layernorm(hidden_states)
diff --git a/paddlenlp/experimental/transformers/gpt/modeling.py b/paddlenlp/experimental/transformers/gpt/modeling.py
index b987c7a70974..6627c9e42abb 100644
--- a/paddlenlp/experimental/transformers/gpt/modeling.py
+++ b/paddlenlp/experimental/transformers/gpt/modeling.py
@@ -265,7 +265,7 @@ def forward(
             attn_mask=paddle.cast(attention_mask, dtype=hidden_states.dtype),
             caches=cache_kvs,
             seq_lens=seq_lens,
-            time_step=paddle.increment(attention_mask.shape[-1], -1) if is_decoder else None,
+            time_step=paddle.increment(paddle.shape(attention_mask)[-1], -1) if is_decoder else None,
         )

         hidden_states = self.norm(hidden_states)
diff --git a/paddlenlp/experimental/transformers/opt/modeling.py b/paddlenlp/experimental/transformers/opt/modeling.py
index 2f8228a75947..afcb1331b52c 100644
--- a/paddlenlp/experimental/transformers/opt/modeling.py
+++ b/paddlenlp/experimental/transformers/opt/modeling.py
@@ -247,7 +247,7 @@ def forward(
             seq_lens=seq_lens,
             rotary_embs=None,
             rotary_emb_dims=0,
-            time_step=paddle.increment(attention_mask.shape[-1], -1) if is_decoder else None,
+            time_step=paddle.increment(paddle.shape(attention_mask)[-1], -1) if is_decoder else None,
         )

         output = hidden_states
diff --git a/paddlenlp/experimental/transformers/qwen/modeling.py b/paddlenlp/experimental/transformers/qwen/modeling.py
index 975bf22abf15..fc6bb92a627d 100644
--- a/paddlenlp/experimental/transformers/qwen/modeling.py
+++ b/paddlenlp/experimental/transformers/qwen/modeling.py
@@ -340,7 +340,7 @@ def forward(
             seq_lens=seq_lens,
             rotary_embs=new_rope,
             rotary_emb_dims=1,
-            time_step=paddle.increment(attention_mask.shape[-1], -1) if is_decoder else None,
+            time_step=paddle.increment(paddle.shape(attention_mask)[-1], -1) if is_decoder else None,
         )

         hidden_states = self.ln_f(hidden_states)
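
Note (illustration only, not part of the patch series): the minimal sketch below contrasts the two shape-access patterns the commits above switch between; the tensor names are invented for the example. In dynamic mode, `Tensor.shape` is a plain Python list, so `tensor.shape[i]` yields an `int` that works directly in `reshape`, `arange`, comparisons, and similar call sites, which is the direction PATCH 1/3 takes. `paddle.shape(tensor)` instead returns an int32 Tensor, which is still required wherever the value feeds a Tensor-only API such as `paddle.increment`; that is why the "refine" commits keep `paddle.shape(attention_mask)[-1]` for the `time_step` argument.

# Minimal sketch, assuming a dynamic-mode Paddle install; tensor names are invented.
import paddle

x = paddle.ones([2, 8, 64])
attention_mask = paddle.ones([2, 1, 1, 16])

# Tensor.shape is a plain Python list in dynamic mode, so indexing it gives an int
# that can be used directly in reshape/arange/comparisons (the PATCH 1/3 direction).
seq_len = x.shape[-1]            # 64, a Python int
flat = x.reshape([-1, seq_len])

# paddle.shape(tensor) returns an int32 Tensor, which Tensor-only APIs still need,
# e.g. paddle.increment backing the time_step argument (the PATCH 2/3 and 3/3 direction).
time_step = paddle.increment(paddle.shape(attention_mask)[-1], -1)  # Tensor holding 15
# Passing attention_mask.shape[-1] (a Python int) to paddle.increment would not work.

In general the Tensor-returning form also stays valid under static-graph capture, where list-based shapes may contain -1 for dynamic dimensions, so it remains the safer choice for values consumed by Tensor-only ops.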